From 79aa8316684b1e8fb577ca1adddcee42b5c0ab8a Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 8 Apr 2026 22:03:55 +0200 Subject: [PATCH 01/64] fix typos in CHANGELOG.md --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c60cc75..ea675fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,7 +47,7 @@ supported by the collections, i.e., classes for which storage- and validation-endpoints exist. -- Add `/maintenance`-endpoint to temporarilly lock collections for non-curator +- Add `/maintenance`-endpoint to temporarily lock collections for non-curator access. @@ -569,7 +569,7 @@ ## New features -- Factor out a Schema Type Layer (STL) from the `record_dir` backend." The STL +- Factor out a Schema Type Layer (STL) from the `record_dir` backend. The STL can be used with every backend. It removes top-level `schema_type`-entries from records before they are stored. It also adds the correct top-level `schema_type`-entry to records that are read from a store. 
This functionality -- 2.52.0 From 0cc51a33a498b065fad3c9733efaf643af0d9c93 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 15 Apr 2026 08:32:10 +0200 Subject: [PATCH 02/64] add dynamic collection creation --- README.md | 9 +- dump_things_service/__init__.py | 4 + dump_things_service/abstract_config.py | 171 +++++++ dump_things_service/admin.py | 21 + dump_things_service/audit/__init__.py | 9 +- dump_things_service/auth/config.py | 2 +- dump_things_service/collection.py | 477 ++++++++++++++++++ dump_things_service/collection_endpoints.py | 192 +++++++ dump_things_service/config.py | 51 +- dump_things_service/exceptions.py | 4 + dump_things_service/main.py | 11 +- dump_things_service/manifest.py | 118 +++++ dump_things_service/mapping_functions.py | 82 +++ .../tests/test_token_endpoints.py | 45 ++ dump_things_service/token.py | 1 + dump_things_service/token_endpoints.py | 193 +++++++ 16 files changed, 1370 insertions(+), 20 deletions(-) create mode 100644 dump_things_service/abstract_config.py create mode 100644 dump_things_service/admin.py create mode 100644 dump_things_service/collection.py create mode 100644 dump_things_service/collection_endpoints.py create mode 100644 dump_things_service/manifest.py create mode 100644 dump_things_service/mapping_functions.py create mode 100644 dump_things_service/tests/test_token_endpoints.py create mode 100644 dump_things_service/token_endpoints.py diff --git a/README.md b/README.md index f126c06..024aee7 100644 --- a/README.md +++ b/README.md @@ -702,15 +702,16 @@ collections: incoming: datamgt tokens: - anon_read: + anon_read: # The name of the token (serves also as representation if no representation is defined) user_id: anonymous collections: - datamgt: - mode: READ_CURATED - incoming_label: "" + datamgt: # per collection token configuration; contains: + mode: READ_CURATED # - token mode + incoming_label: "" # - the label for the incoming area for this token and this collection, i.e., collection: "datamgt". 
trusted-submitter-token: user_id: trusted_submitter + representation: 00112233445566778899aabbccdd # The representation that the client has to send in an `x-dumpthings-token`-header (if not given, the token name will be the representation) collections: datamgt: mode: WRITE_COLLECTION diff --git a/dump_things_service/__init__.py b/dump_things_service/__init__.py index 5fe7ad2..dbd6b51 100644 --- a/dump_things_service/__init__.py +++ b/dump_things_service/__init__.py @@ -6,11 +6,13 @@ from typing import ( from starlette.status import ( HTTP_200_OK, + HTTP_201_CREATED, HTTP_300_MULTIPLE_CHOICES, HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND, + HTTP_409_CONFLICT, HTTP_500_INTERNAL_SERVER_ERROR, HTTP_503_SERVICE_UNAVAILABLE, ) @@ -24,11 +26,13 @@ from starlette.status import ( __all__ = [ 'Format', 'HTTP_200_OK', + 'HTTP_201_CREATED', 'HTTP_300_MULTIPLE_CHOICES', 'HTTP_400_BAD_REQUEST', 'HTTP_401_UNAUTHORIZED', 'HTTP_403_FORBIDDEN', 'HTTP_404_NOT_FOUND', + 'HTTP_409_CONFLICT', 'HTTP_413_CONTENT_TOO_LARGE', 'HTTP_422_UNPROCESSABLE_CONTENT', 'HTTP_500_INTERNAL_SERVER_ERROR', diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py new file mode 100644 index 0000000..7083063 --- /dev/null +++ b/dump_things_service/abstract_config.py @@ -0,0 +1,171 @@ +import enum +import logging +from pathlib import ( + Path, + PurePosixPath, +) +from typing import Literal + +from pydantic import ( + BaseModel, + ConfigDict, + Field, +) + +from dump_things_service.mapping_functions import ( + MappingMethod, + mapping_functions, +) + + +logger = logging.getLogger('dump_things_service') + + +class StrictModel(BaseModel): + model_config = ConfigDict( + extra='forbid', + use_enum_values=True, + ) + + +class ConfigAuthSpec(BaseModel): + type: Literal['config'] = 'config' + + +class ForgejoAuthSpec(BaseModel): + type: Literal['forgejo'] + url: str + organization: str + team: str + label_type: Literal['team', 'user'] + 
repository: str | None = None + + +class TagSpec(BaseModel): + submitter_id_tag: str = 'http://purl.obolibrary.org/obo/NCIT_C54269' + submission_time_tag: str = 'http://semanticscience.org/resource/SIO_001083' + + +class RecordDirBackendConfig(StrictModel): + type: Literal['record_dir', 'record_dir+stl'] + mapping_method: MappingMethod = MappingMethod.digest_md5 + class Config: + use_enum_values = True + + +class SQLiteBackendConfig(StrictModel): + type: Literal['sqlite', 'sqlite+stl'] + + +class GitAuditBackendConfig(StrictModel): + type: Literal['gitaudit'] + path: Path + auto_flush_timeout: int = 60 + + +class CollectionConfig(BaseModel): + model_config = ConfigDict(extra='forbid', use_enum_values=True) + name: str + default_token: str + curated: PurePosixPath + schema: str + incoming: PurePosixPath | None = None + backend: RecordDirBackendConfig | SQLiteBackendConfig = RecordDirBackendConfig(type='record_dir+stl') + auth_sources: list[ForgejoAuthSpec | ConfigAuthSpec] = [ConfigAuthSpec()] + audit_backends: list[GitAuditBackendConfig] = [] + submission_tags: TagSpec = TagSpec() + use_classes: list[str] = [] + ignore_classes: list[str] = [] + + +class TokenModes(enum.Enum): + READ_CURATED = 'READ_CURATED' + READ_COLLECTION = 'READ_COLLECTION' + WRITE_COLLECTION = 'WRITE_COLLECTION' + READ_SUBMISSIONS = 'READ_SUBMISSIONS' + WRITE_SUBMISSIONS = 'WRITE_SUBMISSIONS' + SUBMIT = 'SUBMIT' + SUBMIT_ONLY = 'SUBMIT_ONLY' + NOTHING = 'NOTHING' + CURATOR = 'CURATOR' + ADMIN = 'ADMIN' + + +class TokenCollectionConfig(StrictModel): + model_config = ConfigDict(extra='forbid', use_enum_values=True) + mode: TokenModes + incoming_label: str = Field(strict=True) + + +class TokenConfig(StrictModel): + user_id: str + collections: dict[str, TokenCollectionConfig] + hashed: bool = False + + +dump_things_config_iri = 'dump_things:config' +dump_things_private_path = Path('__dump_things__') +config_backend_path = dump_things_private_path / 'config_store' +config_audit_path = 
dump_things_private_path / 'config_audit' +config_backend = None +config_audit = None + + +class Configuration(BaseModel): + collections: dict[str, CollectionConfig] = {} + tokens: dict[str, TokenConfig] = {} + pid: str = dump_things_config_iri + + +from dump_things_service.audit.gitaudit import GitAuditBackend +from dump_things_service.backends.record_dir import _RecordDirStore, RecordDirStore + + +def get_config_backends( + store_path: Path, +) -> tuple[_RecordDirStore, GitAuditBackend]: + global config_audit + global config_backend + + config_path = store_path / config_backend_path + if not config_path.exists(): + config_path.mkdir(parents=True) + + if config_backend is None: + config_backend = RecordDirStore( + config_path, + mapping_functions[MappingMethod.digest_md5], + 'yaml' + ) + + audit_path = store_path / config_audit_path + if not audit_path.exists(): + audit_path.mkdir(parents=True) + + if config_audit is None: + config_audit = GitAuditBackend(audit_path) + return config_backend, config_audit + + +def read_config(store_path) -> Configuration: + config_backend, _ = get_config_backends(store_path) + record_info = config_backend.get_record_by_iri(dump_things_config_iri) + return Configuration(**(record_info.json_object)) if record_info else Configuration() + + +def store_config( + store_path, + config: Configuration, +): + config_backend, audit_backend = get_config_backends(store_path) + json_object = config.model_dump(mode='json', exclude_none=True) + json_object['pid'] = dump_things_config_iri + config_backend.add_record( + iri=dump_things_config_iri, + class_name='DumpThingsConfig', + json_object=json_object + ) + audit_backend.add_record( + record=json_object, + committer_id='__dump_things_server__', + ) diff --git a/dump_things_service/admin.py b/dump_things_service/admin.py new file mode 100644 index 0000000..4b5c561 --- /dev/null +++ b/dump_things_service/admin.py @@ -0,0 +1,21 @@ +import sys + +from fastapi import HTTPException + +from 
dump_things_service import ( + HTTP_401_UNAUTHORIZED, +) +from dump_things_service.config import InstanceConfig + + +def authenticate_admin( + instance_config: InstanceConfig, + api_key: str, +): + print('IMPLEMENT: authenticate_admin() ', file=sys.stderr, flush=True) + if api_key != 'admin-1': + detail = f'invalid admin token: {api_key}' + raise HTTPException( + status_code=HTTP_401_UNAUTHORIZED, + detail=detail, + ) diff --git a/dump_things_service/audit/__init__.py b/dump_things_service/audit/__init__.py index c7520ce..58b8458 100644 --- a/dump_things_service/audit/__init__.py +++ b/dump_things_service/audit/__init__.py @@ -20,14 +20,7 @@ class AuditBackend(metaclass=ABCMeta): :param committer_id: the ID of the user who adds the record. :param author_id: the ID of the user who modified the record, defaults to `committer_id` if not given. - :return: A dictionary where the keys are time stamps of the changes, - the values are tuples containing the elements: - (user_id, diff, resulting_record), where user_id is the - `user_id` that was used in `add_record`, `resulting_record` is - the YAML-representation of `record` that was given to - `add_record`, and diff is path the transfers the previous - version of the record to the version provided in `record` (in - git-diff format). 
+ :return: None """ raise NotImplementedError diff --git a/dump_things_service/auth/config.py b/dump_things_service/auth/config.py index 3dc50ba..a4ce625 100644 --- a/dump_things_service/auth/config.py +++ b/dump_things_service/auth/config.py @@ -31,7 +31,7 @@ class ConfigAuthenticationSource(AuthenticationSource): ) -> AuthenticationInfo: token = self._resolve_hashed_token(token) - token_info = self.instance_config.tokens.get(self.collection, {}).get(token, missing) + token_info = self.instance_config.tokens_per_collection.get(self.collection, {}).get(token, missing) if token_info is missing: msg = f'Token not valid for collection `{self.collection}`' raise InvalidTokenError(msg) diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py new file mode 100644 index 0000000..069f007 --- /dev/null +++ b/dump_things_service/collection.py @@ -0,0 +1,477 @@ +import logging +import shutil +import sys +from pathlib import Path +from typing import Any + +from fastapi import ( + FastAPI, + HTTPException, +) +from pydantic import ( + BaseModel, + TypeAdapter, + ValidationError, +) + +from dump_things_service import ( + HTTP_400_BAD_REQUEST, + HTTP_403_FORBIDDEN, + HTTP_422_UNPROCESSABLE_CONTENT, +) +from dump_things_service.abstract_config import ( + CollectionConfig, + RecordDirBackendConfig, + SQLiteBackendConfig, +) +from dump_things_service.backends.sqlite import record_file_name as sqlite_db_filename +from dump_things_service.config import ( + InstanceConfig, + get_config, +) +from dump_things_service.converter import FormatConverter +from dump_things_service.exceptions import ( + ConfigError, + ConfigCollisionError, + CurieResolutionError, +) +from dump_things_service.model import get_model_for_schema +from dump_things_service.utils import ( + check_collection, + combine_ttl, + get_default_token_name, + get_token_store, + join_default_token_permissions, + wrap_http_exception, +) + + +# This following two lines are required for dynamic endpoint 
generation +from typing import Annotated +from fastapi import Body, Depends +from dump_things_service import Format +from dump_things_service.api_key import api_key_header_scheme +from starlette.responses import JSONResponse, PlainTextResponse + + +logger = logging.getLogger('dump_things_service') + +_endpoint_template = """ +async def {name}( + data: {model_var_name}.{class_name} | Annotated[str, Body(media_type='text/plain')], + api_key: str = Depends(api_key_header_scheme), + format: Format = Format.json, +) -> JSONResponse | PlainTextResponse: + logger.info('{name}(%s, %s, %s, %s)', repr(data), repr('{class_name}'), repr({model_var_name}), repr(format)) + return await {handler}('{collection}', data, '{class_name}', {model_var_name}, format, api_key) +""" + + +_endpoint_curated_template = """ +async def {name}( + data: {model_var_name}.{class_name}, + author_id: str | None = None, + api_key: str = Depends(api_key_header_scheme), +) -> JSONResponse: + logger.info( + '{name}(%s, %s, %s)', + repr(data), + repr(author_id), + repr({model_var_name}), + ) + return await store_curated_record( + '{collection}', + data, + '{class_name}', + author_id, + api_key, + ) +""" + +_endpoint_incoming_template = """ +async def {name}( + data: {model_var_name}.{class_name}, + label: str, + api_key: str = Depends(api_key_header_scheme), +) -> JSONResponse: + logger.info( + '{name}(%s, %s, %s)', + repr(data), + repr(label), + repr({model_var_name}), + ) + return await store_incoming_record( + '{collection}', + label, + data, + '{class_name}', + api_key, + ) +""" + + +def create_collection( + instance_config: InstanceConfig, + collection_configuration: CollectionConfig, +): + """Create a collection as specified by `collection_configuration` + + Reuse existing disk structures, if they are compatible. If they are not + compatible, raise an error. 
+ + :param instance_config: + :param collection_configuration: + :return: + """ + + curated_path = Path(instance_config.store_path / collection_configuration.curated) + incoming_path = ( + None + if collection_configuration.incoming is None + else Path(instance_config.store_path / collection_configuration.incoming) + ) + + # Check for compatibility of all existing stores before creating any + # structures on disk. + if curated_path.exists(): + check_store_compatibility( + curated_path, + collection_configuration.backend, + collection_configuration.schema, + ) + + if incoming_path and incoming_path.exists(): + check_store_compatibility( + incoming_path, + collection_configuration.backend, + collection_configuration.schema, + ) + + for audit_backend in collection_configuration.audit_backends: + audit_path = Path(instance_config.store_path / audit_backend.path) + if audit_path.exists(): + check_audit_compatibility(audit_path) + + # We knoe now that all existing structures are compatible with the + # collection specification. We record what was created in order to delete + # it in case of an error. 
+ created_directories = [] + try: + if not curated_path.exists(): + curated_path.mkdir(parents=True) + created_directories.append(curated_path) + + if incoming_path and not incoming_path.exists(): + incoming_path.mkdir(parents=True) + created_directories.append(incoming_path) + + for audit_backend in collection_configuration.audit_backends: + audit_path = Path(instance_config.store_path / audit_backend.path) + if not audit_path.exists(): + audit_path.mkdir(parents=True) + created_directories.append(audit_path) + + except ConfigError as e: + # Delete all directories that were created in this + for directory in created_directories: + shutil.rmtree(directory) + raise + + # Create the backends + create_backend( + curated_path, + collection_configuration.backend, + collection_configuration.schema, + ) + created_directories.append(curated_path) + + if incoming_path: + create_backend( + incoming_path, + collection_configuration.backend, + collection_configuration.schema, + ) + created_directories.append(incoming_path) + + # Create the audit log + for audit_backend in collection_configuration.audit_backends: + audit_path = Path(instance_config.store_path / audit_backend.path) + if not audit_path.exists(): + create_audit_store(audit_path) + created_directories.append(audit_path) + + # Create the dynamic endpoints for record storing & validation, for + # inbox-storing, and for curated area storing. 
+ create_endpoints_for_collection( + collection_configuration, + instance_config.fastapi_app, + ) + + # Create the collection configuration element + instance_config.xxx_collections[collection_configuration.name] = collection_configuration + + +def create_backend( + incoming_path: Path, + backend_config: RecordDirBackendConfig | SQLiteBackendConfig, + schema: str, +): + assert backend_config.type == 'record_dir+stl' + + print(f'Incoming path: {incoming_path}') + print(f'backend spec: {backend_config}') + print(f'schema: {schema}') + + (incoming_path / '.dumpthings.yaml').write_text(f""" +type: records +version: 1 +schema: {schema} +format: yaml +idfx: {backend_config.mapping_method} +""" + ) + + +def create_audit_store(*args, **kwargs): + return + + +def check_store_compatibility( + store_path: Path, + backend_config: RecordDirBackendConfig|SQLiteBackendConfig, + schema: str, +): + """Check if an existing store is compatible with the specs + + :param store_path: + :param backend_config: + :param schema: + :return: + """ + if not store_path.exists(): + return + if isinstance(backend_config, RecordDirBackendConfig): + check_record_dir_compatibility(store_path, backend_config, schema) + elif isinstance(backend_config, SQLiteBackendConfig): + check_sqlite_compatibility(store_path, backend_config, schema) + else: + raise ConfigError( + f"Unsupported backend config type: '{type(backend_config)}'" + ) + return + + +def check_record_dir_compatibility( + store_path: Path, + backend_config: RecordDirBackendConfig, + schema: str, +): + from dump_things_service.config import Config + + record_dir_config = Config.get_collection_dir_config(store_path) + if record_dir_config.schema != schema: + raise ConfigCollisionError(f"Existing backend uses a different schema: '{record_dir_config.schema}'") + + stored_mapping_method = record_dir_config.idfx.value + if stored_mapping_method != backend_config.mapping_method: + msg = f"Configuration specifies mapping method 
'{backend_config.mapping_method}', existing backend uses mapping method: '{stored_mapping_method}'" + raise ConfigCollisionError(msg) + return + + +def check_sqlite_compatibility( + store_path: Path, + backend_config: SQLiteBackendConfig, + schema: str, +): + sqlite_db_path = Path(store_path / sqlite_db_filename) + if not sqlite_db_path.exists(): + raise ConfigError('No sqlite database found in existing store') + return + + +def check_audit_compatibility( + audit_path: Path, +): + """Check if an existing audit is compatible with the specs + + :param audit_path: + :return: + """ + if not audit_path.exists(): + return + print('IMPLEMENT: check_audit_compatibility', file=sys.stderr, flush=True) + + + +# store_record +# validate_record +# store_curated_record +# store_incoming_record + + +def create_endpoint( + operation_name: str, + operation_path: str, + collection_config: CollectionConfig, + template: str, + handler: str, + tag_name: str, + app: FastAPI, +): + logger.info( + f'Creating %s-endpoints for collection: "%s"', + operation_name, + collection_config.name, + ) + + model, classes, model_var_name = get_model_for_schema(collection_config.schema) + globals()[model_var_name] = model + + use_classes = set(classes) + if collection_config.use_classes: + use_classes &= set(collection_config.use_classes) + + if collection_config.ignore_classes: + use_classes -= set(collection_config.ignore_classes) + + for class_name in use_classes: + endpoint_name = f'_endpoint_{collection_config.name}_{operation_name}_{class_name}' + endpoint_source = template.format( + name=endpoint_name, + model_var_name=model_var_name, + class_name=class_name, + collection=collection_config.name, + info=f"'{operation_name} {collection_config.name}/{class_name} objects'", + handler=handler, + ) + exec(endpoint_source, globals()) # noqa S102 + + # Create an API route for the endpoint + app.add_api_route( + path=f'/{collection_config.name}/{operation_path}/{class_name}', + 
endpoint=globals()[endpoint_name], + methods=['POST'], + name=f'{operation_name} "{class_name}" object (schema: {model.linkml_meta["id"]})', + response_model=None, + tags=[tag_name] + ) + + logger.info( + 'Creation of %d %s-endpoints completed.', + len(use_classes), + operation_name, + ) + + +def create_endpoints_for_collection( + collection_config: CollectionConfig, + app: FastAPI, +): + for ( + operation_name, + operation_path, + template, + handler, + tag_name, + ) in ( + ('store', 'record', _endpoint_template, 'store_record', f'Write records to collection "{collection_config.name}"'), + ('validate', 'validate', _endpoint_template, 'validate_record', f'Validate records for collection "{collection_config.name}"'), + ('curated', 'curated/record', _endpoint_template, 'store_curated_record', f'Store records in curated area of collection "{collection_config.name}"'), + ('incoming', 'incoming/{label}/record', _endpoint_template, 'store_incoming_record', f'Store records in incoming area "{{label}}" of collection "{collection_config.name}"'), + ): + create_endpoint( + operation_name=operation_name, + operation_path=operation_path, + collection_config=collection_config, + template=template, + handler=handler, + tag_name=tag_name, + app=app, + ) + + +def store_record( + collection: str, + data: BaseModel | str, + class_name: str, + model: Any, + input_format: Format, + api_key: str | None = Depends(api_key_header_scheme), +) -> JSONResponse | PlainTextResponse: + if input_format == Format.json and isinstance(data, str): + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail='Invalid JSON data provided.' + ) + + if input_format == Format.ttl and not isinstance(data, str): + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail='Invalid ttl data provided.' 
+ ) + + instance_config = get_config() + check_collection(instance_config, collection) + + token = ( + get_default_token_name(instance_config, collection) + if api_key is None + else api_key + ) + + # Get the token permissions and extend them by the default permissions. + # This call will also convert plaintext tokens into the hashed version of + # the token, if the token is hashed. This is necessary because we do not + # store the plaintext token, so all token-information is associated with + # the hashed representation of the token. + store, token, token_permissions, user_id = get_token_store( + instance_config, + collection, + token, + ) + final_permissions = join_default_token_permissions( + instance_config, token_permissions, collection + ) + if not final_permissions.incoming_write: + raise HTTPException( + status_code=HTTP_403_FORBIDDEN, + detail=f"Not authorized to submit to collection '{collection}'.", + ) + + if input_format == Format.ttl: + with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Conversion error'): + json_object = FormatConverter( + instance_config.schemas[collection], + input_format=Format.ttl, + output_format=Format.json, + ).convert(data, class_name) + with wrap_http_exception(ValidationError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): + record = TypeAdapter(getattr(model, class_name)).validate_python(json_object) + else: + record = data + + with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): + instance_config.validators[collection].validate(record) + + with wrap_http_exception(CurieResolutionError): + stored_records = store.store_object(obj=record, submitter=user_id) + + if input_format == Format.ttl: + format_converter = FormatConverter( + instance_config.schemas[collection], + input_format=Format.json, + output_format=Format.ttl, + ) + with wrap_http_exception(ValueError, header='Conversion error'): + return 
PlainTextResponse( + combine_ttl( + [ + format_converter.convert( + record, + class_name, + ) + for class_name, record in stored_records + ] + ), + media_type='text/turtle', + ) + return JSONResponse([record for _, record in stored_records]) diff --git a/dump_things_service/collection_endpoints.py b/dump_things_service/collection_endpoints.py new file mode 100644 index 0000000..56dd3a0 --- /dev/null +++ b/dump_things_service/collection_endpoints.py @@ -0,0 +1,192 @@ +import logging +import random +import sys +from pathlib import ( + Path, + PurePosixPath, +) +from typing import ( + Literal, + cast, +) +from urllib.parse import quote + +from fastapi import ( + APIRouter, + Depends, + HTTPException, + Response, +) +from pydantic import BaseModel + +from dump_things_service import ( + HTTP_201_CREATED, + HTTP_401_UNAUTHORIZED, + HTTP_409_CONFLICT, +) +from dump_things_service.abstract_config import read_config, store_config, Configuration +from dump_things_service.admin import authenticate_admin +from dump_things_service.api_key import api_key_header_scheme +from dump_things_service.manifest import manifest_configuration +from dump_things_service.config import ( + Config, + InstanceConfig, + get_config, +) +from dump_things_service.exceptions import ConfigError +from dump_things_service.utils import check_collection, wrap_http_exception + +logger = logging.getLogger('dump_things_service') +router = APIRouter() + + +class ConfigAuthSpec(BaseModel): + type: Literal['config'] = 'config' + + +class ForgejoAuthSpec(BaseModel): + type: Literal['forgejo'] + url: str + organization: str + team: str + label_type: Literal['team', 'user'] + repository: str | None = None + + +class TagSpec(BaseModel): + submitter_id_tag: str = 'http://purl.obolibrary.org/obo/NCIT_C54269' + submission_time_tag: str = 'http://semanticscience.org/resource/SIO_001083' + + +class CreateCollectionRequest(BaseModel): + name: str + default_token: str + schema: str + curated: PurePosixPath + incoming: 
PurePosixPath | None = None + backend: str = 'record_dir+stl' + auth_sources: list[ForgejoAuthSpec | ConfigAuthSpec] = [ConfigAuthSpec()] + submission_tags: TagSpec = TagSpec() + use_classes: list[str] | None = None + ignore_classes: list[str] | None = None + + +class CollectionResponse(BaseModel): + name: str + default_token: str + schema: str + curated: PurePosixPath + incoming: PurePosixPath | None = None + backend: str = 'record_dir+stl' + auth_sources: list[ForgejoAuthSpec | ConfigAuthSpec] = [ConfigAuthSpec()] + submission_tags: TagSpec = TagSpec() + use_classes: list[str] | None = None + ignore_classes: list[str] | None = None + + +@router.post( + '/collections', + tags=['Administration interface'], + name='Create a new collection', + status_code=HTTP_201_CREATED, +) +async def create_collection( + response: Response, + body: CreateCollectionRequest, + api_key: str = Depends(api_key_header_scheme), +): + + instance_config = get_config() + + # Check admin rights + authenticate_admin(instance_config, api_key) + + # TODO: read the current abstract configuration, check for a collection + # of the given name. If it does not exist yet, add a collection + # configuration that reflects the `body`. Then try to manifest the + # new configuration. If there are no errors, persist the new + # configuration. 
+ configuration: Configuration = read_config( + store_path=instance_config.store_path + ) + + # Check for existing collection name + if body.name in configuration.collections: + raise HTTPException( + status_code=HTTP_409_CONFLICT, + detail=f"Collection with name '{body.name}' already exists.", + ) + + # Update the abstract configuration + configuration.collections[body.name] = Configuration( + **(body.model_dump(mode='json')), + ) + + # Manifest the abstract configuration + with wrap_http_exception(ConfigError): + manifest_configuration(configuration, instance_config) + + # Persist the configuration + store_config( + store_path=instance_config.store_path, + config=configuration, + ) + + response.headers['Location'] = f'/collections/{quote(body.name)}' + + +@router.get( + '/collections', + tags=['Administration interface'], + name='Get existing collections', +) +async def get_tokens( + api_key: str = Depends(api_key_header_scheme), +) -> list[CollectionResponse]: + + instance_config = get_config() + + # Check admin rights + authenticate_admin(instance_config, api_key) + return list(instance_config.xxx_tokens.values()) + + +x = """ +def create_or_reuse( + instance_config: InstanceConfig, + local_path: PurePosixPath, + schema_location: str, + backend_spec: str, +): + full_path = Path(instance_config.store_path / local_path) + if full_path.exists(): + ensure_backend_type(full_path, schema_location, backend_spec) + else: + full_path.mkdir(parents=True, exist_ok=False) + create_backend(instance_config, full_path, schema_location, backend_spec) + + +def ensure_backend_type( + path: Path, + schema_location: str, + backend_spec: str, +): + backend_name, extension = get_backend_and_extension(backend_spec) + if backend_name is 'record_dir': + try: + config = Config.get_collection_dir_config(path) + except ConfigError as e: + raise HTTPException( + status_code=HTTP_409_CONFLICT, + detail=f"did not find record_dir store in '{path}', reason: {e}", + ) from e + if config.schema 
!= schema_location: + raise HTTPException( + status_code=HTTP_409_CONFLICT, + detail=f"existing record_dir store has different schema: '{config.schema}'", + ) + elif backend_name is 'sqlite': + pass + else: + raise ValueError(f"unknown backend type: '{backend_spec}'") +""" diff --git a/dump_things_service/config.py b/dump_things_service/config.py index aa373ca..778738d 100644 --- a/dump_things_service/config.py +++ b/dump_things_service/config.py @@ -14,7 +14,10 @@ from typing import ( ) import yaml -from fastapi import HTTPException +from fastapi import ( + FastAPI, + HTTPException, +) from pydantic import ( BaseModel, ConfigDict, @@ -34,6 +37,7 @@ from dump_things_service.backends.sqlite import SQLiteBackend from dump_things_service.backends.sqlite import ( record_file_name as sqlite_record_file_name, ) + from dump_things_service.converter import FormatConverter, get_conversion_objects from dump_things_service.exceptions import ( ConfigError, @@ -93,10 +97,11 @@ class TokenModes(enum.Enum): SUBMIT_ONLY = 'SUBMIT_ONLY' NOTHING = 'NOTHING' CURATOR = 'CURATOR' + ADMIN = 'ADMIN' class TokenCollectionConfig(BaseModel): - model_config = ConfigDict(extra='forbid') + model_config = ConfigDict(extra='forbid', use_enum_values=True) mode: TokenModes incoming_label: str = Field(strict=True) @@ -165,6 +170,8 @@ class GlobalConfig(StrictModel): @dataclasses.dataclass class InstanceConfig: store_path: Path + fastapi_app: FastAPI + order_by: list[str] = dataclasses.field(default_factory=list) collections: dict = dataclasses.field(default_factory=dict) all_stores: dict = dataclasses.field(default_factory=dict) curated_stores: dict = dataclasses.field(default_factory=dict) @@ -183,6 +190,9 @@ class InstanceConfig: use_classes: dict = dataclasses.field(default_factory=dict) maintenance_mode: set = dataclasses.field(default_factory=set) audit_backends: dict = dataclasses.field(default_factory=dict) + xxx_tokens: dict = dataclasses.field(default_factory=dict) + xxx_collections: dict 
= dataclasses.field(default_factory=dict) + mode_mapping = { TokenModes.READ_CURATED: TokenPermission(curated_read=True), @@ -210,6 +220,14 @@ mode_mapping = { curated_write=True, zones_access=True, ), + TokenModes.ADMIN: TokenPermission( + curated_read=True, + incoming_read=True, + incoming_write=True, + curated_write=True, + zones_access=True, + admin=True, + ) } @@ -313,6 +331,31 @@ class Config: raise ConfigError(msg) from e +def new_process_config( + store_path: Path, + fastapi_app: FastAPI, + order_by: list[str], + globals_dict: dict[str, Any], +) -> InstanceConfig: + global global_config_instance + + from dump_things_service.abstract_config import read_config + from dump_things_service.manifest import manifest_configuration + + abstract_configuration = read_config(store_path) + global_config_instance = InstanceConfig( + store_path=store_path, + fastapi_app=fastapi_app, + order_by=order_by, + ) + manifest_configuration(abstract_configuration, global_config_instance) + return global_config_instance + + +def get_config(): + return global_config_instance + + def process_config( store_path: Path, config_file: Path, @@ -331,10 +374,6 @@ def process_config( return global_config_instance -def get_config(): - return global_config_instance - - def process_config_object( store_path: Path, config_object: GlobalConfig, diff --git a/dump_things_service/exceptions.py b/dump_things_service/exceptions.py index aa703e7..8a3b8ae 100644 --- a/dump_things_service/exceptions.py +++ b/dump_things_service/exceptions.py @@ -2,5 +2,9 @@ class ConfigError(Exception): pass +class ConfigCollisionError(ConfigError): + pass + + class CurieResolutionError(Exception): pass diff --git a/dump_things_service/main.py b/dump_things_service/main.py index 39fccec..acd29c0 100644 --- a/dump_things_service/main.py +++ b/dump_things_service/main.py @@ -76,9 +76,14 @@ from dump_things_service.lazy_list import ( ModifierList, ) from dump_things_service.model import ( - get_classes, get_subclasses, ) 
+from dump_things_service.collection_endpoints import ( + router as collection_router, +) +from dump_things_service.token_endpoints import ( + router as token_router, +) from dump_things_service.utils import ( authenticate_token, check_bounds, @@ -261,8 +266,10 @@ app = FastAPI( version=__version__, openapi_tags=tag_info ) +app.include_router(collection_router) app.include_router(curated_router) app.include_router(incoming_router) +app.include_router(token_router) def store_record( @@ -442,6 +449,8 @@ async def maintenance( api_key: str | None = Depends(api_key_header_scheme), ): + + if api_key is None: raise HTTPException( status_code=HTTP_400_BAD_REQUEST, diff --git a/dump_things_service/manifest.py b/dump_things_service/manifest.py new file mode 100644 index 0000000..1f36150 --- /dev/null +++ b/dump_things_service/manifest.py @@ -0,0 +1,118 @@ +import logging + +from dump_things_service.abstract_config import ( + Configuration, + TokenConfig, +) +from dump_things_service.collection import create_collection +from dump_things_service.config import InstanceConfig + + +logger = logging.getLogger('dump_things_service') + + +def manifest_configuration( + configuration: Configuration, + instance_config: InstanceConfig, +): + """Interpret the configuration and instantiate respective objects + + For every collection in the configuration this method will: + + - create a `ModelStore`-instance with correct `Backend`-instances and + check for compatibility with existing data + - create configured `AuthenticationSource`-instances + - create schema-related objects + - add schema class-specific http-endpoints to: + -- validate records + -- create records in the user's inbox + -- create records in the curated area + -- create records in a specific incoming area + + Objects and endpoints that belong to a non-existing configuration are + deleted. + + If objects for a collection do already exist, they are kept unmodified + and are not validated. 
That means changes of existing configuration objects + are not possible. To modify a collection or token configuration, the + configuration has to be deleted and created in the new state. + + If case of an error in the configuration, no objects will be create for + the respective collection or token. + + Tokens can be updated. Collections can not be updated, to modify the + configuration of a collection, the collection must be deleted and + recreated with the modified configuration. Deleting a collection will + not delete the data of the collection. If the collection is recreated with + the same backend- and directory-configuration, the data will be accessible + in the new collection as well. It is, however, not recommended to update + the schema of a collection. This will most likely break the service on this + collection. + + When collections are deleted, some tokens might still refer to them, this + is silently ignored. This supports to delete a collection and delete or + modify the token later. + + Default-tokens are not validated when a new collection is created. This + allows to first create a collection and then the default token. The cost + is that a "default-token unknown" error might be created when accessing + a collection. + """ + + # Determine the changes in collections. + existing_collections = set(instance_config.collections) + configured_collections = set(configuration.collections) + new_collection_names = configured_collections - existing_collections + deleted_collection_names = existing_collections - configured_collections + + # Delete collection objects of collections that are no longer in the + # configuration (we do not delete the collection from token-objects here + # because token-objects are all re-created below). + for collection_name in deleted_collection_names: + delete_collection(instance_config, collection_name) + + # Create the internal representation objects for collections that have been + # added to the configuration. 
+ for collection_name in new_collection_names: + create_collection( + instance_config, + configuration.collections[collection_name], + ) + + # Delete all token objects and recreate the tokens. This ensures that + # modified token scope and permissions are set for all tokens. + for token_name in list(instance_config.tokens): + delete_token(instance_config, token_name) + + for token_name, token_configuration in configuration.tokens.items(): + create_token( + instance_config, + token_name, + token_configuration, + ) + + if new_collection_names or deleted_collection_names: + instance_config.fastapi_app.openapi_schema = None + instance_config.fastapi_app.setup() + + +def create_token( + instance_config: InstanceConfig, + token_name: str, + token_configuration: TokenConfig, +): + instance_config.xxx_tokens[token_name] = token_configuration + + +def delete_token( + global_objects: InstanceConfig, + token_name: str, +): + global_objects.tokens.pop(token_name) + + +def delete_collection( + global_objects: InstanceConfig, + collection_name: str, +): + global_objects.collections.pop(collection_name) diff --git a/dump_things_service/mapping_functions.py b/dump_things_service/mapping_functions.py new file mode 100644 index 0000000..31587c6 --- /dev/null +++ b/dump_things_service/mapping_functions.py @@ -0,0 +1,82 @@ +import enum +import hashlib +from functools import partial +from pathlib import Path +from typing import Callable, Literal + +from pydantic import BaseModel, ConfigDict + + +class MappingMethod(enum.Enum): + digest_md5 = 'digest-md5' + digest_md5_p3 = 'digest-md5-p3' + digest_md5_p3_p3 = 'digest-md5-p3-p3' + digest_sha1 = 'digest-sha1' + digest_sha1_p3 = 'digest-sha1-p3' + digest_sha1_p3_p3 = 'digest-sha1-p3-p3' + after_last_colon = 'after-last-colon' + + +class CollectionDirConfigContent(BaseModel): + model_config = ConfigDict(extra='forbid', use_enum_values=True) + type: Literal['records'] + version: Literal[1] + schema: str + format: Literal['yaml'] + idfx: 
MappingMethod + + +def get_hex_digest(hasher: Callable, data: str) -> str: + hash_context = hasher(data.encode()) + return hash_context.hexdigest() + + +def mapping_digest_p3( + hasher: Callable, + pid: str, + suffix: str, +) -> Path: + hex_digest = get_hex_digest(hasher, pid) + return Path(hex_digest[:3]) / (hex_digest[3:] + '.' + suffix) + + +def mapping_digest_p3_p3( + hasher: Callable, + pid: str, + suffix: str, +) -> Path: + hex_digest = get_hex_digest(hasher, pid) + return Path(hex_digest[:3]) / hex_digest[3:6] / (hex_digest[6:] + '.' + suffix) + + +def mapping_digest(hasher: Callable, pid: str, suffix: str) -> Path: + hex_digest = get_hex_digest(hasher, pid) + return Path(hex_digest + '.' + suffix) + + +def mapping_after_last_colon(pid: str, suffix: str) -> Path: + plain_result = pid.split(':')[-1] + # Escape any colons and slashes in the pid + escaped_result = ( + plain_result.replace('_', '__').replace('/', '_s').replace('.', '_d') + ) + return Path(escaped_result + '.' + suffix) + + +mapping_functions = { + MappingMethod.digest_md5: partial(mapping_digest, hashlib.md5), + MappingMethod.digest_md5_p3: partial(mapping_digest_p3, hashlib.md5), + MappingMethod.digest_md5_p3_p3: partial(mapping_digest_p3_p3, hashlib.md5), + MappingMethod.digest_sha1: partial(mapping_digest, hashlib.sha1), + MappingMethod.digest_sha1_p3: partial(mapping_digest_p3, hashlib.sha1), + MappingMethod.digest_sha1_p3_p3: partial(mapping_digest_p3_p3, hashlib.sha1), + MappingMethod.after_last_colon: mapping_after_last_colon, +} + + +def get_mapping_function_by_name(mapping_function_name: str) -> Callable: + return mapping_functions[MappingMethod(mapping_function_name)] + + +def get_mapping_function(collection_config: CollectionDirConfigContent): + return mapping_functions[collection_config.idfx] diff --git a/dump_things_service/tests/test_token_endpoints.py b/dump_things_service/tests/test_token_endpoints.py new file mode 100644 index 0000000..d605d16 --- /dev/null +++ 
b/dump_things_service/tests/test_token_endpoints.py @@ -0,0 +1,45 @@ +from starlette.status import HTTP_409_CONFLICT + +from dump_things_service import HTTP_201_CREATED + + +def test_token_creation(fastapi_client_simple): + test_client, _ = fastapi_client_simple + + json_record = { + 'name': 'a', + 'user_id': 'u_a', + 'representation': '8bb6805ff10bcb1c2ca49dcd4bfef94d', + 'collection_info': { + 'collection_1': { + 'mode': 'WRITE_COLLECTION', + 'incoming_label': 'i_a' + } + } + } + + # Create a token eith name 'a' + response = test_client.post( + '/tokens', + headers={'x-dumpthings-token': 'admin-1'}, + json=json_record, + ) + assert response.status_code == HTTP_201_CREATED + + # Try to create another token eith name 'a', should result in a 4ß9-error + response = test_client.post( + '/tokens', + headers={'x-dumpthings-token': 'admin-1'}, + json=json_record, + ) + assert response.status_code == HTTP_409_CONFLICT + + # Try to create another token eith name 'b' and the same representation + # as 'a', should result in a 4ß9-error + json_record['name'] = 'b' + response = test_client.post( + '/tokens', + headers={'x-dumpthings-token': 'admin-1'}, + json=json_record, + ) + assert response.status_code == HTTP_409_CONFLICT diff --git a/dump_things_service/token.py b/dump_things_service/token.py index 15cb8ba..d8d3ddd 100644 --- a/dump_things_service/token.py +++ b/dump_things_service/token.py @@ -9,6 +9,7 @@ class TokenPermission(BaseModel): incoming_write: bool = False curated_write: bool = False zones_access: bool = False + admin: bool = False def get_token_parts(token: str) -> list[str]: diff --git a/dump_things_service/token_endpoints.py b/dump_things_service/token_endpoints.py new file mode 100644 index 0000000..e298430 --- /dev/null +++ b/dump_things_service/token_endpoints.py @@ -0,0 +1,193 @@ +import hashlib +import logging +import random +import sys +from typing import cast +from urllib.parse import quote + +from fastapi import ( + APIRouter, + Depends, + 
HTTPException, + Response, +) +from fastapi_pagination import ( + Page, + add_pagination, + paginate, +) +from pydantic import BaseModel +from starlette.status import HTTP_404_NOT_FOUND + +from dump_things_service import ( + HTTP_201_CREATED, + HTTP_401_UNAUTHORIZED, + HTTP_409_CONFLICT, +) +from dump_things_service.admin import authenticate_admin +from dump_things_service.api_key import api_key_header_scheme +from dump_things_service.config import ( + InstanceConfig, + TokenModes, + get_config, +) +from dump_things_service.utils import check_collection + +logger = logging.getLogger('dump_things_service') +router = APIRouter() +#add_pagination(router) + + +class TokenCollectionInfo(BaseModel): + mode: TokenModes + incoming_label: str + + +class TokenRequestBase(BaseModel): + name: str + user_id: str + curated: str + incoming: str + collection_info: dict[str, TokenCollectionInfo] + + +class CreateTokenRequest(TokenRequestBase): + representation: str | None = None + + +class TokenResponse(TokenRequestBase): + representation: str + + +def get_token_parts(token: str) -> list[str]: + parts = token.split('-', 1) + if len(parts) != 2: + msg = 'Invalid token format' + raise ValueError(msg) + return parts + + +def hash_token(token: str) -> str: + parts = get_token_parts(token) + hasher = hashlib.sha256() + hasher.update(parts[1].encode()) + return f'{parts[0]}-{hasher.hexdigest()}' + + +@router.post( + '/tokens', + tags=['Administration interface'], + name='Create a new token', + status_code=HTTP_201_CREATED, +) +async def create_token( + response: Response, + body: CreateTokenRequest, + api_key: str = Depends(api_key_header_scheme), +) -> TokenResponse: + + instance_config = get_config() + + # Check admin rights + authenticate_admin(instance_config, api_key) + + # Check for existing token-name + if body.name in instance_config.xxx_tokens: + raise HTTPException( + status_code=HTTP_409_CONFLICT, + detail=f"Token with name '{body.name}' already exists.", + ) + + # Ensure that 
all specified collections and modes exist + for collection_name, token_collection_info in body.collection_info.items(): + check_collection(instance_config, collection_name) + print(f'IMPLEMENT: check incoming label ({token_collection_info.incoming_label}), check mode ({token_collection_info.mode})', file=sys.stderr, flush=True) + # TODO: check mode(!), check incoming_label(?) + + if body.representation: + # We have a specific representation, check that it is not already used + for token in instance_config.xxx_tokens.values(): + if token.representation == body.representation: + raise HTTPException( + status_code=HTTP_409_CONFLICT, + detail=f"Representation '{body.representation}' already exists.", + ) + else: + # Generate a random representation that does not yet exist. + collision = True + while collision: + body.representation = random.randbytes(24).hex() + collision = any( + map( + lambda t: t.representation == body.representation, + instance_config.xxx_tokens + ) + ) + + # Store the new token in the configuration + instance_config.xxx_tokens[body.name] = TokenResponse( + name = body.name, + user_id = body.user_id, + collection_info=body.collection_info, + representation=cast(str, body.representation), + ) + + response.headers['Location'] = f'/tokens/{quote(body.name)}' + return TokenResponse( + name=body.name, + user_id=body.user_id, + representation=cast(str, body.representation), + collection_info=body.collection_info, + ) + + +@router.get( + '/tokens', + tags=['Administration interface'], + name='Get existing tokens', +) +async def get_tokens( + api_key: str = Depends(api_key_header_scheme), +) -> list[TokenResponse]: + + instance_config = get_config() + + # Check admin rights + authenticate_admin(instance_config, api_key) + return list(instance_config.xxx_tokens.values()) + + +def authenticate_admin( + instance_config: InstanceConfig, + api_key: str, +): + print('IMPLEMENT: authenticate_admin() ', file=sys.stderr, flush=True) + if api_key != 'admin-1': + 
detail = f'invalid admin token: {api_key}' + raise HTTPException( + status_code=HTTP_401_UNAUTHORIZED, + detail=detail, + ) + + +@router.get( + '/tokens/{token_name}', + tags=['Administration interface'], + name='Get token by name', +) +async def get_token_with_name( + token_name: str, + api_key: str = Depends(api_key_header_scheme), +) -> TokenResponse: + + instance_config = get_config() + + # Check admin rights + authenticate_admin(instance_config, api_key) + + if token_name not in instance_config.xxx_tokens: + raise HTTPException( + status_code=HTTP_404_NOT_FOUND, + detail=f"token with name '{token_name}' does not exist.", + ) + return instance_config.xxx_tokens[token_name] -- 2.52.0 From 8e7c82dd75c156a1e479508d0e0dfb1798eea5ec Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 22 Apr 2026 14:02:00 +0200 Subject: [PATCH 03/64] introduce abstract_configuration concept --- dump_things_service/abstract_config.py | 214 +++++- dump_things_service/admin.py | 4 +- dump_things_service/auth/config.py | 56 +- dump_things_service/collection.py | 88 ++- dump_things_service/collection_endpoints.py | 59 +- dump_things_service/commands/check_pids.py | 8 +- dump_things_service/config.py | 20 +- dump_things_service/curated.py | 4 +- dump_things_service/dynamic_endpoints.py | 31 +- dump_things_service/incoming.py | 10 +- dump_things_service/instance_state.py | 769 ++++++++++++++++++++ dump_things_service/manifest.py | 32 +- dump_things_service/token.py | 11 +- dump_things_service/token_endpoints.py | 146 ++-- dump_things_service/utils.py | 247 +++---- 15 files changed, 1306 insertions(+), 393 deletions(-) create mode 100644 dump_things_service/instance_state.py diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index 7083063..3fa19e1 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -4,14 +4,29 @@ from pathlib import ( Path, PurePosixPath, ) -from typing import Literal +from typing 
import ( + Iterable, + Literal, +) +from fastapi import HTTPException from pydantic import ( BaseModel, ConfigDict, Field, ) +from dump_things_service import ( + HTTP_401_UNAUTHORIZED, + HTTP_403_FORBIDDEN, + HTTP_404_NOT_FOUND, + HTTP_503_SERVICE_UNAVAILABLE, +) +from dump_things_service.audit.gitaudit import GitAuditBackend +from dump_things_service.backends.record_dir import ( + _RecordDirStore, + RecordDirStore, +) from dump_things_service.mapping_functions import ( MappingMethod, mapping_functions, @@ -20,6 +35,8 @@ from dump_things_service.mapping_functions import ( logger = logging.getLogger('dump_things_service') +g_abstract_configuration = None + class StrictModel(BaseModel): model_config = ConfigDict( @@ -91,6 +108,15 @@ class TokenModes(enum.Enum): ADMIN = 'ADMIN' +class TokenPermission(BaseModel): + curated_read: bool = False + incoming_read: bool = False + incoming_write: bool = False + curated_write: bool = False + zones_access: bool = False + admin: bool = False + + class TokenCollectionConfig(StrictModel): model_config = ConfigDict(extra='forbid', use_enum_values=True) mode: TokenModes @@ -101,6 +127,7 @@ class TokenConfig(StrictModel): user_id: str collections: dict[str, TokenCollectionConfig] hashed: bool = False + representation: str = '' dump_things_config_iri = 'dump_things:config' @@ -117,10 +144,6 @@ class Configuration(BaseModel): pid: str = dump_things_config_iri -from dump_things_service.audit.gitaudit import GitAuditBackend -from dump_things_service.backends.record_dir import _RecordDirStore, RecordDirStore - - def get_config_backends( store_path: Path, ) -> tuple[_RecordDirStore, GitAuditBackend]: @@ -147,16 +170,28 @@ def get_config_backends( return config_backend, config_audit -def read_config(store_path) -> Configuration: - config_backend, _ = get_config_backends(store_path) - record_info = config_backend.get_record_by_iri(dump_things_config_iri) - return Configuration(**(record_info.json_object)) if record_info else Configuration() 
+def read_config( + store_path: Path, +) -> Configuration: + global g_abstract_configuration + + if not g_abstract_configuration: + config_backend, _ = get_config_backends(store_path) + record_info = config_backend.get_record_by_iri(dump_things_config_iri) + g_abstract_configuration = ( + Configuration(**(record_info.json_object)) + if record_info + else Configuration() + ) + return g_abstract_configuration def store_config( store_path, config: Configuration, ): + global g_abstract_configuration + config_backend, audit_backend = get_config_backends(store_path) json_object = config.model_dump(mode='json', exclude_none=True) json_object['pid'] = dump_things_config_iri @@ -169,3 +204,164 @@ def store_config( record=json_object, committer_id='__dump_things_server__', ) + g_abstract_configuration = config + + +def tokens_for_collection( + config: Configuration, + collection: str, +) -> Iterable[TokenConfig]: + yield from ( + token + for token_name, token in config.tokens.items() + if collection in token.collections + ) + + +def get_zone( + configuration: Configuration, + collection: str, + token: str, +) -> str | None: + """Get the zone for the given collection and token.""" + check_collection(configuration, collection) + + assert False + if collection not in configuration.collections: + raise HTTPException( + status_code=HTTP_404_NOT_FOUND, + detail=f'No incoming zone defined for collection: {collection}', + ) + if token not in instance_config.zones[collection]: + raise HTTPException( + status_code=HTTP_404_NOT_FOUND, + detail=f'Missing incoming_label for given token in collection: {collection}', + ) + return instance_config.zones[collection][token] + + +def check_collection( + abstract_config: Configuration, + collection: str, +): + if collection not in abstract_config.collections: + raise HTTPException( + status_code=HTTP_404_NOT_FOUND, + detail=f"No such collection: '{collection}'.", + ) + + +def check_label( + abstract_config: Configuration, + collection: str, + 
label: str, +): + """Check that a label exists in a collection configuration or on disk""" + if ( + label not in get_config_labels(abstract_config, collection) + and label not in get_on_disk_labels(abstract_config, collection) + ): + raise HTTPException( + status_code=HTTP_404_NOT_FOUND, + detail=f"No incoming label: '{label}' in collection: '{collection}'.", + ) + + +def get_config_labels( + abstract_config: Configuration, + collection: str, +) -> set[str]: + check_collection(abstract_config, collection) + return { + token.collections[collection].incoming_label + for token in tokens_for_collection(abstract_config, collection) + if token.collections[collection].incoming_label + } + + +def get_default_token_name( + abstract_config: Configuration, + collection: str +) -> str: + check_collection(abstract_config, collection) + return abstract_config.collections[collection].default_token + + +def get_token_info_by_representation( + abstract_config: Configuration, + token_representation: str, +) -> tuple[str, TokenConfig] | None: + """Get the name of the token given in `token_representation`""" + hashed_representation = hashlib.sha1(token_representation.encode()).hexdigest() + for token_name, token_config in abstract_config.tokens.items(): + if token_config.hashed: + compare_representation = hashed_representation + else: + compare_representation = token_representation + if compare_representation == token_config.representation: + return token_name, token_config + return None + + +def get_token_config_by_name( + abstract_config: Configuration, + token_name: str, +) -> TokenConfig | None: + return abstract_config.tokens.get(token_name) + + +def get_token_infos_for_collection( + abstract_config: Configuration, + collection_name: str, +) -> Iterable[tuple[str, TokenConfig, TokenCollectionConfig]]: + + yield from { + (token_name, token_config, token_collection_config) + for token_name, token_config in abstract_config.tokens.items() + for token_collection_config in 
token_config.collections.get(collection_name) + if token_config is not None + } + + +def get_token_config_for_representation_and_collection( + abstract_config: Configuration, + collection_name: str, + token_representation: str, +) -> tuple[str, TokenConfig, TokenCollectionConfig] | None: + + token_info = get_token_info_by_representation( + abstract_config=abstract_config, + token_representation=token_representation, + ) + if token_info: + token_name, token_config = token_info + if collection_name in token_config.collections: + return token_name, token_config, token_config.collections[collection_name] + + return None + + +def get_collection_config_by_name( + abstract_config: Configuration, + collection_name: str, +) -> CollectionConfig: + collection_config = abstract_config.collections.get(collection_name) + if not collection_config: + raise HTTPException( + status_code=HTTP_404_NOT_FOUND, + detail=f"No such collection: '{collection_name}'", + ) + return collection_config + + +def get_default_token_config( + abstract_config: Configuration, + collection: str, +) -> TokenConfig: + + default_token_name = get_collection_config_by_name( + abstract_config, + collection, + ).default_token + + return get_token_config_by_name(abstract_config, default_token_name) diff --git a/dump_things_service/admin.py b/dump_things_service/admin.py index 4b5c561..0d12db5 100644 --- a/dump_things_service/admin.py +++ b/dump_things_service/admin.py @@ -5,11 +5,11 @@ from fastapi import HTTPException from dump_things_service import ( HTTP_401_UNAUTHORIZED, ) -from dump_things_service.config import InstanceConfig +from dump_things_service.instance_state import InstanceState def authenticate_admin( - instance_config: InstanceConfig, + instance_state: InstanceState, api_key: str, ): print('IMPLEMENT: authenticate_admin() ', file=sys.stderr, flush=True) diff --git a/dump_things_service/auth/config.py b/dump_things_service/auth/config.py index a4ce625..37888cf 100644 --- 
a/dump_things_service/auth/config.py +++ b/dump_things_service/auth/config.py @@ -1,56 +1,42 @@ -"""Use configuration information to fetch token permissions, ids, and incomng_label """ +"""Use configuration information to fetch token permissions, ids, and incoming_label """ +from dump_things_service.abstract_config import Configuration from dump_things_service.auth import ( AuthenticationInfo, AuthenticationSource, InvalidTokenError, ) -from dump_things_service.config import ( - InstanceConfig, -) -from dump_things_service.token import ( - get_token_parts, - hash_token, -) - -missing = {} +from dump_things_service.config import get_permissions +from dump_things_service.utils import get_token_config_for_representation_and_collection class ConfigAuthenticationSource(AuthenticationSource): def __init__( - self, - instance_config: InstanceConfig, - collection: str, + self, + abstract_configuration: Configuration, + collection: str, ): - self.instance_config = instance_config + self.abstract_configuration = abstract_configuration self.collection = collection def authenticate( - self, - token: str, + self, + token_representation: str, ) -> AuthenticationInfo: - token = self._resolve_hashed_token(token) - token_info = self.instance_config.tokens_per_collection.get(self.collection, {}).get(token, missing) - if token_info is missing: + result = get_token_config_for_representation_and_collection( + self.abstract_configuration, + self.collection, + token_representation, + ) + + if not result: msg = f'Token not valid for collection `{self.collection}`' raise InvalidTokenError(msg) + _, token_config, token_collection_config = result return AuthenticationInfo( - token_permission=token_info['permissions'], - user_id=token_info['user_id'], - incoming_label=token_info['incoming_label'], + token_permission=get_permissions(token_collection_config.mode), + user_id=token_config.user_id, + incoming_label=token_collection_config.incoming_label, ) - - def _resolve_hashed_token( - self, - 
token: str - ) -> str: - - try: - token_id, _ = get_token_parts(token) - if token_id in self.instance_config.hashed_tokens[self.collection]: - return hash_token(token) - except ValueError: - pass - return token diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index 069f007..ca2e75b 100644 --- a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -14,20 +14,25 @@ from pydantic import ( ValidationError, ) + from dump_things_service import ( HTTP_400_BAD_REQUEST, HTTP_403_FORBIDDEN, HTTP_422_UNPROCESSABLE_CONTENT, ) -from dump_things_service.abstract_config import ( +from .abstract_config import ( CollectionConfig, RecordDirBackendConfig, SQLiteBackendConfig, + read_config, + check_collection, + get_default_token_name, ) from dump_things_service.backends.sqlite import record_file_name as sqlite_db_filename -from dump_things_service.config import ( - InstanceConfig, - get_config, +from dump_things_service.instance_state import ( + InstanceState, + get_collection_dir_config, + get_instance_state, ) from dump_things_service.converter import FormatConverter from dump_things_service.exceptions import ( @@ -37,16 +42,14 @@ from dump_things_service.exceptions import ( ) from dump_things_service.model import get_model_for_schema from dump_things_service.utils import ( - check_collection, combine_ttl, - get_default_token_name, get_token_store, join_default_token_permissions, wrap_http_exception, ) -# This following two lines are required for dynamic endpoint generation +# This following lines are required for dynamic endpoint generation from typing import Annotated from fastapi import Body, Depends from dump_things_service import Format @@ -111,7 +114,7 @@ async def {name}( def create_collection( - instance_config: InstanceConfig, + instance_state: InstanceState, collection_configuration: CollectionConfig, ): """Create a collection as specified by `collection_configuration` @@ -119,16 +122,16 @@ def create_collection( 
Reuse existing disk structures, if they are compatible. If they are not compatible, raise an error. - :param instance_config: + :param instance_state: :param collection_configuration: :return: """ - curated_path = Path(instance_config.store_path / collection_configuration.curated) + curated_path = Path(instance_state.store_path / collection_configuration.curated) incoming_path = ( None if collection_configuration.incoming is None - else Path(instance_config.store_path / collection_configuration.incoming) + else Path(instance_state.store_path / collection_configuration.incoming) ) # Check for compatibility of all existing stores before creating any @@ -148,7 +151,7 @@ def create_collection( ) for audit_backend in collection_configuration.audit_backends: - audit_path = Path(instance_config.store_path / audit_backend.path) + audit_path = Path(instance_state.store_path / audit_backend.path) if audit_path.exists(): check_audit_compatibility(audit_path) @@ -160,13 +163,23 @@ def create_collection( if not curated_path.exists(): curated_path.mkdir(parents=True) created_directories.append(curated_path) + initialize_backend( + curated_path, + collection_configuration.backend, + collection_configuration.schema, + ) if incoming_path and not incoming_path.exists(): incoming_path.mkdir(parents=True) created_directories.append(incoming_path) + initialize_backend( + incoming_path, + collection_configuration.backend, + collection_configuration.schema, + ) for audit_backend in collection_configuration.audit_backends: - audit_path = Path(instance_config.store_path / audit_backend.path) + audit_path = Path(instance_state.store_path / audit_backend.path) if not audit_path.exists(): audit_path.mkdir(parents=True) created_directories.append(audit_path) @@ -183,7 +196,6 @@ def create_collection( collection_configuration.backend, collection_configuration.schema, ) - created_directories.append(curated_path) if incoming_path: create_backend( @@ -191,11 +203,10 @@ def create_collection( 
collection_configuration.backend, collection_configuration.schema, ) - created_directories.append(incoming_path) # Create the audit log for audit_backend in collection_configuration.audit_backends: - audit_path = Path(instance_config.store_path / audit_backend.path) + audit_path = Path(instance_state.store_path / audit_backend.path) if not audit_path.exists(): create_audit_store(audit_path) created_directories.append(audit_path) @@ -204,11 +215,11 @@ def create_collection( # inbox-storing, and for curated area storing. create_endpoints_for_collection( collection_configuration, - instance_config.fastapi_app, + instance_state.fastapi_app, ) # Create the collection configuration element - instance_config.xxx_collections[collection_configuration.name] = collection_configuration + instance_state.xxx_collections[collection_configuration.name] = collection_configuration def create_backend( @@ -218,6 +229,15 @@ def create_backend( ): assert backend_config.type == 'record_dir+stl' + +# TODO: should this be in instance_state? 
+def initialize_backend( + incoming_path: Path, + backend_config: RecordDirBackendConfig | SQLiteBackendConfig, + schema: str, +): + assert backend_config.type == 'record_dir+stl' + print(f'Incoming path: {incoming_path}') print(f'backend spec: {backend_config}') print(f'schema: {schema}') @@ -238,10 +258,10 @@ def create_audit_store(*args, **kwargs): def check_store_compatibility( store_path: Path, - backend_config: RecordDirBackendConfig|SQLiteBackendConfig, + backend_config: RecordDirBackendConfig | SQLiteBackendConfig, schema: str, ): - """Check if an existing store is compatible with the specs + """Check if an existing store is compatible with the specs in `backend_config` :param store_path: :param backend_config: @@ -266,9 +286,7 @@ def check_record_dir_compatibility( backend_config: RecordDirBackendConfig, schema: str, ): - from dump_things_service.config import Config - - record_dir_config = Config.get_collection_dir_config(store_path) + record_dir_config = get_collection_dir_config(store_path) if record_dir_config.schema != schema: raise ConfigCollisionError(f"Existing backend uses a different schema: '{record_dir_config.schema}'") @@ -303,13 +321,6 @@ def check_audit_compatibility( print('IMPLEMENT: check_audit_compatibility', file=sys.stderr, flush=True) - -# store_record -# validate_record -# store_curated_record -# store_incoming_record - - def create_endpoint( operation_name: str, operation_path: str, @@ -409,11 +420,12 @@ def store_record( status_code=HTTP_400_BAD_REQUEST, detail='Invalid ttl data provided.' 
) - instance_config = get_config() - check_collection(instance_config, collection) + instance_state = get_instance_state() + abstract_config = read_config(instance_state.store_path) + check_collection(abstract_config, collection) token = ( - get_default_token_name(instance_config, collection) + get_default_token_name(instance_state, collection) if api_key is None else api_key ) @@ -424,12 +436,12 @@ def store_record( # store the plaintext token, so all token-information is associated with # the hashed representation of the token. store, token, token_permissions, user_id = get_token_store( - instance_config, + instance_state, collection, token, ) final_permissions = join_default_token_permissions( - instance_config, token_permissions, collection + instance_state, token_permissions, collection ) if not final_permissions.incoming_write: raise HTTPException( @@ -440,7 +452,7 @@ def store_record( if input_format == Format.ttl: with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Conversion error'): json_object = FormatConverter( - instance_config.schemas[collection], + instance_state.schemas[collection], input_format=Format.ttl, output_format=Format.json, ).convert(data, class_name) @@ -450,14 +462,14 @@ def store_record( record = data with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): - instance_config.validators[collection].validate(record) + instance_state.validators[collection].validate(record) with wrap_http_exception(CurieResolutionError): stored_records = store.store_object(obj=record, submitter=user_id) if input_format == Format.ttl: format_converter = FormatConverter( - instance_config.schemas[collection], + instance_state.schemas[collection], input_format=Format.json, output_format=Format.ttl, ) diff --git a/dump_things_service/collection_endpoints.py b/dump_things_service/collection_endpoints.py index 56dd3a0..b2d56e9 100644 --- 
a/dump_things_service/collection_endpoints.py +++ b/dump_things_service/collection_endpoints.py @@ -17,24 +17,29 @@ from fastapi import ( HTTPException, Response, ) -from pydantic import BaseModel +from pydantic import ( + BaseModel, + ConfigDict, +) from dump_things_service import ( HTTP_201_CREATED, - HTTP_401_UNAUTHORIZED, HTTP_409_CONFLICT, ) -from dump_things_service.abstract_config import read_config, store_config, Configuration +from dump_things_service.abstract_config import ( + read_config, + store_config, + CollectionConfig, + Configuration, + check_collection, + get_default_token_name, +) from dump_things_service.admin import authenticate_admin from dump_things_service.api_key import api_key_header_scheme from dump_things_service.manifest import manifest_configuration -from dump_things_service.config import ( - Config, - InstanceConfig, - get_config, -) +#from dump_things_service.config import get_config from dump_things_service.exceptions import ConfigError -from dump_things_service.utils import check_collection, wrap_http_exception +from dump_things_service.utils import wrap_http_exception logger = logging.getLogger('dump_things_service') router = APIRouter() @@ -58,32 +63,6 @@ class TagSpec(BaseModel): submission_time_tag: str = 'http://semanticscience.org/resource/SIO_001083' -class CreateCollectionRequest(BaseModel): - name: str - default_token: str - schema: str - curated: PurePosixPath - incoming: PurePosixPath | None = None - backend: str = 'record_dir+stl' - auth_sources: list[ForgejoAuthSpec | ConfigAuthSpec] = [ConfigAuthSpec()] - submission_tags: TagSpec = TagSpec() - use_classes: list[str] | None = None - ignore_classes: list[str] | None = None - - -class CollectionResponse(BaseModel): - name: str - default_token: str - schema: str - curated: PurePosixPath - incoming: PurePosixPath | None = None - backend: str = 'record_dir+stl' - auth_sources: list[ForgejoAuthSpec | ConfigAuthSpec] = [ConfigAuthSpec()] - submission_tags: TagSpec = TagSpec() 
- use_classes: list[str] | None = None - ignore_classes: list[str] | None = None - - @router.post( '/collections', tags=['Administration interface'], @@ -92,7 +71,7 @@ class CollectionResponse(BaseModel): ) async def create_collection( response: Response, - body: CreateCollectionRequest, + body: CollectionConfig, api_key: str = Depends(api_key_header_scheme), ): @@ -140,15 +119,17 @@ async def create_collection( tags=['Administration interface'], name='Get existing collections', ) -async def get_tokens( +async def get_collections( api_key: str = Depends(api_key_header_scheme), -) -> list[CollectionResponse]: +) -> list[CollectionConfig]: instance_config = get_config() # Check admin rights authenticate_admin(instance_config, api_key) - return list(instance_config.xxx_tokens.values()) + + abstract_config = read_config(store_path=instance_config.store_path) + return list(abstract_config.collections.values()) x = """ diff --git a/dump_things_service/commands/check_pids.py b/dump_things_service/commands/check_pids.py index 9dcad65..fccf65c 100644 --- a/dump_things_service/commands/check_pids.py +++ b/dump_things_service/commands/check_pids.py @@ -6,6 +6,7 @@ from collections.abc import Iterable from pathlib import Path from dump_things_service import config_file_name +from dump_things_service.abstract_config import read_config from dump_things_service.backends.schema_type_layer import _SchemaTypeLayer from dump_things_service.backends.sqlite import _SQLiteBackend from dump_things_service.config import get_config, process_config @@ -65,6 +66,7 @@ def check_pids_in_stores( def check_pids(): instance_config = get_config() + abstract_config = read_config(instance_config.store_path) result = 0 @@ -78,7 +80,11 @@ def check_pids(): for collection, collection_info in instance_config.collections.items(): configured_labels = get_config_labels(instance_config, collection) - on_disk_labels = get_on_disk_labels(instance_config, collection) + on_disk_labels = get_on_disk_labels( + 
store_path=instance_config.store_path, + abstract_config=abstract_config, + collection=collection, + ) all_labels = configured_labels.union(on_disk_labels) token_stores = [ diff --git a/dump_things_service/config.py b/dump_things_service/config.py index 778738d..6a65ed1 100644 --- a/dump_things_service/config.py +++ b/dump_things_service/config.py @@ -51,7 +51,7 @@ from dump_things_service.token import ( get_token_parts, hash_token, ) -from dump_things_service.utils import check_collection +from dump_things_service.abstract_config import check_collection if TYPE_CHECKING: import types @@ -168,7 +168,7 @@ class GlobalConfig(StrictModel): @dataclasses.dataclass -class InstanceConfig: +class XXXInstanceConfig: store_path: Path fastapi_app: FastAPI order_by: list[str] = dataclasses.field(default_factory=list) @@ -336,18 +336,18 @@ def new_process_config( fastapi_app: FastAPI, order_by: list[str], globals_dict: dict[str, Any], -) -> InstanceConfig: +) -> XXXInstanceConfig: global global_config_instance from dump_things_service.abstract_config import read_config from dump_things_service.manifest import manifest_configuration - abstract_configuration = read_config(store_path) - global_config_instance = InstanceConfig( + global_config_instance = XXXInstanceConfig( store_path=store_path, fastapi_app=fastapi_app, order_by=order_by, ) + abstract_configuration = read_config(global_config_instance.store_path) manifest_configuration(abstract_configuration, global_config_instance) return global_config_instance @@ -361,7 +361,7 @@ def process_config( config_file: Path, order_by: list[str], globals_dict: dict[str, Any], -) -> InstanceConfig: +) -> XXXInstanceConfig: global global_config_instance config_object = Config.get_config_from_file(config_file) @@ -383,7 +383,7 @@ def process_config_object( from dump_things_service.auth.config import ConfigAuthenticationSource from dump_things_service.auth.forgejo import ForgejoAuthenticationSource - instance_config = 
InstanceConfig(store_path=store_path) + instance_config = XXXInstanceConfig(store_path=store_path) instance_config.collections = config_object.collections for collection_name, collection_info in config_object.collections.items(): @@ -670,7 +670,7 @@ def get_backend_and_extension(backend_type: str) -> tuple[str, str]: def get_zone( - instance_config: InstanceConfig, + instance_config: XXXInstanceConfig, collection: str, token: str, ) -> str | None: @@ -689,7 +689,7 @@ def get_zone( def get_conversion_objects_for_collection( - instance_config: InstanceConfig, + instance_config: XXXInstanceConfig, collection_name: str, ) -> dict: """Get the conversion objects for the given collection.""" @@ -698,7 +698,7 @@ def get_conversion_objects_for_collection( def get_model_info_for_collection( - instance_config: InstanceConfig, + instance_config: XXXInstanceConfig, collection_name: str, ) -> tuple[types.ModuleType, dict[str, Any], str]: check_collection(instance_config, collection_name) diff --git a/dump_things_service/curated.py b/dump_things_service/curated.py index 7366343..6e575c6 100644 --- a/dump_things_service/curated.py +++ b/dump_things_service/curated.py @@ -21,15 +21,15 @@ from dump_things_service import ( HTTP_404_NOT_FOUND, HTTP_422_UNPROCESSABLE_CONTENT, ) +from dump_things_service.abstract_config import check_collection from dump_things_service.api_key import api_key_header_scheme from dump_things_service.backends.schema_type_layer import _SchemaTypeLayer -from dump_things_service.config import get_config +#from dump_things_service.config import get_config from dump_things_service.exceptions import CurieResolutionError from dump_things_service.lazy_list import ModifierList from dump_things_service.utils import ( authenticate_token, check_bounds, - check_collection, cleaned_json, wrap_http_exception, ) diff --git a/dump_things_service/dynamic_endpoints.py b/dump_things_service/dynamic_endpoints.py index 913160a..dda8262 100644 --- 
a/dump_things_service/dynamic_endpoints.py +++ b/dump_things_service/dynamic_endpoints.py @@ -3,7 +3,10 @@ from itertools import count from fastapi import FastAPI -from dump_things_service.config import InstanceConfig +from dump_things_service import config +from dump_things_service.abstract_config import Configuration +from dump_things_service.instance_state import InstanceState + logger = logging.getLogger('dump_things_service') @@ -21,7 +24,7 @@ async def {name}( def create_store_endpoints( app: FastAPI, - instance_config: InstanceConfig, + configuration: Configuration, tag_info: list[dict[str, str]], placeholder: str, global_dict: dict, @@ -32,16 +35,14 @@ def create_store_endpoints( generated_tags = [] - for collection, ( - model, - classes, - model_var_name, - ) in instance_config.model_info.items(): + for collection_name, collection in configuration.collections.items(): tag_name = f'Write records to collection "{collection}"' + model, model_var_name = None, 'model_' + collection_name + global_dict[model_var_name] = model - for class_name in instance_config.use_classes[collection]: + for class_name in instance_state.use_classes[collection]: # Create an endpoint to dump data of type `class_name` in version # `version` of schema `application`. 
@@ -80,7 +81,7 @@ def create_store_endpoints( def create_validate_endpoints( app: FastAPI, - instance_config: InstanceConfig, + configuration: Configuration, tag_info: list[dict[str, str]], placeholder: str, global_dict: dict, @@ -91,16 +92,14 @@ def create_validate_endpoints( generated_tags = [] - for collection, ( - model, - classes, - model_var_name, - ) in instance_config.model_info.items(): + for collection_name, collection in configuration.collections.items(): tag_name = f'Validate records for collection "{collection}"' + model, model_var_name = None, 'model_' + collection_name + global_dict[model_var_name] = model - for class_name in instance_config.use_classes[collection]: + for class_name in instance_state.use_classes[collection]: # Create an endpoint to dump data of type `class_name` in version # `version` of schema `application`. @@ -117,7 +116,7 @@ def create_validate_endpoints( exec(endpoint_source, global_dict) # noqa S102 # Create an API route for the endpoint - app.add_api_route( + instance_state.fastapi_app.add_api_route( path=f'/{collection}/validate/record/{class_name}', endpoint=global_dict[endpoint_name], methods=['POST'], diff --git a/dump_things_service/incoming.py b/dump_things_service/incoming.py index ce123fc..1e91c22 100644 --- a/dump_things_service/incoming.py +++ b/dump_things_service/incoming.py @@ -21,19 +21,21 @@ from dump_things_service import ( HTTP_404_NOT_FOUND, HTTP_422_UNPROCESSABLE_CONTENT, ) +from dump_things_service.abstract_config import ( + check_collection, + check_label, + get_config_labels, +) from dump_things_service.api_key import api_key_header_scheme from dump_things_service.backends.schema_type_layer import _SchemaTypeLayer -from dump_things_service.config import get_config +#from dump_things_service.config import get_config from dump_things_service.exceptions import CurieResolutionError from dump_things_service.lazy_list import ModifierList from dump_things_service.utils import ( authenticate_token, check_bounds, 
- check_collection, - check_label, cleaned_json, create_token_store, - get_config_labels, get_on_disk_labels, wrap_http_exception, ) diff --git a/dump_things_service/instance_state.py b/dump_things_service/instance_state.py new file mode 100644 index 0000000..1df8811 --- /dev/null +++ b/dump_things_service/instance_state.py @@ -0,0 +1,769 @@ +from __future__ import annotations + +import dataclasses +import enum +import hashlib +import logging +from functools import partial +from pathlib import Path +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + cast, +) + +import yaml +from fastapi import ( + FastAPI, + HTTPException, +) +from pydantic import ( + BaseModel, + ConfigDict, + Field, + ValidationError, +) +from yaml.scanner import ScannerError + +from dump_things_service import ( + HTTP_404_NOT_FOUND, + Format, +) +from dump_things_service.abstract_config import ( + Configuration, + check_collection, + MappingMethod, +) +from dump_things_service.audit.gitaudit import GitAuditBackend +from dump_things_service.backends.record_dir import RecordDirStore +from dump_things_service.backends.schema_type_layer import SchemaTypeLayer +from dump_things_service.backends.sqlite import SQLiteBackend +from dump_things_service.backends.sqlite import ( + record_file_name as sqlite_record_file_name, +) + +from dump_things_service.converter import FormatConverter, get_conversion_objects +from dump_things_service.exceptions import ( + ConfigError, + CurieResolutionError, +) +from dump_things_service.model import get_model_for_schema +from dump_things_service.resolve_curie import resolve_curie +from dump_things_service.store.model_store import ModelStore +from dump_things_service.token import ( + TokenPermission, + get_token_parts, + hash_token, +) + +if TYPE_CHECKING: + import types + +logger = logging.getLogger('dump_things_service') + +config_file_name = '.dumpthings.yaml' +ignored_files = {'.', '..', config_file_name} + + +class CollectionDirConfig(BaseModel): + 
model_config = ConfigDict(extra='forbid') + type: Literal['records'] + version: Literal[1] + schema: str + format: Literal['yaml'] + idfx: MappingMethod + + +@dataclasses.dataclass +class InstanceState: + # foundational information from command line or initialisation code + store_path: Path + bootstrap_token: str | None + + # Dynamically created elements + fastapi_app: FastAPI + + # Influenced by maintainer interface + maintenance_mode: set = dataclasses.field(default_factory=set) + + # Created based on abstract configuration + collectiona: dict = dataclasses.field(default_factory=dict) + tokens: dict = dataclasses.field(default_factory=dict) + auth_sources: dict = dataclasses.field(default_factory=dict) + audit_backends: dict = dataclasses.field(default_factory=dict) + + model_info: dict = dataclasses.field(default_factory=dict) + schemas: dict = dataclasses.field(default_factory=dict) + conversion_objects: dict = dataclasses.field(default_factory=dict) + + order_by: list[str] = dataclasses.field(default_factory=list) + collections: dict = dataclasses.field(default_factory=dict) + all_stores: dict = dataclasses.field(default_factory=dict) + curated_stores: dict = dataclasses.field(default_factory=dict) + incoming: dict = dataclasses.field(default_factory=dict) + zones: dict = dataclasses.field(default_factory=dict) + permissions: dict = dataclasses.field(default_factory=dict) + #model_info: dict = dataclasses.field(default_factory=dict) + token_stores: dict = dataclasses.field(default_factory=dict) + #schemas: dict = dataclasses.field(default_factory=dict) + #conversion_objects: dict = dataclasses.field(default_factory=dict) + backend: dict = dataclasses.field(default_factory=dict) + #tokens: dict = dataclasses.field(default_factory=dict) + hashed_tokens: dict = dataclasses.field(default_factory=dict) + validators: dict = dataclasses.field(default_factory=dict) + use_classes: dict = dataclasses.field(default_factory=dict) + #maintenance_mode: set = 
dataclasses.field(default_factory=set) + #audit_backends: dict = dataclasses.field(default_factory=dict) + xxx_tokens: dict = dataclasses.field(default_factory=dict) + xxx_collections: dict = dataclasses.field(default_factory=dict) + + +g_instance_state:InstanceState | None = None + + +def create_instance_state( + store_path: Path, + bootstrap_token: str, + fastapi_app: FastAPI, +) -> InstanceState: + global g_instance_state + + if g_instance_state: + logger.warning('create_instance_state() already called') + else: + g_instance_state = InstanceState( + store_path=store_path, + bootstrap_token=bootstrap_token, + fastapi_app=fastapi_app, + ) + return cast(InstanceState, g_instance_state) + + +def get_instance_state() -> InstanceState | None: + global g_instance_state + + if not g_instance_state: + msg = 'get_instance_state() called before create_instance_state()' + raise RuntimeError(msg) + return g_instance_state + + +def get_collection_dir_config( + path: Path, + file_name: str = config_file_name, +) -> CollectionDirConfig: + config_path = path / file_name + if not config_path.exists(): + msg = f'Config file does not exist: {config_path}' + raise ConfigError(msg) + try: + return CollectionDirConfig( + **yaml.load(config_path.read_text(), Loader=yaml.SafeLoader) + ) + except ScannerError as e: + msg = f'YAML-error while reading config file {config_path}: {e}' + raise ConfigError(msg) from e + except ValidationError as e: + msg = f'Pydantic-error reading config file {config_path}: {e}' + raise ConfigError(msg) from e + + +x = """ +class StrictModel(BaseModel): + model_config = ConfigDict(extra='forbid') + + +class MappingMethod(enum.Enum): + digest_md5 = 'digest-md5' + digest_md5_p3 = 'digest-md5-p3' + digest_md5_p3_p3 = 'digest-md5-p3-p3' + digest_sha1 = 'digest-sha1' + digest_sha1_p3 = 'digest-sha1-p3' + digest_sha1_p3_p3 = 'digest-sha1-p3-p3' + after_last_colon = 'after-last-colon' + + +class CollectionDirConfig(StrictModel): + type: Literal['records'] + version: 
Literal[1] + schema: str + format: Literal['yaml'] + idfx: MappingMethod + + +class TokenModes(enum.Enum): + READ_CURATED = 'READ_CURATED' + READ_COLLECTION = 'READ_COLLECTION' + WRITE_COLLECTION = 'WRITE_COLLECTION' + READ_SUBMISSIONS = 'READ_SUBMISSIONS' + WRITE_SUBMISSIONS = 'WRITE_SUBMISSIONS' + SUBMIT = 'SUBMIT' + SUBMIT_ONLY = 'SUBMIT_ONLY' + NOTHING = 'NOTHING' + CURATOR = 'CURATOR' + ADMIN = 'ADMIN' + + +class TokenCollectionConfig(BaseModel): + model_config = ConfigDict(extra='forbid', use_enum_values=True) + mode: TokenModes + incoming_label: str = Field(strict=True) + + +class TokenConfig(StrictModel): + user_id: str + collections: dict[str, TokenCollectionConfig] + hashed: bool = False + + +class BackendConfigRecordDir(StrictModel): + type: Literal['record_dir', 'record_dir+stl'] + + +class BackendConfigSQLite(StrictModel): + type: Literal['sqlite', 'sqlite+stl'] + schema: str + + +class ForgejoAuthConfig(StrictModel): + type: Literal['forgejo'] + url: str + organization: str + team: str + label_type: Literal['team', 'user'] + instance_id: str | None = None + repository: str | None = None + + +class ConfigAuthConfig(StrictModel): + type: Literal['config'] = 'config' + + +class GitAuditBackendConfig(StrictModel): + type: Literal['gitaudit'] + path: Path + auto_flush_timeout: int = 60 + + +class TagConfig(StrictModel): + submitter_id_tag: str = 'http://purl.obolibrary.org/obo/NCIT_C54269' + submission_time_tag: str = 'http://semanticscience.org/resource/SIO_001083' + + +class CollectionConfig(StrictModel): + default_token: str + curated: Path + incoming: Path | None = None + backend: BackendConfigRecordDir | BackendConfigSQLite | None = None + auth_sources: list[ForgejoAuthConfig | ConfigAuthConfig] = [ConfigAuthConfig()] + submission_tags: TagConfig = TagConfig() + use_classes: list[str] = dataclasses.field(default_factory=list) + ignore_classes: list[str] = dataclasses.field(default_factory=list) + audit_backends: list[GitAuditBackendConfig] = 
dataclasses.field(default_factory=list) + + +class GlobalConfig(StrictModel): + model_config = ConfigDict(strict=True) + + type: Literal['collections'] + version: Literal[1] + collections: dict[str, CollectionConfig] + tokens: dict[str, TokenConfig] + + +mode_mapping = { + TokenModes.READ_CURATED: TokenPermission(curated_read=True), + TokenModes.READ_COLLECTION: TokenPermission( + curated_read=True, + incoming_read=True, + ), + TokenModes.WRITE_COLLECTION: TokenPermission( + curated_read=True, + incoming_read=True, + incoming_write=True, + ), + TokenModes.READ_SUBMISSIONS: TokenPermission(incoming_read=True), + TokenModes.WRITE_SUBMISSIONS: TokenPermission( + incoming_read=True, + incoming_write=True, + ), + TokenModes.SUBMIT: TokenPermission(curated_read=True, incoming_write=True), + TokenModes.SUBMIT_ONLY: TokenPermission(incoming_write=True), + TokenModes.NOTHING: TokenPermission(), + TokenModes.CURATOR: TokenPermission( + curated_read=True, + incoming_read=True, + incoming_write=True, + curated_write=True, + zones_access=True, + ), + TokenModes.ADMIN: TokenPermission( + curated_read=True, + incoming_read=True, + incoming_write=True, + curated_write=True, + zones_access=True, + admin=True, + ) +} + + +def get_hex_digest(hasher: Callable, data: str) -> str: + hash_context = hasher(data.encode()) + return hash_context.hexdigest() + + +def mapping_digest_p3( + hasher: Callable, + pid: str, + suffix: str, +) -> Path: + hex_digest = get_hex_digest(hasher, pid) + return Path(hex_digest[:3]) / (hex_digest[3:] + '.' + suffix) + + +def mapping_digest_p3_p3( + hasher: Callable, + pid: str, + suffix: str, +) -> Path: + hex_digest = get_hex_digest(hasher, pid) + return Path(hex_digest[:3]) / hex_digest[3:6] / (hex_digest[6:] + '.' + suffix) + + +def mapping_digest(hasher: Callable, pid: str, suffix: str) -> Path: + hex_digest = get_hex_digest(hasher, pid) + return Path(hex_digest + '.' 
+ suffix) + + +def mapping_after_last_colon(pid: str, suffix: str) -> Path: + plain_result = pid.split(':')[-1] + # Escape any colons and slashes in the pid + escaped_result = ( + plain_result.replace('_', '__').replace('/', '_s').replace('.', '_d') + ) + return Path(escaped_result + '.' + suffix) + + +mapping_functions = { + MappingMethod.digest_md5: partial(mapping_digest, hashlib.md5), + MappingMethod.digest_md5_p3: partial(mapping_digest_p3, hashlib.md5), + MappingMethod.digest_md5_p3_p3: partial(mapping_digest_p3_p3, hashlib.md5), + MappingMethod.digest_sha1: partial(mapping_digest, hashlib.sha1), + MappingMethod.digest_sha1_p3: partial(mapping_digest_p3, hashlib.sha1), + MappingMethod.digest_sha1_p3_p3: partial(mapping_digest_p3_p3, hashlib.sha1), + MappingMethod.after_last_colon: mapping_after_last_colon, +} + + +def get_mapping_function_by_name(mapping_function_name: str) -> Callable: + return mapping_functions[MappingMethod(mapping_function_name)] + + +def get_mapping_function(collection_config: CollectionDirConfig): + return mapping_functions[collection_config.idfx] + + +def get_permissions(mode: TokenModes) -> TokenPermission: + return mode_mapping[mode] + + +class Config: + @staticmethod + def get_config_from_file(path: Path) -> GlobalConfig: + try: + return GlobalConfig(**yaml.load(path.read_text(), Loader=yaml.SafeLoader)) + except ScannerError as e: + msg = f'YAML-error while reading config file {path}: {e}' + raise ConfigError(msg) from e + except TypeError: + msg = f'Error in yaml file {path}: content is not a mapping' + raise ConfigError(msg) from None + except ValidationError as e: + msg = f'Pydantic-error reading config file {path}: {e}' + raise ConfigError(msg) from e + + @staticmethod + def get_config(path: Path, file_name=config_file_name) -> GlobalConfig: + return Config.get_config_from_file(path / file_name) + + @staticmethod + def get_collection_dir_config( + path: Path, + file_name: str = config_file_name, + ) -> CollectionDirConfig: + 
config_path = path / file_name + if not config_path.exists(): + msg = f'Config file does not exist: {config_path}' + raise ConfigError(msg) + try: + return CollectionDirConfig( + **yaml.load(config_path.read_text(), Loader=yaml.SafeLoader) + ) + except ScannerError as e: + msg = f'YAML-error while reading config file {config_path}: {e}' + raise ConfigError(msg) from e + except ValidationError as e: + msg = f'Pydantic-error reading config file {config_path}: {e}' + raise ConfigError(msg) from e + + +def new_process_config( + store_path: Path, + fastapi_app: FastAPI, + order_by: list[str], + globals_dict: dict[str, Any], +) -> InstanceConfig: + global global_config_instance + + from dump_things_service.abstract_config import read_config + from dump_things_service.manifest import manifest_configuration + + global_config_instance = InstanceConfig( + store_path=store_path, + fastapi_app=fastapi_app, + order_by=order_by, + ) + abstract_configuration = read_config(global_config_instance.store_path) + manifest_configuration(abstract_configuration, global_config_instance) + return global_config_instance + + +def get_config(): + return global_config_instance + + +def process_config( + store_path: Path, + config_file: Path, + order_by: list[str], + globals_dict: dict[str, Any], +) -> InstanceConfig: + global global_config_instance + + config_object = Config.get_config_from_file(config_file) + global_config_instance = process_config_object( + store_path=store_path, + config_object=config_object, + order_by=order_by, + globals_dict=globals_dict, + ) + return global_config_instance + + +def process_config_object( + store_path: Path, + config_object: GlobalConfig, + order_by: list[str], + globals_dict: dict[str, Any], +): + from dump_things_service.auth.config import ConfigAuthenticationSource + from dump_things_service.auth.forgejo import ForgejoAuthenticationSource + + instance_config = InstanceConfig(store_path=store_path) + instance_config.collections = 
config_object.collections + + for collection_name, collection_info in config_object.collections.items(): + # Create the authentication providers + instance_config.auth_providers[collection_name] = [] + + auth_provider_list = [] + # Check for multiple providers + for auth_provider in collection_info.auth_sources: + if auth_provider.type == 'config': + key = ('config',) + elif auth_provider.type == 'forgejo': + key = ( + 'forgejo', + auth_provider.url, + auth_provider.organization, + auth_provider.team, + auth_provider.label_type, + auth_provider.repository, + ) + else: + msg = f'Unknown authentication provider type: {auth_provider.type}' + raise ConfigError(msg) + if key in auth_provider_list: + logger.warning('Ignoring duplicated authentication provider: %s', key) + continue + auth_provider_list.append(key) + + for auth_provider in auth_provider_list: + if auth_provider[0] == 'config': + instance_config.auth_providers[collection_name].append( + ConfigAuthenticationSource( + instance_config=instance_config, + collection=collection_name, + ) + ) + else: + instance_config.auth_providers[collection_name].append( + ForgejoAuthenticationSource(*auth_provider[1:]) + ) + + # Set the default backend if not specified + backend = collection_info.backend or BackendConfigRecordDir( + type='record_dir+stl' + ) + + instance_config.backend[collection_name] = backend + backend_name, extension = get_backend_and_extension(backend.type) + if backend_name == 'record_dir': + # Get the config from the curated directory + collection_config = Config.get_collection_dir_config( + store_path / collection_info.curated + ) + schema = collection_config.schema + elif backend.type == 'sqlite': + schema = backend.schema + else: + msg = f'Unsupported backend `{collection_info.backend}` for collection `{collection_name}`.' 
+ raise ConfigError(msg) + + # Generate the collection model + model, classes, model_var_name = get_model_for_schema(schema) + instance_config.model_info[collection_name] = model, classes, model_var_name + globals_dict[model_var_name] = model + + # Generate the curated stores + if backend_name == 'record_dir': + curated_store_backend = RecordDirStore( + root=store_path / collection_info.curated, + pid_mapping_function=get_mapping_function(collection_config), + suffix=collection_config.format, + order_by=order_by, + ) + curated_store_backend.build_index_if_needed(schema=schema) + elif backend.type == 'sqlite': + curated_store_backend = SQLiteBackend( + db_path=store_path / collection_info.curated / sqlite_record_file_name, + ) + else: + msg = f'Unsupported backend `{collection_info.backend}` for collection `{collection_name}`.' + raise ConfigError(msg) + + if extension == 'stl': + curated_store_backend = SchemaTypeLayer( + backend=curated_store_backend, + schema=schema, + ) + + curated_store = ModelStore( + schema=schema, + backend=curated_store_backend, + tags={ + 'id': collection_info.submission_tags.submitter_id_tag, + 'time': collection_info.submission_tags.submission_time_tag, + } + ) + + instance_config.curated_stores[collection_name] = curated_store + + if collection_info.incoming: + instance_config.incoming[collection_name] = collection_info.incoming + + instance_config.schemas[collection_name] = schema + if schema not in instance_config.conversion_objects: + instance_config.conversion_objects[schema] = get_conversion_objects(schema) + + # We do not create stores for tokens here, but leave it to the token + # authentication routine. 
+ instance_config.token_stores[collection_name] = {} + + # Generate audit backends + instance_config.audit_backends[collection_name] = [] + for audit_backend in collection_info.audit_backends: + instance_config.audit_backends[collection_name].append( + GitAuditBackend(audit_backend.path, audit_backend.auto_flush_timeout) + ) + + # Create validator for each collection + for collection_name, _ in config_object.collections.items(): + instance_config.validators[collection_name] = FormatConverter( + schema=instance_config.schemas[collection_name], + input_format=Format.json, + output_format=Format.ttl, + ) + + # Resolve classes-blacklist and -whitelist + for collection_name, collection_info in config_object.collections.items(): + + model_info = instance_config.model_info[collection_name] + + # If the whitelist is present, get all whitelisted classes + if collection_info.use_classes: + # Check that the whitelisted classes exist + undefined = [ + name + for name in collection_info.use_classes + if name not in model_info[1] + ] + if undefined: + msg = ( + 'used class(es): ' + + ', '.join(undefined) + + ' not defined in schema: ' + + model_info[0].linkml_meta.root['id'] + ) + raise ConfigError(msg) + use_classes = collection_info.use_classes + else: + use_classes = model_info[1] + + # Check for blacklisted classes + undefined = [ + name + for name in collection_info.ignore_classes + if name not in use_classes + ] + if undefined: + msg = ( + 'ignored class(es): ' + + ', '.join(undefined) + + ' not defined in schema or in `used_classes`: ' + + model_info[0].linkml_meta.root['id'] + ) + raise ConfigError(msg) + + instance_config.use_classes[collection_name] = [ + name + for name in use_classes + if name not in collection_info.ignore_classes + ] + + # Read info for tokens from the configuration + for token_name, token_info in config_object.tokens.items(): + for collection_name, token_collection_info in token_info.collections.items(): + + if collection_name not in 
instance_config.hashed_tokens: + instance_config.hashed_tokens[collection_name] = {} + + if token_info.hashed: + token_id, _ = get_token_parts(token_name) + if token_id == '': + msg = 'empty ID in hashed token' + raise ConfigError(msg) + if token_id in instance_config.hashed_tokens[collection_name]: + msg = f'duplicated ID in hashed token: {token_id}' + raise ConfigError(msg) + instance_config.hashed_tokens[collection_name][token_id] = token_name + + if collection_name not in instance_config.tokens: + instance_config.tokens[collection_name] = {} + + permissions = get_permissions(token_collection_info.mode) + instance_config.tokens[collection_name][token_name] = { + 'permissions': permissions, + 'user_id': token_info.user_id, + 'incoming_label': token_collection_info.incoming_label, + } + + # There is only a token store if the token has incoming read- or + # incoming write-permissions. If a token store exists, we ensure + # that an incoming path is set and an incoming label exists. + if permissions.incoming_read or permissions.incoming_write: + # Check that the incoming label is set for a token that has + # access rights to incoming records. 
+ if not token_collection_info.incoming_label: + msg = f'Token `{token_name}` with mode {token_collection_info.mode} must not have an empty `incoming_label`' + raise ConfigError(msg) + + if any(c in token_collection_info.incoming_label for c in ('\\', '/')): + msg = ( + f'Incoming label for token `...` on collection ' + f'`{collection_name}` must not contain slashes or ' + f'backslashes: `{token_collection_info.incoming_label}`' + ) + raise ConfigError(msg) + + if collection_name not in instance_config.incoming: + msg = ( + 'Incoming location not defined for collection ' + f'`{collection_name}`, which has at least one token ' + f'with write access' + ) + raise ConfigError(msg) + + # Create all incoming zones + incoming_location = ( + store_path + / instance_config.collections[collection_name].incoming + / token_collection_info.incoming_label + ) + incoming_location.mkdir(parents=True, exist_ok=True) + + # Check that default tokens are defined + for collection_name, collection_info in config_object.collections.items(): + if collection_info.default_token not in instance_config.tokens[collection_name]: + msg = f'Unknown default token: `{collection_info.default_token}`' + raise ConfigError(msg) + + # Check that config authentication source is present if tokens are defined + # in the config file + for collection_name, _ in config_object.collections.items(): + config_tokens = instance_config.tokens.get(collection_name, {}) + if config_tokens: + if not any( + isinstance(auth_source, ConfigAuthenticationSource) + for auth_source in instance_config.auth_providers[collection_name] + ): + msg = ( + f'Collection `{collection_name}` has tokens defined in ' + 'configuration file, but no `config` authentication source' + ) + raise ConfigError(msg) + + # Check that hashed plain tokens do not clash with hashed tokens: + hashed_plain_tokens = { + hash_token(token) + for collection in instance_config.collections + for token in instance_config.tokens[collection] + if '-' in token + } 
+ hashed_tokens = { + value + for token_dict in instance_config.hashed_tokens.values() + for value in token_dict.values() + } + if hashed_plain_tokens.intersection(hashed_tokens): + msg = 'plain tokens clash with hashed tokens' + raise ConfigError(msg) + + # Check tags + for collection_name, collection_info in config_object.collections.items(): + module = instance_config.model_info[collection_name][0] + try: + resolve_curie(module, collection_info.submission_tags.submission_time_tag) + except CurieResolutionError as e: + raise ConfigError(str(e)) from e + + return instance_config + + +def get_backend_and_extension(backend_type: str) -> tuple[str, str]: + elements = backend_type.split('+') + return (elements[0], elements[1]) if len(elements) > 1 else (elements[0], '') + + +def get_conversion_objects_for_collection( + configuration: Configuration, + instance_state: InstanceState, + collection: str, +) -> dict: + '''Get the conversion objects for the given collection.''' + check_collection(configuration, collection) + return instance_state.conversion_objects[collection] + + +def get_model_info_for_collection( + configuration: Configuration, + instance_state: InstanceState, + collection: str, +) -> tuple[types.ModuleType, dict[str, Any], str]: + '''Get the conversion objects for the given collection.''' + check_collection(configuration, collection) + return instance_state.model_info[collection] +""" diff --git a/dump_things_service/manifest.py b/dump_things_service/manifest.py index 1f36150..3c0ad7e 100644 --- a/dump_things_service/manifest.py +++ b/dump_things_service/manifest.py @@ -5,7 +5,7 @@ from dump_things_service.abstract_config import ( TokenConfig, ) from dump_things_service.collection import create_collection -from dump_things_service.config import InstanceConfig +from dump_things_service.instance_state import InstanceState logger = logging.getLogger('dump_things_service') @@ -13,7 +13,7 @@ logger = logging.getLogger('dump_things_service') def 
manifest_configuration( configuration: Configuration, - instance_config: InstanceConfig, + instance_state: InstanceState, ): """Interpret the configuration and instantiate respective objects @@ -60,7 +60,7 @@ def manifest_configuration( """ # Determine the changes in collections. - existing_collections = set(instance_config.collections) + existing_collections = set(instance_state.collections) configured_collections = set(configuration.collections) new_collection_names = configured_collections - existing_collections deleted_collection_names = existing_collections - configured_collections @@ -69,50 +69,50 @@ def manifest_configuration( # configuration (we do not delete the collection from token-objects here # because token-objects are all re-created below). for collection_name in deleted_collection_names: - delete_collection(instance_config, collection_name) + delete_collection(instance_state, collection_name) # Create the internal representation objects for collections that have been # added to the configuration. for collection_name in new_collection_names: create_collection( - instance_config, + instance_state, configuration.collections[collection_name], ) # Delete all token objects and recreate the tokens. This ensures that # modified token scope and permissions are set for all tokens. 
- for token_name in list(instance_config.tokens): - delete_token(instance_config, token_name) + for token_name in list(instance_state.tokens): + delete_token(instance_state, token_name) for token_name, token_configuration in configuration.tokens.items(): create_token( - instance_config, + instance_state, token_name, token_configuration, ) if new_collection_names or deleted_collection_names: - instance_config.fastapi_app.openapi_schema = None - instance_config.fastapi_app.setup() + instance_state.fastapi_app.openapi_schema = None + instance_state.fastapi_app.setup() def create_token( - instance_config: InstanceConfig, + instance_state: InstanceState, token_name: str, token_configuration: TokenConfig, ): - instance_config.xxx_tokens[token_name] = token_configuration + instance_state.tokens[token_name] = token_configuration def delete_token( - global_objects: InstanceConfig, + instance_state: InstanceState, token_name: str, ): - global_objects.tokens.pop(token_name) + instance_state.tokens.pop(token_name) def delete_collection( - global_objects: InstanceConfig, + instance_state: InstanceState, collection_name: str, ): - global_objects.collections.pop(collection_name) + instance_state.collections.pop(collection_name) diff --git a/dump_things_service/token.py b/dump_things_service/token.py index d8d3ddd..461fbbd 100644 --- a/dump_things_service/token.py +++ b/dump_things_service/token.py @@ -1,15 +1,6 @@ import hashlib -from pydantic import BaseModel - - -class TokenPermission(BaseModel): - curated_read: bool = False - incoming_read: bool = False - incoming_write: bool = False - curated_write: bool = False - zones_access: bool = False - admin: bool = False +from dump_things_service.abstract_config import TokenPermission def get_token_parts(token: str) -> list[str]: diff --git a/dump_things_service/token_endpoints.py b/dump_things_service/token_endpoints.py index e298430..07d81ec 100644 --- a/dump_things_service/token_endpoints.py +++ 
b/dump_things_service/token_endpoints.py @@ -2,7 +2,6 @@ import hashlib import logging import random import sys -from typing import cast from urllib.parse import quote from fastapi import ( @@ -11,52 +10,31 @@ from fastapi import ( HTTPException, Response, ) -from fastapi_pagination import ( - Page, - add_pagination, - paginate, -) -from pydantic import BaseModel -from starlette.status import HTTP_404_NOT_FOUND from dump_things_service import ( HTTP_201_CREATED, - HTTP_401_UNAUTHORIZED, + HTTP_404_NOT_FOUND, HTTP_409_CONFLICT, ) +from dump_things_service.abstract_config import ( + TokenConfig, + read_config, + store_config, +) from dump_things_service.admin import authenticate_admin from dump_things_service.api_key import api_key_header_scheme -from dump_things_service.config import ( - InstanceConfig, - TokenModes, - get_config, -) -from dump_things_service.utils import check_collection +#from dump_things_service.config import get_config +from dump_things_service.exceptions import ConfigError +from dump_things_service.manifest import manifest_configuration +from dump_things_service.utils import wrap_http_exception + logger = logging.getLogger('dump_things_service') router = APIRouter() -#add_pagination(router) -class TokenCollectionInfo(BaseModel): - mode: TokenModes - incoming_label: str - - -class TokenRequestBase(BaseModel): +class TokenRequest(TokenConfig): name: str - user_id: str - curated: str - incoming: str - collection_info: dict[str, TokenCollectionInfo] - - -class CreateTokenRequest(TokenRequestBase): - representation: str | None = None - - -class TokenResponse(TokenRequestBase): - representation: str def get_token_parts(token: str) -> list[str]: @@ -82,36 +60,36 @@ def hash_token(token: str) -> str: ) async def create_token( response: Response, - body: CreateTokenRequest, + body: TokenRequest, api_key: str = Depends(api_key_header_scheme), -) -> TokenResponse: +) -> TokenRequest: instance_config = get_config() - - # Check admin rights 
authenticate_admin(instance_config, api_key) + abstract_config = read_config(store_path=instance_config.store_path) # Check for existing token-name - if body.name in instance_config.xxx_tokens: + if body.name in abstract_config.tokens: raise HTTPException( status_code=HTTP_409_CONFLICT, detail=f"Token with name '{body.name}' already exists.", ) # Ensure that all specified collections and modes exist - for collection_name, token_collection_info in body.collection_info.items(): - check_collection(instance_config, collection_name) + for collection_name, token_collection_info in body.collections.items(): + if collection_name not in abstract_config.collections: + detail = f"No such collection: '{collection_name}'." + raise HTTPException(status_code=HTTP_404_NOT_FOUND, detail=detail) + print(f'IMPLEMENT: check incoming label ({token_collection_info.incoming_label}), check mode ({token_collection_info.mode})', file=sys.stderr, flush=True) # TODO: check mode(!), check incoming_label(?) if body.representation: # We have a specific representation, check that it is not already used - for token in instance_config.xxx_tokens.values(): + for token in abstract_config.tokens.values(): if token.representation == body.representation: - raise HTTPException( - status_code=HTTP_409_CONFLICT, - detail=f"Representation '{body.representation}' already exists.", - ) + detail= f"Representation '{body.representation}' already exists." + raise HTTPException(status_code=HTTP_409_CONFLICT, detail=detail) else: # Generate a random representation that does not yet exist. 
collision = True @@ -125,19 +103,30 @@ async def create_token( ) # Store the new token in the configuration - instance_config.xxx_tokens[body.name] = TokenResponse( - name = body.name, - user_id = body.user_id, - collection_info=body.collection_info, - representation=cast(str, body.representation), + abstract_config.tokens[body.name] = TokenConfig( + user_id=body.user_id, + collections=body.collections, + representation=body.representation, + hashed=body.hashed, + ) + + # Manifest the configuration + with wrap_http_exception(ConfigError): + manifest_configuration(abstract_config, instance_config) + + # Persist the configuration + store_config( + store_path=instance_config.store_path, + config=abstract_config, ) response.headers['Location'] = f'/tokens/{quote(body.name)}' - return TokenResponse( + return TokenRequest( name=body.name, user_id=body.user_id, - representation=cast(str, body.representation), - collection_info=body.collection_info, + collections=body.collections, + representation=body.representation, + hashed=body.hashed, ) @@ -148,26 +137,22 @@ async def create_token( ) async def get_tokens( api_key: str = Depends(api_key_header_scheme), -) -> list[TokenResponse]: +) -> list[TokenRequest]: instance_config = get_config() - - # Check admin rights authenticate_admin(instance_config, api_key) - return list(instance_config.xxx_tokens.values()) - -def authenticate_admin( - instance_config: InstanceConfig, - api_key: str, -): - print('IMPLEMENT: authenticate_admin() ', file=sys.stderr, flush=True) - if api_key != 'admin-1': - detail = f'invalid admin token: {api_key}' - raise HTTPException( - status_code=HTTP_401_UNAUTHORIZED, - detail=detail, + abstract_config = read_config(store_path=instance_config.store_path) + return [ + TokenRequest( + name=n, + user_id=t.user_id, + representation=t.representation, + collections=t.collections, + hashed=t.hashed, ) + for n, t in abstract_config.tokens.items() + ] @router.get( @@ -178,16 +163,21 @@ def authenticate_admin( 
async def get_token_with_name( token_name: str, api_key: str = Depends(api_key_header_scheme), -) -> TokenResponse: +) -> TokenRequest: instance_config = get_config() - - # Check admin rights authenticate_admin(instance_config, api_key) - if token_name not in instance_config.xxx_tokens: - raise HTTPException( - status_code=HTTP_404_NOT_FOUND, - detail=f"token with name '{token_name}' does not exist.", - ) - return instance_config.xxx_tokens[token_name] + abstract_config = read_config(store_path=instance_config.store_path) + if token_name not in abstract_config.tokens: + detail = f"token with name '{token_name}' does not exist." + raise HTTPException(status_code=HTTP_404_NOT_FOUND, detail=detail) + + t = abstract_config.tokens[token_name] + return TokenRequest( + name=token_name, + user_id=t.user_id, + representation=t.representation, + collections=t.collections, + hashed=t.hashed, + ) diff --git a/dump_things_service/utils.py b/dump_things_service/utils.py index 064dcf3..efc535e 100644 --- a/dump_things_service/utils.py +++ b/dump_things_service/utils.py @@ -1,5 +1,14 @@ +""" + + +To speed up processing, multiple indices could be introduced, e.g.: + +- token representation -> token name + +""" from __future__ import annotations +import hashlib import logging import sys from contextlib import contextmanager @@ -7,6 +16,7 @@ from functools import reduce from typing import ( TYPE_CHECKING, Callable, + Iterable, ) import fsspec @@ -22,6 +32,13 @@ from dump_things_service import ( HTTP_413_CONTENT_TOO_LARGE, HTTP_503_SERVICE_UNAVAILABLE, ) +from dump_things_service.abstract_config import ( + Configuration, + TokenConfig, + tokens_for_collection, + CollectionConfig, + TokenCollectionConfig, +) from dump_things_service.auth import ( AuthenticationError, AuthenticationInfo, @@ -37,7 +54,7 @@ if TYPE_CHECKING: from dump_things_service import JSON from dump_things_service.backends.record_dir import RecordDirStore from dump_things_service.backends.sqlite import SQLiteBackend 
- from dump_things_service.config import InstanceConfig + from dump_things_service.instance_state import InstanceState from dump_things_service.store.model_store import ModelStore @@ -82,12 +99,12 @@ def combine_ttl(documents: list[str]) -> str: def get_schema_type_curie( - instance_config: InstanceConfig, + instance_state: InstanceState, collection: str, class_name: str, ) -> str: - schema_url = instance_config.schemas[collection] - schema_module = instance_config.conversion_objects[schema_url]['schema_module'] + schema_url = instance_state.schemas[collection] + schema_module = instance_state.conversion_objects[schema_url]['schema_module'] class_object = getattr(schema_module, class_name) return class_object.class_class_curie @@ -109,12 +126,12 @@ def wrap_http_exception( def join_default_token_permissions( - instance_config: InstanceConfig, + instance_state: InstanceState, permissions: TokenPermission, collection: str, ) -> TokenPermission: - default_token_name = instance_config.collections[collection].default_token - default_token_permissions = instance_config.tokens[collection][default_token_name]['permissions'] + default_token_name = instance_state.collections[collection].default_token + default_token_permissions = instance_state.tokens[collection][default_token_name]['permissions'] result = TokenPermission() result.curated_read = ( permissions.curated_read | default_token_permissions.curated_read @@ -128,54 +145,15 @@ def join_default_token_permissions( return result -def check_collection( - instance_config: InstanceConfig, - collection: str, -): - if collection not in instance_config.collections: - raise HTTPException( - status_code=HTTP_404_NOT_FOUND, - detail=f"No such collection: '{collection}'.", - ) - - -def check_label( - instance_config: InstanceConfig, - collection: str, - label: str, -): - # Get the on-disk labels for the collection - if ( - label not in get_config_labels(instance_config, collection) - and label not in 
get_on_disk_labels(instance_config, collection) - ): - raise HTTPException( - status_code=HTTP_404_NOT_FOUND, - detail=f"No incoming label: '{label}' in collection: '{collection}'.", - ) - - -def get_config_labels( - instance_config: InstanceConfig, - collection: str, -) -> set[str]: - check_collection(instance_config, collection) - return { - token['incoming_label'] - for token in instance_config.tokens[collection].values() - if token['incoming_label'] != '' - } - - def get_on_disk_labels( - instance_config: InstanceConfig, - collection: str, + store_path: Path, + abstract_config: Configuration, + collection: str, ) -> set[str]: - check_collection(instance_config, collection) + check_collection(abstract_config, collection) incoming_path = ( - instance_config.store_path - / instance_config.collections[collection].incoming + store_path / abstract_config.collections[collection].incoming ) if not incoming_path or not incoming_path.exists(): return set() @@ -187,56 +165,8 @@ def get_on_disk_labels( } -def get_default_token_name( - instance_config: InstanceConfig, - collection: str -) -> str: - check_collection(instance_config, collection) - return instance_config.collections[collection].default_token - - -async def process_token( - instance_config: InstanceConfig, - api_key: str, - collection: str, -) -> tuple[TokenPermission, ModelStore]: - token = ( - get_default_token_name(instance_config, collection) - if api_key is None - else api_key - ) - - token_store, token, token_permissions, _ = get_token_store( - instance_config, - collection, - token, - ) - final_permissions = join_default_token_permissions( - instance_config, token_permissions, collection - ) - - # Check for maintenance mode - if collection in instance_config.maintenance_mode: - if not ( - final_permissions.curated_read - and final_permissions.curated_write - and final_permissions.zones_access - ): - raise HTTPException( - status_code=HTTP_503_SERVICE_UNAVAILABLE, - detail=f"Collection '{collection}' is 
in maintenance mode", - ) - - if not final_permissions.incoming_read and not final_permissions.curated_read: - raise HTTPException( - status_code=HTTP_403_FORBIDDEN, - detail=f"No read access to curated or incoming data in collection '{collection}'.", - ) - return final_permissions, token_store - - def resolve_hashed_token( - instance_config: InstanceConfig, + instance_state: InstanceState, collection_name: str, token: str, ) -> str: @@ -244,7 +174,7 @@ def resolve_hashed_token( # Check for hashed token and return the hashed token value instead # of the plain text token value if the token is hashed. if '-' in token: - return instance_config.hashed_tokens[collection_name].get( + return instance_state.hashed_tokens[collection_name].get( get_token_parts(token)[0], token, ) @@ -252,29 +182,29 @@ def resolve_hashed_token( def authenticate_token( - instance_config: InstanceConfig, + instance_state: InstanceState, collection_name: str, - plain_token: str, + token_representation: str, ) -> AuthenticationInfo: # Try to authenticate the token with the authentication providers that # are associated with the collection. 
auth_info = None messages = [] - for auth_provider in instance_config.auth_providers[collection_name]: + for auth_source in instance_state.auth_sources[collection_name]: try: - logger.debug('trying to authenticate with %s', auth_provider) - auth_info = auth_provider.authenticate(plain_token) + logger.debug('trying to authenticate with %s', auth_source) + auth_info = auth_source.authenticate(token_representation) break except AuthenticationError as ae: logger.debug( 'Authentication provider %s could not ' 'authenticate token for collection %s: %s', - auth_provider, + auth_source, collection_name, str(ae), ) - messages.append(f'{auth_provider.__class__.__name__} failed with: {ae}') + messages.append(f'{auth_source.__class__.__name__} failed with: {ae}') continue if not auth_info: @@ -289,38 +219,41 @@ def authenticate_token( def get_token_store( - instance_config: InstanceConfig, + instance_state: InstanceState, collection_name: str, - plain_token: str + token_representation: str, ) -> tuple[ModelStore, str, TokenPermission, str] | tuple[None, None, None, None]: - check_collection(instance_config, collection_name) # Try to authenticate the token with the authentication providers that # are associated with the collection. - auth_info = authenticate_token(instance_config, collection_name, plain_token) + auth_info = authenticate_token( + instance_state, + collection_name, + token_representation, + ) permissions = auth_info.token_permission # If the token is hashed, get the hashed value. This is required because # we associate token info with the hashed version of the token. hashed_token = resolve_hashed_token( - instance_config, + instance_state, collection_name, - plain_token, + token_representation, ) # If the token has no incoming-read or incoming-write permissions, we do not # need to create a store. 
if not permissions.incoming_read and not permissions.incoming_write: - instance_config.token_stores[collection_name][plain_token] = ( + instance_state.token_stores[collection_name][token_representation] = ( None, hashed_token, permissions, auth_info.user_id, ) - return instance_config.token_stores[collection_name][plain_token] + return instance_state.token_stores[collection_name][token_representation] # Check whether the collection has an incoming definition - incoming = instance_config.incoming.get(collection_name) + incoming = instance_state.incoming.get(collection_name) if not incoming: raise HTTPException( status_code=HTTP_401_UNAUTHORIZED, @@ -328,28 +261,28 @@ def get_token_store( ) # Check whether a store for this collection and token does already exist. - store_info = instance_config.token_stores[collection_name].get(plain_token) + store_info = instance_state.token_stores[collection_name].get(token_representation) if store_info: return store_info - store_dir = instance_config.store_path / incoming / auth_info.incoming_label + store_dir = instance_state.store_path / incoming / auth_info.incoming_label token_store = create_token_store( - instance_config=instance_config, + instance_state=instance_state, collection_name=collection_name, store_dir=store_dir, ) - instance_config.token_stores[collection_name][plain_token] = ( + instance_state.token_stores[collection_name][token_representation] = ( token_store, hashed_token, permissions, auth_info.user_id, ) - return instance_config.token_stores[collection_name][plain_token] + return instance_state.token_stores[collection_name][token_representation] def create_token_store( - instance_config: InstanceConfig, + instance_state: InstanceState, collection_name: str, store_dir: Path, ) -> ModelStore: @@ -362,11 +295,11 @@ def create_token_store( # Check if the store was already created and if it was created for the # same schema. 
- if store_dir in instance_config.all_stores: - existing_collection_name, existing_model_store = instance_config.all_stores[store_dir] + if store_dir in instance_state.all_stores: + existing_collection_name, existing_model_store = instance_state.all_stores[store_dir] if ( existing_collection_name != collection_name - and instance_config.schemas[existing_collection_name] != instance_config.schemas[collection_name] + and instance_state.schemas[existing_collection_name] != instance_state.schemas[collection_name] ): msg = ( f"collections '{existing_collection_name}' and " @@ -381,16 +314,16 @@ def create_token_store( store_dir.mkdir(parents=True, exist_ok=True) - schema_uri = instance_config.schemas[collection_name] + schema_uri = instance_state.schemas[collection_name] # We get the backend information from the curated store - backend_type = instance_config.backend[collection_name].type + backend_type = instance_state.backend[collection_name].type backend_name, extension = get_backend_and_extension(backend_type) - backend = instance_config.curated_stores[collection_name].backend + backend = instance_state.curated_stores[collection_name].backend if backend_name == 'record_dir': # The configuration routines have read the backend configuration of the - # curated store from disk and stored it in `instance_config`. We fetch + # curated store from disk and stored it in `instance_state`. We fetch # it from there. 
if extension == 'stl': backend = backend.backend @@ -398,7 +331,7 @@ def create_token_store( token_store = create_record_dir_token_store( store_dir=store_dir, order_by=backend.order_by, - schema_uri=instance_config.schemas[collection_name], + schema_uri=instance_state.schemas[collection_name], mapping_function=backend.pid_mapping_function, suffix=backend.suffix, ) @@ -416,13 +349,13 @@ def create_token_store( if extension == 'stl': token_store = SchemaTypeLayer(backend=token_store, schema=schema_uri) - submission_tags = instance_config.collections[collection_name].submission_tags + submission_tags = instance_state.collections[collection_name].submission_tags tags = { 'id': submission_tags.submitter_id_tag, 'time': submission_tags.submission_time_tag, } model_store = ModelStore(backend=token_store, schema=schema_uri, tags=tags) - instance_config.all_stores[store_dir] = (collection_name, model_store) + instance_state.all_stores[store_dir] = (collection_name, model_store) return model_store @@ -473,3 +406,51 @@ def check_bounds( detail=f"Too many records found in collection '{collection}'. 
" f'Please use pagination (/{collection}{alternative_url}).', ) + + +async def process_token( + abstract_config: Configuration, + api_key: str | None, + collection: str, +) -> tuple[TokenPermission, ModelStore]: + + token_config = ( + get_default_token_config(abstract_config, collection) + if api_key is None + else get_token_config_by_representation(abstract_config, api_key) + ) + + if not token_config: + detail = f'invalid token' + raise HTTPException( + status_code=HTTP_401_UNAUTHORIZED, + detail=detail, + ) + + token_store, token, token_permissions, _ = get_token_store( + instance_state, + collection, + token_config, + ) + final_permissions = join_default_token_permissions( + instance_state, token_permissions, collection + ) + + # Check for maintenance mode + if collection in instance_state.maintenance_mode: + if not ( + final_permissions.curated_read + and final_permissions.curated_write + and final_permissions.zones_access + ): + raise HTTPException( + status_code=HTTP_503_SERVICE_UNAVAILABLE, + detail=f"Collection '{collection}' is in maintenance mode", + ) + + if not final_permissions.incoming_read and not final_permissions.curated_read: + raise HTTPException( + status_code=HTTP_403_FORBIDDEN, + detail=f"No read access to curated or incoming data in collection '{collection}'.", + ) + return final_permissions, token_store -- 2.52.0 From adb100d67e73c2fe7039be0c063179f9c8daaf25 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 28 Apr 2026 10:11:22 +0200 Subject: [PATCH 04/64] [temp] add authentication source to instance state --- dump_things_service/abstract_config.py | 1 + dump_things_service/auth/__init__.py | 1 + dump_things_service/auth/config.py | 10 ++-- dump_things_service/collection.py | 65 ++++++++++++++++++++++---- dump_things_service/instance_state.py | 2 +- dump_things_service/manifest.py | 3 +- 6 files changed, 66 insertions(+), 16 deletions(-) diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py 
index 3fa19e1..f368c21 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -1,4 +1,5 @@ import enum +import hashlib import logging from pathlib import ( Path, diff --git a/dump_things_service/auth/__init__.py b/dump_things_service/auth/__init__.py index e2d87b1..8786e62 100644 --- a/dump_things_service/auth/__init__.py +++ b/dump_things_service/auth/__init__.py @@ -14,6 +14,7 @@ import abc import dataclasses from typing import TYPE_CHECKING + if TYPE_CHECKING: from dump_things_service.token import TokenPermission diff --git a/dump_things_service/auth/config.py b/dump_things_service/auth/config.py index 37888cf..7dff632 100644 --- a/dump_things_service/auth/config.py +++ b/dump_things_service/auth/config.py @@ -7,17 +7,17 @@ from dump_things_service.auth import ( InvalidTokenError, ) from dump_things_service.config import get_permissions -from dump_things_service.utils import get_token_config_for_representation_and_collection +from dump_things_service.abstract_config import get_token_config_for_representation_and_collection class ConfigAuthenticationSource(AuthenticationSource): def __init__( self, abstract_configuration: Configuration, - collection: str, + collection_name: str, ): self.abstract_configuration = abstract_configuration - self.collection = collection + self.collection_name = collection_name def authenticate( self, @@ -26,12 +26,12 @@ class ConfigAuthenticationSource(AuthenticationSource): result = get_token_config_for_representation_and_collection( self.abstract_configuration, - self.collection, + self.collection_name, token_representation, ) if not result: - msg = f'Token not valid for collection `{self.collection}`' + msg = f'Token not valid for collection `{self.collection_name}`' raise InvalidTokenError(msg) _, token_config, token_collection_config = result diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index ca2e75b..2c0e17d 100644 --- 
a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -20,14 +20,19 @@ from dump_things_service import ( HTTP_403_FORBIDDEN, HTTP_422_UNPROCESSABLE_CONTENT, ) -from .abstract_config import ( +from dump_things_service.abstract_config import ( CollectionConfig, + Configuration, + ConfigAuthSpec, + ForgejoAuthSpec, RecordDirBackendConfig, SQLiteBackendConfig, read_config, check_collection, get_default_token_name, ) +from dump_things_service.auth.config import ConfigAuthenticationSource +from dump_things_service.auth.forgejo import ForgejoAuthenticationSource from dump_things_service.backends.sqlite import record_file_name as sqlite_db_filename from dump_things_service.instance_state import ( InstanceState, @@ -115,18 +120,21 @@ async def {name}( def create_collection( instance_state: InstanceState, - collection_configuration: CollectionConfig, + configuration: Configuration, + collection_name: str, ): - """Create a collection as specified by `collection_configuration` + """Create a collection instance as specified by `collection_configuration` Reuse existing disk structures, if they are compatible. If they are not compatible, raise an error. 
:param instance_state: - :param collection_configuration: + :param configuration: + :param collection_name: :return: """ + collection_configuration = configuration.collections[collection_name] curated_path = Path(instance_state.store_path / collection_configuration.curated) incoming_path = ( None @@ -204,13 +212,22 @@ def create_collection( collection_configuration.schema, ) - # Create the audit log + # Create the audit log backends for audit_backend in collection_configuration.audit_backends: audit_path = Path(instance_state.store_path / audit_backend.path) if not audit_path.exists(): create_audit_store(audit_path) created_directories.append(audit_path) + # Create the authentication sources + for authentication_spec in collection_configuration.auth_sources: + create_authentication_source( + configuration, + collection_name, + authentication_spec, + instance_state, + ) + # Create the dynamic endpoints for record storing & validation, for # inbox-storing, and for curated area storing. create_endpoints_for_collection( @@ -219,7 +236,7 @@ def create_collection( ) # Create the collection configuration element - instance_state.xxx_collections[collection_configuration.name] = collection_configuration + #instance_state.collections[collection_configuration.name] = collection_configuration def create_backend( @@ -230,6 +247,37 @@ def create_backend( assert backend_config.type == 'record_dir+stl' +def create_authentication_source( + abstract_configuration: Configuration, + collection_name: str, + authentication_spec: ConfigAuthSpec | ForgejoAuthSpec, + instance_state: InstanceState, +): + if collection_name not in instance_state.auth_sources: + instance_state.auth_sources[collection_name] = [] + + auth_sources = instance_state.auth_sources[collection_name] + if isinstance(authentication_spec, ConfigAuthSpec): + auth_source = ConfigAuthenticationSource( + abstract_configuration=abstract_configuration, + collection_name=collection_name, + ) + elif 
isinstance(authentication_spec, ForgejoAuthSpec): + auth_source = ForgejoAuthenticationSource( + api_url=authentication_spec.url, + organization=authentication_spec.organization, + team=authentication_spec.team, + label_type=authentication_spec.label_type, + instance_id=authentication_spec.repository, + repository=authentication_spec.repository, + ) + else: + msg = f"Unsupported authentication config type: '{type(authentication_spec)}'" + raise ConfigError(msg) + + auth_sources.append(auth_source) + + # TODO: should this be in instance_state? def initialize_backend( incoming_path: Path, @@ -275,9 +323,8 @@ def check_store_compatibility( elif isinstance(backend_config, SQLiteBackendConfig): check_sqlite_compatibility(store_path, backend_config, schema) else: - raise ConfigError( - f"Unsupported backend config type: '{type(backend_config)}'" - ) + msg = f"Unsupported backend config type: '{type(backend_config)}'" + raise ConfigError(msg) return diff --git a/dump_things_service/instance_state.py b/dump_things_service/instance_state.py index 1df8811..9f0706b 100644 --- a/dump_things_service/instance_state.py +++ b/dump_things_service/instance_state.py @@ -91,7 +91,7 @@ class InstanceState: # Created based on abstract configuration collectiona: dict = dataclasses.field(default_factory=dict) tokens: dict = dataclasses.field(default_factory=dict) - auth_sources: dict = dataclasses.field(default_factory=dict) + auth_sources: dict[str, list] = dataclasses.field(default_factory=dict) audit_backends: dict = dataclasses.field(default_factory=dict) model_info: dict = dataclasses.field(default_factory=dict) diff --git a/dump_things_service/manifest.py b/dump_things_service/manifest.py index 3c0ad7e..d360bdb 100644 --- a/dump_things_service/manifest.py +++ b/dump_things_service/manifest.py @@ -76,7 +76,8 @@ def manifest_configuration( for collection_name in new_collection_names: create_collection( instance_state, - configuration.collections[collection_name], + configuration, + 
collection_name, ) # Delete all token objects and recreate the tokens. This ensures that -- 2.52.0 From e93b6b6a48c6bdcd55c4cfeb30a426b40cf19280 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 28 Apr 2026 11:12:53 +0200 Subject: [PATCH 05/64] add audit backend generation and general fixes --- dump_things_service/abstract_config.py | 110 ++- .../{token.py => api_token.py} | 2 - dump_things_service/auth/__init__.py | 2 +- dump_things_service/auth/config.py | 6 +- dump_things_service/auth/forgejo.py | 2 +- dump_things_service/backends/sqlite.py | 3 + dump_things_service/collection.py | 280 +++++--- dump_things_service/collection_endpoints.py | 20 +- dump_things_service/commands/copy_store.py | 2 +- dump_things_service/commands/rebuild_index.py | 5 +- dump_things_service/converter.py | 44 +- dump_things_service/curated.py | 72 +- dump_things_service/dynamic_endpoints.py | 4 +- dump_things_service/export/json.py | 12 +- dump_things_service/export/tree.py | 25 +- dump_things_service/incoming.py | 70 +- dump_things_service/instance_state.py | 168 +++-- dump_things_service/main.py | 288 ++++---- dump_things_service/manifest.py | 17 +- dump_things_service/model.py | 58 +- dump_things_service/tests/create_store.py | 28 +- dump_things_service/tests/fixtures.py | 656 +++++++++++------- dump_things_service/tests/test_auth.py | 2 +- dump_things_service/tests/test_basic.py | 33 +- dump_things_service/tests/test_config.py | 19 +- dump_things_service/tests/test_curated.py | 18 +- .../tests/test_extract_inline.py | 14 +- dump_things_service/tests/test_incoming.py | 38 +- .../tests/test_pid_resolution.py | 4 +- dump_things_service/tests/test_roundtrip.py | 4 +- .../tests/test_roundtrip_flatsocial.py | 20 +- .../tests/test_token_endpoints.py | 4 +- dump_things_service/tests/test_utils.py | 2 +- .../tests/test_web_interface.py | 6 +- dump_things_service/token_endpoints.py | 51 +- dump_things_service/utils.py | 149 ++-- dump_things_service/validate.py | 98 +++ .../{config.py 
=> xxx_no_config.py} | 8 +- 38 files changed, 1415 insertions(+), 929 deletions(-) rename dump_things_service/{token.py => api_token.py} (85%) create mode 100644 dump_things_service/validate.py rename dump_things_service/{config.py => xxx_no_config.py} (99%) diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index f368c21..9acf159 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -14,15 +14,9 @@ from fastapi import HTTPException from pydantic import ( BaseModel, ConfigDict, - Field, ) -from dump_things_service import ( - HTTP_401_UNAUTHORIZED, - HTTP_403_FORBIDDEN, - HTTP_404_NOT_FOUND, - HTTP_503_SERVICE_UNAVAILABLE, -) +from dump_things_service import HTTP_404_NOT_FOUND from dump_things_service.audit.gitaudit import GitAuditBackend from dump_things_service.backends.record_dir import ( _RecordDirStore, @@ -65,14 +59,14 @@ class TagSpec(BaseModel): class RecordDirBackendConfig(StrictModel): + model_config = ConfigDict(use_enum_values=True) type: Literal['record_dir', 'record_dir+stl'] - mapping_method: MappingMethod = MappingMethod.digest_md5 - class Config: - use_enum_values = True + mapping_method: str = MappingMethod.digest_md5.value class SQLiteBackendConfig(StrictModel): type: Literal['sqlite', 'sqlite+stl'] + schema: str class GitAuditBackendConfig(StrictModel): @@ -96,6 +90,15 @@ class CollectionConfig(BaseModel): ignore_classes: list[str] = [] +class RecordDirConfigFileContent(BaseModel): + model_config = ConfigDict(extra='forbid') + type: Literal['records'] + version: Literal[1] + schema: str + format: Literal['yaml'] + idfx: MappingMethod + + class TokenModes(enum.Enum): READ_CURATED = 'READ_CURATED' READ_COLLECTION = 'READ_COLLECTION' @@ -121,7 +124,7 @@ class TokenPermission(BaseModel): class TokenCollectionConfig(StrictModel): model_config = ConfigDict(extra='forbid', use_enum_values=True) mode: TokenModes - incoming_label: str = Field(strict=True) + 
incoming_label: str = '' class TokenConfig(StrictModel): @@ -145,6 +148,47 @@ class Configuration(BaseModel): pid: str = dump_things_config_iri +mode_mapping = { + TokenModes.READ_CURATED: TokenPermission(curated_read=True), + TokenModes.READ_COLLECTION: TokenPermission( + curated_read=True, + incoming_read=True, + ), + TokenModes.WRITE_COLLECTION: TokenPermission( + curated_read=True, + incoming_read=True, + incoming_write=True, + ), + TokenModes.READ_SUBMISSIONS: TokenPermission(incoming_read=True), + TokenModes.WRITE_SUBMISSIONS: TokenPermission( + incoming_read=True, + incoming_write=True, + ), + TokenModes.SUBMIT: TokenPermission(curated_read=True, incoming_write=True), + TokenModes.SUBMIT_ONLY: TokenPermission(incoming_write=True), + TokenModes.NOTHING: TokenPermission(), + TokenModes.CURATOR: TokenPermission( + curated_read=True, + incoming_read=True, + incoming_write=True, + curated_write=True, + zones_access=True, + ), + TokenModes.ADMIN: TokenPermission( + curated_read=True, + incoming_read=True, + incoming_write=True, + curated_write=True, + zones_access=True, + admin=True, + ) +} + + +def get_permissions(mode: str) -> TokenPermission: + return mode_mapping[TokenModes(mode)] + + def get_config_backends( store_path: Path, ) -> tuple[_RecordDirStore, GitAuditBackend]: @@ -187,6 +231,15 @@ def read_config( return g_abstract_configuration +def get_config() -> Configuration: + global g_abstract_configuration + + if not g_abstract_configuration: + msg = 'Configuration not yet loaded' + raise RuntimeError(msg) + return g_abstract_configuration + + def store_config( store_path, config: Configuration, @@ -253,14 +306,17 @@ def check_collection( def check_label( + store_path: Path, abstract_config: Configuration, collection: str, label: str, ): + from dump_things_service.utils import get_on_disk_labels + """Check that a label exists in a collection configuration or on disk""" if ( label not in get_config_labels(abstract_config, collection) - and label not in 
get_on_disk_labels(abstract_config, collection) + and label not in get_on_disk_labels(store_path, abstract_config, collection) ): raise HTTPException( status_code=HTTP_404_NOT_FOUND, @@ -293,7 +349,7 @@ def get_token_info_by_representation( token_representation: str, ) -> tuple[str, TokenConfig] | None: """Get the name of the token given in `token_representation`""" - hashed_representation = hashlib.sha1(token_representation.encode()).hexdigest() + hashed_representation = hash_token_representation(token_representation) for token_name, token_config in abstract_config.tokens.items(): if token_config.hashed: compare_representation = hashed_representation @@ -304,6 +360,12 @@ def get_token_info_by_representation( return None +def hash_token_representation( + token_representation: str, +) -> str: + return hashlib.sha1(token_representation.encode()).hexdigest() + + def get_token_config_by_name( abstract_config: Configuration, token_name: str, @@ -358,7 +420,7 @@ def get_collection_config_by_name( def get_default_token_config( abstract_config: Configuration, collection: str, -) -> TokenConfig: +) -> TokenConfig | None: default_token_name = get_collection_config_by_name( abstract_config, @@ -366,3 +428,23 @@ def get_default_token_config( ).default_token return get_token_config_by_name(abstract_config, default_token_name) + + +def get_default_token_representation( + abstract_config: Configuration, + collection: str, +) -> str | None: + default_token_config = get_default_token_config( + abstract_config, + collection, + ) + return default_token_config.representation if default_token_config else None + + +def get_mapping_function(record_dir_backend_config: RecordDirBackendConfig): + return mapping_functions[MappingMethod(record_dir_backend_config.mapping_method)] + + +def get_backend_and_extension(backend_type: str) -> tuple[str, str]: + elements = backend_type.split('+') + return (elements[0], elements[1]) if len(elements) > 1 else (elements[0], '') diff --git 
a/dump_things_service/token.py b/dump_things_service/api_token.py similarity index 85% rename from dump_things_service/token.py rename to dump_things_service/api_token.py index 461fbbd..da173ee 100644 --- a/dump_things_service/token.py +++ b/dump_things_service/api_token.py @@ -1,7 +1,5 @@ import hashlib -from dump_things_service.abstract_config import TokenPermission - def get_token_parts(token: str) -> list[str]: parts = token.split('-', 1) diff --git a/dump_things_service/auth/__init__.py b/dump_things_service/auth/__init__.py index 8786e62..56231dd 100644 --- a/dump_things_service/auth/__init__.py +++ b/dump_things_service/auth/__init__.py @@ -16,7 +16,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from dump_things_service.token import TokenPermission + from dump_things_service.api_token import TokenPermission class AuthenticationError(Exception): diff --git a/dump_things_service/auth/config.py b/dump_things_service/auth/config.py index 7dff632..916b2b4 100644 --- a/dump_things_service/auth/config.py +++ b/dump_things_service/auth/config.py @@ -6,8 +6,10 @@ from dump_things_service.auth import ( AuthenticationSource, InvalidTokenError, ) -from dump_things_service.config import get_permissions -from dump_things_service.abstract_config import get_token_config_for_representation_and_collection +from dump_things_service.abstract_config import ( + get_permissions, + get_token_config_for_representation_and_collection, +) class ConfigAuthenticationSource(AuthenticationSource): diff --git a/dump_things_service/auth/forgejo.py b/dump_things_service/auth/forgejo.py index d088ff4..3b99524 100644 --- a/dump_things_service/auth/forgejo.py +++ b/dump_things_service/auth/forgejo.py @@ -28,7 +28,7 @@ from dump_things_service.auth import ( AuthenticationSource, InvalidTokenError, ) -from dump_things_service.config import TokenPermission +from dump_things_service.abstract_config import TokenPermission logger = logging.getLogger('dump_things_service') diff --git 
a/dump_things_service/backends/sqlite.py b/dump_things_service/backends/sqlite.py index 7b10ddb..be89d8f 100644 --- a/dump_things_service/backends/sqlite.py +++ b/dump_things_service/backends/sqlite.py @@ -124,6 +124,9 @@ class _SQLiteBackend(StorageBackend): order_by: Iterable[str] | None = None, echo: bool = False, ) -> None: + assert db_path.is_absolute(), f'db_path not absolute {db_path}' + if db_path.exists(): + assert db_path.is_file(), f'db_path not a file {db_path}' super().__init__(order_by=order_by) self.db_path = db_path self.perform_file_name_conversion() diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index 2c0e17d..99c08fa 100644 --- a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -13,7 +13,11 @@ from pydantic import ( TypeAdapter, ValidationError, ) - +from starlette.responses import ( + JSONResponse, + PlainTextResponse, +) +from starlette.status import HTTP_401_UNAUTHORIZED from dump_things_service import ( HTTP_400_BAD_REQUEST, @@ -27,17 +31,37 @@ from dump_things_service.abstract_config import ( ForgejoAuthSpec, RecordDirBackendConfig, SQLiteBackendConfig, + TagSpec, read_config, check_collection, - get_default_token_name, + get_backend_and_extension, + get_default_token_representation, + get_mapping_function, ) +from dump_things_service.audit import AuditBackend +from dump_things_service.audit.gitaudit import GitAuditBackend from dump_things_service.auth.config import ConfigAuthenticationSource from dump_things_service.auth.forgejo import ForgejoAuthenticationSource -from dump_things_service.backends.sqlite import record_file_name as sqlite_db_filename +from dump_things_service.backends.record_dir import ( + _RecordDirStore, + RecordDirStore, +) +from dump_things_service.backends.sqlite import ( + _SQLiteBackend, + SQLiteBackend, + record_file_name as sqlite_db_filename, +) +from dump_things_service.backends.schema_type_layer import SchemaTypeLayer from 
dump_things_service.instance_state import ( InstanceState, - get_collection_dir_config, + get_record_dir_config, get_instance_state, + get_schema_info, + record_dir_config_file_name, InstanceStateCollectionInfo, +) +from dump_things_service.store.model_store import ( + _ModelStore, + ModelStore, ) from dump_things_service.converter import FormatConverter from dump_things_service.exceptions import ( @@ -51,32 +75,37 @@ from dump_things_service.utils import ( get_token_store, join_default_token_permissions, wrap_http_exception, + var_escape, ) # This following lines are required for dynamic endpoint generation from typing import Annotated -from fastapi import Body, Depends +from fastapi import ( + Body, + Depends, +) from dump_things_service import Format from dump_things_service.api_key import api_key_header_scheme -from starlette.responses import JSONResponse, PlainTextResponse +from dump_things_service.curated import store_curated_record +from dump_things_service.incoming import store_incoming_record +from dump_things_service.validate import validate_record logger = logging.getLogger('dump_things_service') _endpoint_template = """ -async def {name}( +def {name}( data: {model_var_name}.{class_name} | Annotated[str, Body(media_type='text/plain')], api_key: str = Depends(api_key_header_scheme), format: Format = Format.json, ) -> JSONResponse | PlainTextResponse: logger.info('{name}(%s, %s, %s, %s)', repr(data), repr('{class_name}'), repr({model_var_name}), repr(format)) - return await {handler}('{collection}', data, '{class_name}', {model_var_name}, format, api_key) + return {handler}('{collection}', data, '{class_name}', {model_var_name}, format, api_key) """ - _endpoint_curated_template = """ -async def {name}( +def {name}( data: {model_var_name}.{class_name}, author_id: str | None = None, api_key: str = Depends(api_key_header_scheme), @@ -87,7 +116,7 @@ async def {name}( repr(author_id), repr({model_var_name}), ) - return await store_curated_record( + return 
store_curated_record( '{collection}', data, '{class_name}', @@ -151,40 +180,23 @@ def create_collection( collection_configuration.schema, ) - if incoming_path and incoming_path.exists(): - check_store_compatibility( - incoming_path, - collection_configuration.backend, - collection_configuration.schema, - ) - for audit_backend in collection_configuration.audit_backends: audit_path = Path(instance_state.store_path / audit_backend.path) if audit_path.exists(): check_audit_compatibility(audit_path) - # We knoe now that all existing structures are compatible with the - # collection specification. We record what was created in order to delete + # We know now that all existing structures are compatible with the + # collection specification. We record what we create in order to delete # it in case of an error. created_directories = [] try: if not curated_path.exists(): curated_path.mkdir(parents=True) created_directories.append(curated_path) - initialize_backend( - curated_path, - collection_configuration.backend, - collection_configuration.schema, - ) if incoming_path and not incoming_path.exists(): incoming_path.mkdir(parents=True) created_directories.append(incoming_path) - initialize_backend( - incoming_path, - collection_configuration.backend, - collection_configuration.schema, - ) for audit_backend in collection_configuration.audit_backends: audit_path = Path(instance_state.store_path / audit_backend.path) @@ -198,26 +210,58 @@ def create_collection( shutil.rmtree(directory) raise - # Create the backends - create_backend( + # Create the curated store + curated_store = create_store( + instance_state, curated_path, collection_configuration.backend, collection_configuration.schema, + collection_configuration.submission_tags + ) + instance_state.curated_stores[collection_name] = curated_store + + # Incoming stores are created on demand when a token is authenticated + instance_state.incoming_stores[collection_name] = {} + + # Create the schema modules, schema view, and 
conversion objects + schema_location = collection_configuration.schema + instance_state.schema_info[schema_location] = get_schema_info(schema_location) + + # Determine the active classes based on the classes defined in the schema + # and the configuration of the collection + active_classes = set(instance_state.schema_info[schema_location].classes) + if collection_configuration.use_classes: + active_classes &= set(collection_configuration.use_classes) + if collection_configuration.ignore_classes: + active_classes -= set(collection_configuration.ignore_classes) + instance_state.collections[collection_name] = InstanceStateCollectionInfo( + active_classes=active_classes ) - if incoming_path: - create_backend( - incoming_path, - collection_configuration.backend, - collection_configuration.schema, - ) + # Create a validator for the collection + instance_state.validators[collection_name] = FormatConverter( + schema=collection_configuration.schema, + input_format=Format.json, + output_format=Format.ttl, + ) - # Create the audit log backends - for audit_backend in collection_configuration.audit_backends: - audit_path = Path(instance_state.store_path / audit_backend.path) - if not audit_path.exists(): - create_audit_store(audit_path) - created_directories.append(audit_path) + x = """ + if incoming_path: + if collection_name not in instance_state.incoming_stores: + instance_state.incoming_stores[collection_name] = {} + + # Create a store for each incoming area label in this collection + for token_name, path in active_incoming_store_info: + label_path = incoming_path / path + incoming_store = create_store( + instance_state, + label_path, + collection_configuration.backend, + collection_configuration.schema, + collection_configuration.submission_tags, + ) + instance_state.incoming_stores[collection_name][token_name] = incoming_store + """ # Create the authentication sources for authentication_spec in collection_configuration.auth_sources: @@ -228,23 +272,81 @@ def 
create_collection( instance_state, ) + # Create the audit-backends + instance_state.audit_backends[collection_name] = [] + for audit_backend_config in collection_configuration.audit_backends: + instance_state.audit_backends[collection_name].append( + GitAuditBackend( + path=Path(instance_state.store_path / audit_backend_config.path), + auto_flush_timeout=audit_backend_config.auto_flush_timeout, + ) + ) + # Create the dynamic endpoints for record storing & validation, for # inbox-storing, and for curated area storing. create_endpoints_for_collection( + instance_state, collection_configuration, instance_state.fastapi_app, ) - # Create the collection configuration element - #instance_state.collections[collection_configuration.name] = collection_configuration - -def create_backend( - incoming_path: Path, +def create_store( + instance_state: InstanceState, + relative_path: Path, backend_config: RecordDirBackendConfig | SQLiteBackendConfig, schema: str, -): - assert backend_config.type == 'record_dir+stl' + submission_tags: TagSpec, +) -> _ModelStore: + + backend_type, extension = get_backend_and_extension(backend_config.type) + if isinstance(backend_config, RecordDirBackendConfig): + backend = create_record_dir_backend(instance_state, relative_path, backend_config, schema) + elif isinstance(backend_config, SQLiteBackendConfig): + backend = create_sqlite_backend(instance_state, relative_path) + else: + msg = f'Unsupported backend configuration type: {backend_type} ({type(backend_config)})' + raise ConfigError(msg) + + if extension == 'stl': + backend = SchemaTypeLayer(backend=backend, schema=schema) + + return ModelStore( + schema=schema, + backend=backend, + tags={ + 'id': submission_tags.submitter_id_tag, + 'time': submission_tags.submission_time_tag, + }, + ) + + +def create_record_dir_backend( + instance_state: InstanceState, + relative_path: Path, + backend_config: RecordDirBackendConfig, + schema: str, +) -> _RecordDirStore: + path = instance_state.store_path / 
relative_path + write_record_dir_config(path, backend_config, schema) + backend = RecordDirStore( + root=path, + pid_mapping_function=get_mapping_function(backend_config), + suffix='yaml', + order_by=instance_state.order_by, + ) + backend.build_index_if_needed(schema=schema) + return backend + + +def create_sqlite_backend( + instance_state: InstanceState, + relative_path: Path, +) -> _SQLiteBackend: + return SQLiteBackend( + db_path=instance_state.store_path / relative_path / sqlite_db_filename, + order_by=instance_state.order_by, + ) def create_authentication_source( @@ -278,26 +380,23 @@ def create_authentication_source( auth_sources.append(auth_source) -# TODO: should this be in instance_state? -def initialize_backend( - incoming_path: Path, - backend_config: RecordDirBackendConfig | SQLiteBackendConfig, +def write_record_dir_config( + path: Path, + backend_config: RecordDirBackendConfig, schema: str, ): - assert backend_config.type == 'record_dir+stl' + assert isinstance(backend_config, RecordDirBackendConfig) - print(f'Incoming path: {incoming_path}') - print(f'backend spec: {backend_config}') - print(f'schema: {schema}') - - (incoming_path / '.dumpthings.yaml').write_text(f""" + record_dir_config_file_path = path / record_dir_config_file_name + if not record_dir_config_file_path.exists(): + record_dir_config_file_path.write_text(f"""# RecordDir Config type: records version: 1 schema: {schema} format: yaml idfx: {backend_config.mapping_method} -""" - ) +""", + ) def create_audit_store(*args, **kwargs): @@ -333,7 +432,7 @@ def check_record_dir_compatibility( backend_config: RecordDirBackendConfig, schema: str, ): - record_dir_config = get_collection_dir_config(store_path) + record_dir_config = get_record_dir_config(store_path) if record_dir_config.schema != schema: raise ConfigCollisionError(f"Existing backend uses a different schema: '{record_dir_config.schema}'") @@ -371,6 +470,7 @@ def check_audit_compatibility( def create_endpoint( operation_name: str, 
operation_path: str, + instance_state: InstanceState, collection_config: CollectionConfig, template: str, handler: str, @@ -383,18 +483,13 @@ def create_endpoint( collection_config.name, ) + # TODO: get schema_info from instance_state!? model, classes, model_var_name = get_model_for_schema(collection_config.schema) globals()[model_var_name] = model - use_classes = set(classes) - if collection_config.use_classes: - use_classes &= set(collection_config.use_classes) - - if collection_config.ignore_classes: - use_classes -= set(collection_config.ignore_classes) - - for class_name in use_classes: - endpoint_name = f'_endpoint_{collection_config.name}_{operation_name}_{class_name}' + active_classes = instance_state.collections[collection_config.name].active_classes + for class_name in active_classes: + endpoint_name = f'_endpoint_{var_escape(collection_config.name)}_{operation_name}_{class_name}' endpoint_source = template.format( name=endpoint_name, model_var_name=model_var_name, @@ -417,12 +512,13 @@ def create_endpoint( logger.info( 'Creation of %d %s-endpoints completed.', - len(use_classes), + len(active_classes), operation_name, ) def create_endpoints_for_collection( + instance_state: InstanceState, collection_config: CollectionConfig, app: FastAPI, ): @@ -434,13 +530,14 @@ def create_endpoints_for_collection( tag_name, ) in ( ('store', 'record', _endpoint_template, 'store_record', f'Write records to collection "{collection_config.name}"'), - ('validate', 'validate', _endpoint_template, 'validate_record', f'Validate records for collection "{collection_config.name}"'), - ('curated', 'curated/record', _endpoint_template, 'store_curated_record', f'Store records in curated area of collection "{collection_config.name}"'), - ('incoming', 'incoming/{label}/record', _endpoint_template, 'store_incoming_record', f'Store records in incoming area "{{label}}" of collection "{collection_config.name}"'), + ('validate', 'validate/record', _endpoint_template, 'validate_record', 
f'Validate records for collection "{collection_config.name}"'), + ('curated', 'curated/record', _endpoint_curated_template, 'store_curated_record', f'Store records in curated area of collection "{collection_config.name}"'), + ('incoming', 'incoming/{label}/record', _endpoint_incoming_template, 'store_incoming_record', f'Store records in incoming area "{{label}}" of collection "{collection_config.name}"'), ): create_endpoint( operation_name=operation_name, operation_path=operation_path, + instance_state=instance_state, collection_config=collection_config, template=template, handler=handler, @@ -471,24 +568,33 @@ def store_record( abstract_config = read_config(instance_state.store_path) check_collection(abstract_config, collection) - token = ( - get_default_token_name(instance_state, collection) - if api_key is None - else api_key - ) + token_representation = get_default_token_representation( + abstract_config, + collection, + ) if api_key is None else api_key + + if not token_representation: + raise HTTPException( + status_code=HTTP_401_UNAUTHORIZED, + detail=f'Not authorized to submit to collection "{collection}"', + ) # Get the token permissions and extend them by the default permissions. # This call will also convert plaintext tokens into the hashed version of # the token, if the token is hashed. This is necessary because we do not # store the plaintext token, so all token-information is associated with # the hashed representation of the token. 
- store, token, token_permissions, user_id = get_token_store( + store, token_permissions, user_id = get_token_store( + abstract_config, instance_state, collection, - token, + token_representation, ) final_permissions = join_default_token_permissions( - instance_state, token_permissions, collection + abstract_config, + instance_state, + token_permissions, + collection, ) if not final_permissions.incoming_write: raise HTTPException( @@ -499,7 +605,7 @@ def store_record( if input_format == Format.ttl: with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Conversion error'): json_object = FormatConverter( - instance_state.schemas[collection], + abstract_config.collections[collection].schema, input_format=Format.ttl, output_format=Format.json, ).convert(data, class_name) @@ -516,7 +622,7 @@ def store_record( if input_format == Format.ttl: format_converter = FormatConverter( - instance_state.schemas[collection], + abstract_config.collections[collection].schema, input_format=Format.json, output_format=Format.ttl, ) diff --git a/dump_things_service/collection_endpoints.py b/dump_things_service/collection_endpoints.py index b2d56e9..b15e2ca 100644 --- a/dump_things_service/collection_endpoints.py +++ b/dump_things_service/collection_endpoints.py @@ -31,13 +31,11 @@ from dump_things_service.abstract_config import ( store_config, CollectionConfig, Configuration, - check_collection, - get_default_token_name, ) from dump_things_service.admin import authenticate_admin from dump_things_service.api_key import api_key_header_scheme +from dump_things_service.instance_state import get_instance_state from dump_things_service.manifest import manifest_configuration -#from dump_things_service.config import get_config from dump_things_service.exceptions import ConfigError from dump_things_service.utils import wrap_http_exception @@ -75,10 +73,10 @@ async def create_collection( api_key: str = Depends(api_key_header_scheme), ): - instance_config = 
get_config() + instance_state = get_instance_state() # Check admin rights - authenticate_admin(instance_config, api_key) + authenticate_admin(instance_state, api_key) # TODO: read the current abstract configuration, check for a collection # of the given name. If it does not exist yet, add a collection @@ -86,7 +84,7 @@ async def create_collection( # new configuration. If there are no errors, persist the new # configuration. configuration: Configuration = read_config( - store_path=instance_config.store_path + store_path=instance_state.store_path ) # Check for existing collection name @@ -97,17 +95,15 @@ async def create_collection( ) # Update the abstract configuration - configuration.collections[body.name] = Configuration( - **(body.model_dump(mode='json')), - ) + configuration.collections[body.name] = body # Manifest the abstract configuration with wrap_http_exception(ConfigError): - manifest_configuration(configuration, instance_config) + manifest_configuration(configuration, instance_state) # Persist the configuration store_config( - store_path=instance_config.store_path, + store_path=instance_state.store_path, config=configuration, ) @@ -155,7 +151,7 @@ def ensure_backend_type( backend_name, extension = get_backend_and_extension(backend_spec) if backend_name is 'record_dir': try: - config = Config.get_collection_dir_config(path) + config = Config.get_record_dir_config(path) except ConfigError as e: raise HTTPException( status_code=HTTP_409_CONFLICT, diff --git a/dump_things_service/commands/copy_store.py b/dump_things_service/commands/copy_store.py index 6585625..8eeda94 100644 --- a/dump_things_service/commands/copy_store.py +++ b/dump_things_service/commands/copy_store.py @@ -17,7 +17,7 @@ from dump_things_service.backends.sqlite import ( from dump_things_service.backends.sqlite import ( record_file_name as sqlite_record_file_name, ) -from dump_things_service.config import get_backend_and_extension +from dump_things_service.abstract_config import 
get_backend_and_extension if TYPE_CHECKING: from dump_things_service.backends import StorageBackend diff --git a/dump_things_service/commands/rebuild_index.py b/dump_things_service/commands/rebuild_index.py index f909d22..0825dc4 100644 --- a/dump_things_service/commands/rebuild_index.py +++ b/dump_things_service/commands/rebuild_index.py @@ -8,7 +8,8 @@ import yaml from dump_things_service import config_file_name from dump_things_service.backends.record_dir_index import RecordDirIndex -from dump_things_service.config import CollectionDirConfig +from dump_things_service.abstract_config import RecordDirConfigFileContent + parser = ArgumentParser( prog='Rebuild the index of a `record_dir`-store', @@ -59,7 +60,7 @@ def process_config(arguments) -> tuple[Path, str, str]: config_path = ( Path(arguments.config) if arguments.config else store / config_file_name ) - config_object = CollectionDirConfig( + config_object = RecordDirConfigFileContent( **yaml.load(config_path.read_text(), Loader=yaml.SafeLoader) ) return ( diff --git a/dump_things_service/converter.py b/dump_things_service/converter.py index 74d4348..ed85a8d 100644 --- a/dump_things_service/converter.py +++ b/dump_things_service/converter.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +from functools import cache from json import loads as json_loads from typing import ( TYPE_CHECKING, @@ -8,11 +9,11 @@ from typing import ( Callable, ) +from linkml_runtime import SchemaView from linkml.utils.datautils import ( get_dumper, get_loader, ) -from linkml_runtime import SchemaView from rdflib.term import ( URIRef, _toPythonMapping, @@ -24,9 +25,11 @@ from dump_things_service.lazy_list import LazyList from dump_things_service.model import ( get_model_for_schema, get_schema_model_for_schema, + get_schema_view, ) from dump_things_service.utils import cleaned_json + if TYPE_CHECKING: from types import ModuleType @@ -35,9 +38,6 @@ if TYPE_CHECKING: from dump_things_service.backends import RecordInfo 
-_cached_conversion_objects = {} - - class TypeValidator: def __init__( self, @@ -72,21 +72,27 @@ def add_type_validator( ) -def get_conversion_objects(schema: str): - if schema not in _cached_conversion_objects: - schema_view = SchemaView(schema) - _cached_conversion_objects[schema] = { - 'schema_module': get_schema_model_for_schema(schema), - 'schema_view': schema_view, - } - # Add types to support explicit type clauses in TTL - for type_definition in schema_view.all_types().values(): - uri = schema_view.expand_curie(type_definition.uri) - add_type_validator( - uri_ref=uri, - regex=type_definition.pattern, - ) - return _cached_conversion_objects[schema] +# Get conversion objects and prepare the conversion by adding type +# validators for explicit type expressions to the RDFLib loader. The +# latter is necessary to load the TTL in rdflib loader. +@cache +def get_conversion_objects(schema_location: str) -> dict: + schema_view = get_schema_view(schema_location) + result = { + 'schema_module': get_schema_model_for_schema(schema_location), + 'schema_view': schema_view, + } + + # Add types to support explicit type clauses in TTL + # TODO: this should probably be outside of a cached function and in a + # function with an appropriate name that indicates the side effect + for type_definition in schema_view.all_types().values(): + uri = schema_view.expand_curie(type_definition.uri) + add_type_validator( + uri_ref=uri, + regex=type_definition.pattern, + ) + return result class FormatConverter: diff --git a/dump_things_service/curated.py b/dump_things_service/curated.py index 6e575c6..2f6ef0b 100644 --- a/dump_things_service/curated.py +++ b/dump_things_service/curated.py @@ -19,13 +19,14 @@ from fastapi_pagination import ( from dump_things_service import ( HTTP_401_UNAUTHORIZED, HTTP_404_NOT_FOUND, - HTTP_422_UNPROCESSABLE_CONTENT, + HTTP_422_UNPROCESSABLE_CONTENT, abstract_config, ) -from dump_things_service.abstract_config import check_collection +from 
dump_things_service.abstract_config import check_collection, read_config, \ + get_config, get_token_config_for_representation_and_collection from dump_things_service.api_key import api_key_header_scheme from dump_things_service.backends.schema_type_layer import _SchemaTypeLayer -#from dump_things_service.config import get_config from dump_things_service.exceptions import CurieResolutionError +from dump_things_service.instance_state import get_instance_state from dump_things_service.lazy_list import ModifierList from dump_things_service.utils import ( authenticate_token, @@ -39,7 +40,8 @@ if TYPE_CHECKING: from dump_things_service.backends import StorageBackend from dump_things_service.lazy_list import LazyList - from dump_things_service.store.model_store import ModelStore + from dump_things_service.store.model_store import _ModelStore + _endpoint_curated_template = """ async def {name}( @@ -79,8 +81,8 @@ async def read_curated_records_of_type( matching: str | None = None, api_key: str | None = Depends(api_key_header_scheme), ): - instance_config = get_config() - if class_name not in instance_config.use_classes[collection]: + instance_state = get_instance_state() + if class_name not in instance_state.collections[collection].active_classes: raise HTTPException( status_code=HTTP_404_NOT_FOUND, detail=f"No '{class_name}'-class in collection '{collection}'.", @@ -108,8 +110,8 @@ async def read_curated_records_of_type_paginated( api_key: str | None = Depends(api_key_header_scheme), ) -> Page[dict]: - instance_config = get_config() - if class_name not in instance_config.use_classes[collection]: + instance_state = get_instance_state() + if class_name not in instance_state.collections[collection].active_classes: raise HTTPException( status_code=HTTP_404_NOT_FOUND, detail=f"No '{class_name}'-class in collection '{collection}'.", @@ -207,10 +209,10 @@ async def _read_curated_records( pid: str | None, matching: str | None = None, api_key: str | None = None, - upper_bound: int 
= 1000, + upper_bound: int | None = 1000, ) -> LazyList | dict | None: - model_store, backend = await _get_store_and_backend(collection, api_key) + model_store, backend = _get_store_and_backend(collection, api_key) if pid: record_info = backend.get_record_by_iri(model_store.pid_to_iri(pid)) @@ -244,7 +246,7 @@ async def _delete_curated_record( api_key: str | None = None, ) -> bool: with wrap_http_exception(Exception): - model_store, backend = await _get_store_and_backend(collection, api_key) + model_store, backend = _get_store_and_backend(collection, api_key) result = backend.remove_record(model_store.pid_to_iri(pid)) if not result: raise HTTPException( @@ -255,10 +257,10 @@ async def _delete_curated_record( return True -async def _get_store_and_backend( +def _get_store_and_backend( collection: str, plain_token: str | None, -) -> tuple[ModelStore, StorageBackend]: +) -> tuple[_ModelStore, StorageBackend]: # A token is required if plain_token is None: @@ -267,13 +269,14 @@ async def _get_store_and_backend( detail='token required', ) - instance_config = get_config() + instance_state = get_instance_state() + abstract_config = read_config(instance_state.store_path) # Check that the collection exists - check_collection(instance_config, collection) + check_collection(abstract_config=abstract_config, collection=collection) # Get token permissions - auth_info = authenticate_token(instance_config, collection, plain_token) + auth_info = authenticate_token(instance_state, collection, plain_token) permissions = auth_info.token_permission if permissions.curated_write is False: raise HTTPException( @@ -282,14 +285,14 @@ async def _get_store_and_backend( ) # Get the curated model store - model_store = instance_config.curated_stores[collection] + model_store = instance_state.curated_stores[collection] backend = model_store.backend if isinstance(backend, _SchemaTypeLayer): return model_store, backend.backend return model_store, backend -def create_curated_endpoints( +def 
xxx_create_curated_endpoints( app: FastAPI, tag_info: list[dict[str, str]], placeholder: str, @@ -299,7 +302,8 @@ def create_curated_endpoints( logger.info('Creating dynamic curated endpoints...') serial_number = count() - instance_config = get_config() + instance_state = get_instance_state() + abstract_config = read_config(instance_state.store_path) generated_tags = [] for collection, ( @@ -313,7 +317,7 @@ def create_curated_endpoints( if model_var_name not in global_dict: global_dict[model_var_name] = model - for class_name in instance_config.use_classes[collection]: + for class_name in instance_state.collections[collection].active_classes: # Create an endpoint to dump data of type `class_name` of schema # `application`. @@ -352,20 +356,19 @@ def create_curated_endpoints( ) -async def store_curated_record( - collection: str, - data: BaseModel, - class_name: str, - author_id: str | None = None, - api_key: str | None = Depends(api_key_header_scheme), +def store_curated_record( + collection: str, + data: BaseModel, + class_name: str, + author_id: str | None = None, + api_key: str | None = Depends(api_key_header_scheme), ): - - instance_config = get_config() + instance_state = get_instance_state() with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): - instance_config.validators[collection].validate(data) + instance_state.validators[collection].validate(data) pid = data.pid - model_store, backend = await _get_store_and_backend(collection, api_key) + model_store, backend = _get_store_and_backend(collection, api_key) json_object = cleaned_json( data.model_dump(exclude_none=True, mode='json'), @@ -379,9 +382,14 @@ async def store_curated_record( json_object, ) - for audit_backend in instance_config.audit_backends[collection]: + _, token_config, _ = get_token_config_for_representation_and_collection( + abstract_config=get_config(), + token_representation=api_key, + collection_name=collection, + ) + for audit_backend 
in instance_state.audit_backends[collection]: audit_backend.add_record( record=json_object, - committer_id=instance_config.tokens[collection][api_key]['user_id'], + committer_id=token_config.user_id, author_id=author_id, ) diff --git a/dump_things_service/dynamic_endpoints.py b/dump_things_service/dynamic_endpoints.py index dda8262..0c01c53 100644 --- a/dump_things_service/dynamic_endpoints.py +++ b/dump_things_service/dynamic_endpoints.py @@ -22,7 +22,7 @@ async def {name}( """ -def create_store_endpoints( +def xxx_create_store_endpoints( app: FastAPI, configuration: Configuration, tag_info: list[dict[str, str]], @@ -79,7 +79,7 @@ def create_store_endpoints( logger.info('Creation of %d endpoints completed.', next(serial_number)) -def create_validate_endpoints( +def xxx_create_validate_endpoints( app: FastAPI, configuration: Configuration, tag_info: list[dict[str, str]], diff --git a/dump_things_service/export/json.py b/dump_things_service/export/json.py index 122ee52..3181482 100644 --- a/dump_things_service/export/json.py +++ b/dump_things_service/export/json.py @@ -3,7 +3,7 @@ import sys from pathlib import Path from typing import TextIO -from dump_things_service.config import InstanceConfig +from dump_things_service.abstract_config import Configuration from dump_things_service.lazy_list import LazyList from dump_things_service.model import get_classes from dump_things_service.store.model_store import ModelStore @@ -35,7 +35,7 @@ def _lookahead(iterable): def export_json( - instance_config: InstanceConfig, + abstract_config: Configuration, destination: str, ): if destination == '-': @@ -44,9 +44,9 @@ def export_json( output = Path(destination).open('wt', encoding='utf-8') # noqa: SIM115 output.write('{\n') - for collection, is_last in _lookahead(instance_config.collections): + for collection, is_last in _lookahead(abstract_config.collections): output.write(f'{level_width * " "}"{collection}": {{\n') - export_collection(instance_config, collection, 2 * 
level_width, output) + export_collection(abstract_config, collection, 2 * level_width, output) if is_last: output.write(f'\n{level_width * " "}}}\n') else: @@ -55,12 +55,12 @@ def export_json( def export_collection( - instance_config: InstanceConfig, + abstract_config: Configuration, collection: str, indent: int, output: TextIO, ): - output.write(f'{indent * " "}"schema": "{instance_config.schemas[collection]}",\n') + output.write(f'{indent * " "}"schema": "{abstract_config.collections[collection].schema}",\n') output.write(f'{indent * " "}"curated": {{\n') append_classes( instance_config.curated_stores[collection], indent + level_width, output diff --git a/dump_things_service/export/tree.py b/dump_things_service/export/tree.py index d12855f..df11d27 100644 --- a/dump_things_service/export/tree.py +++ b/dump_things_service/export/tree.py @@ -2,10 +2,12 @@ from pathlib import Path import yaml -from dump_things_service.config import ( - InstanceConfig, + +from dump_things_service.abstract_config import ( + Configuration, get_mapping_function_by_name, ) +from dump_things_service.instance_state import InstanceState from dump_things_service.model import get_classes from dump_things_service.store.model_store import ModelStore @@ -13,7 +15,8 @@ idfx = get_mapping_function_by_name('digest-md5-p3-p3') def export_tree( - instance_config: InstanceConfig, + abstract_config: Configuration, + instance_state: InstanceState, destination: str, ): destination = Path(destination) @@ -22,16 +25,18 @@ def export_tree( raise ValueError(msg) destination.mkdir(parents=True, exist_ok=True) - for collection in instance_config.collections: + for collection_name in abstract_config.collections: export_collection( - instance_config, - collection, + abstract_config, + instance_state, + collection_name, destination, ) def export_collection( - instance_config: InstanceConfig, + abstract_config: Configuration, + instance_state: InstanceState, collection: str, destination: Path, ): @@ -41,7 +46,7 @@ 
def export_collection( config_content = ( 'type: records\n' 'version: 1\n' - f'schema: {instance_config.schemas[collection]}\n' + f'schema: {abstract_config.collections[collection].schema}\n' 'format: yaml\n' 'idfx: digest-md5-p3-p3\n' ) @@ -50,9 +55,9 @@ def export_collection( curated_destination.mkdir(parents=True, exist_ok=True) (curated_destination / '.dumpthings.yaml').write_text(config_content) exported_stores = { - id(instance_config.curated_stores[collection]): curated_destination + id(instance_state.curated_stores[collection]): curated_destination } - export_classes(instance_config.curated_stores[collection], curated_destination) + export_classes(instance_state.curated_stores[collection], curated_destination) # Determine stores for incoming zones zones = { diff --git a/dump_things_service/incoming.py b/dump_things_service/incoming.py index 1e91c22..318108c 100644 --- a/dump_things_service/incoming.py +++ b/dump_things_service/incoming.py @@ -25,11 +25,12 @@ from dump_things_service.abstract_config import ( check_collection, check_label, get_config_labels, + get_config, ) from dump_things_service.api_key import api_key_header_scheme from dump_things_service.backends.schema_type_layer import _SchemaTypeLayer -#from dump_things_service.config import get_config from dump_things_service.exceptions import CurieResolutionError +from dump_things_service.instance_state import get_instance_state from dump_things_service.lazy_list import ModifierList from dump_things_service.utils import ( authenticate_token, @@ -45,7 +46,8 @@ if TYPE_CHECKING: from dump_things_service.backends import StorageBackend from dump_things_service.lazy_list import LazyList - from dump_things_service.store.model_store import ModelStore + from dump_things_service.store.model_store import _ModelStore + _endpoint_incoming_template = """ async def {name}( @@ -85,8 +87,10 @@ async def incoming_read_labels( ) -> list[str]: # Authorize api_key await authorize_zones(collection, api_key) + + 
instance_state = get_instance_state() configured_labels = get_config_labels(get_config(), collection) - on_disk_labels = get_on_disk_labels(get_config(), collection) + on_disk_labels = get_on_disk_labels(instance_state.store_path, get_config(), collection) return list(configured_labels.union(on_disk_labels)) @@ -102,8 +106,8 @@ async def incoming_read_records_of_type( matching: str | None = None, api_key: str | None = Depends(api_key_header_scheme), ): - instance_config = get_config() - if class_name not in instance_config.use_classes[collection]: + instance_state = get_instance_state() + if class_name not in instance_state.collections[collection].active_classes: raise HTTPException( status_code=HTTP_404_NOT_FOUND, detail=f"No '{class_name}'-class in collection '{collection}'.", @@ -133,8 +137,8 @@ async def incoming_read_records_of_type_paginated( api_key: str | None = Depends(api_key_header_scheme), ) -> Page[dict]: - instance_config = get_config() - if class_name not in instance_config.use_classes[collection]: + instance_state = get_instance_state() + if class_name not in instance_state.collections[collection].active_classes: raise HTTPException( status_code=HTTP_404_NOT_FOUND, detail=f"No '{class_name}'-class in collection '{collection}'.", @@ -295,49 +299,58 @@ async def _get_store_and_backend( collection: str, label: str, plain_token: str | None, -) -> tuple[ModelStore, StorageBackend]: +) -> tuple[_ModelStore, StorageBackend]: # Authorize api_key await authorize_zones(collection, plain_token) # Check that the incoming zone exists - instance_config = get_config() - check_label(instance_config, collection, label) + instance_state = get_instance_state() + abstract_config = get_config() + check_label(instance_state.store_path, abstract_config, collection, label) # Create a store (or get an already created store) for collection # `collection` and storage dir `store_dir`. 
store_dir = ( - instance_config.store_path - / instance_config.incoming[collection] - / label + instance_state.store_path + / abstract_config.collections[collection].incoming + / label ) # `create_token_store` will cache and return already created stores with # the same collection and storage dir. model_store = create_token_store( - instance_config=instance_config, + abstract_configuration=abstract_config, + instance_state=instance_state, collection_name=collection, store_dir=store_dir, ) + xxx = """ # For consistency, associate the store with all matching tokens from the - # configuration file. + # configuration file. That means with all tokens that have the same + # input matching_tokens = [ - token - for token, token_info in instance_config.tokens[collection].items() - if token_info['incoming_label'] == label + token_name + for token_name, token_info in abstract_config.tokens.items() + if (collection, label) in [ + (collection_name, token_collection_info.incoming_label) + for collection_name, token_collection_info in token_info.items() + ] ] + for matching_token in matching_tokens: # Associate the store with all matching tokens in the configuration. # Note: there are stores that are not associated with a token in - # the configuration. These are stores that belong to a token that - # are authenticated with an external authentication source. - token_info = instance_config.tokens[collection][matching_token] - instance_config.token_stores[collection][matching_token] = ( + # the abstract configuration. These are stores that belong to a token + # that is authenticated with an external authentication source. 
+ token_info = instance_state.tokens[collection][matching_token] + instance_state.token_stores[collection][matching_token] = ( model_store, matching_token, token_info['permissions'], token_info['user_id'], ) + """ backend = model_store.backend if isinstance(backend, _SchemaTypeLayer): @@ -356,12 +369,13 @@ async def authorize_zones( detail='token required', ) - instance_config = get_config() + abstract_config = get_config() + instance_state = get_instance_state() # Check that the collection exists - check_collection(instance_config, collection) + check_collection(abstract_config, collection) - auth_info = authenticate_token(instance_config, collection, plain_token) + auth_info = authenticate_token(instance_state, collection, plain_token) permissions = auth_info.token_permission if permissions.zones_access is False: raise HTTPException( @@ -370,7 +384,7 @@ async def authorize_zones( ) -def create_incoming_endpoints( +def xxx_create_incoming_endpoints( app: FastAPI, tag_info: list[dict[str, str]], placeholder: str, @@ -441,9 +455,9 @@ async def store_incoming_record( api_key: str | None = Depends(api_key_header_scheme), ): - instance_config = get_config() + instance_state = get_instance_state() with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): - instance_config.validators[collection].validate(data) + instance_state.validators[collection].validate(data) pid = data.pid model_store, backend = await _get_store_and_backend( diff --git a/dump_things_service/instance_state.py b/dump_things_service/instance_state.py index 9f0706b..d277ba3 100644 --- a/dump_things_service/instance_state.py +++ b/dump_things_service/instance_state.py @@ -1,84 +1,83 @@ from __future__ import annotations import dataclasses -import enum -import hashlib import logging -from functools import partial +from functools import cache from pathlib import Path +from types import ModuleType from typing import ( - TYPE_CHECKING, Any, Callable, - 
Literal, cast, ) import yaml -from fastapi import ( - FastAPI, - HTTPException, -) -from pydantic import ( - BaseModel, - ConfigDict, - Field, - ValidationError, -) +from fastapi import FastAPI +from linkml_runtime import SchemaView +from pydantic import ValidationError from yaml.scanner import ScannerError -from dump_things_service import ( - HTTP_404_NOT_FOUND, - Format, -) from dump_things_service.abstract_config import ( - Configuration, - check_collection, + RecordDirConfigFileContent, MappingMethod, -) -from dump_things_service.audit.gitaudit import GitAuditBackend -from dump_things_service.backends.record_dir import RecordDirStore -from dump_things_service.backends.schema_type_layer import SchemaTypeLayer -from dump_things_service.backends.sqlite import SQLiteBackend -from dump_things_service.backends.sqlite import ( - record_file_name as sqlite_record_file_name, + mapping_functions, ) -from dump_things_service.converter import FormatConverter, get_conversion_objects +from dump_things_service.converter import get_conversion_objects from dump_things_service.exceptions import ( ConfigError, - CurieResolutionError, ) -from dump_things_service.model import get_model_for_schema -from dump_things_service.resolve_curie import resolve_curie -from dump_things_service.store.model_store import ModelStore -from dump_things_service.token import ( - TokenPermission, - get_token_parts, - hash_token, +from dump_things_service.model import ( + get_model_for_schema, + get_schema_model_for_schema, + get_schema_view, ) -if TYPE_CHECKING: - import types logger = logging.getLogger('dump_things_service') -config_file_name = '.dumpthings.yaml' -ignored_files = {'.', '..', config_file_name} +record_dir_config_file_name = '.dumpthings.yaml' +ignored_files = {'.', '..', record_dir_config_file_name} -class CollectionDirConfig(BaseModel): - model_config = ConfigDict(extra='forbid') - type: Literal['records'] - version: Literal[1] - schema: str - format: Literal['yaml'] - idfx: 
MappingMethod +@dataclasses.dataclass +class PydanticModuleInfo: + module: ModuleType + module_var_name: str + + +@dataclasses.dataclass +class SchemaInfo: + schema_view: SchemaView + classes: list[str] + pydantic_module_info: PydanticModuleInfo + python_module: ModuleType + conversion_objects: tuple[Any, Any] + + +@dataclasses.dataclass +class InstanceStateCollectionInfo: + active_classes: set[str] + + +@cache +def get_schema_info(schema_location: str): + module, classes, module_var_name = get_model_for_schema(schema_location) + return SchemaInfo( + schema_view=get_schema_view(schema_location), + classes=classes, + pydantic_module_info=PydanticModuleInfo( + module=module, + module_var_name=module_var_name, + ), + python_module=get_schema_model_for_schema(schema_location), + conversion_objects=get_conversion_objects(schema_location), + ) @dataclasses.dataclass class InstanceState: - # foundational information from command line or initialisation code + # foundational information from command line or initialization code store_path: Path bootstrap_token: str | None @@ -89,35 +88,42 @@ class InstanceState: maintenance_mode: set = dataclasses.field(default_factory=set) # Created based on abstract configuration - collectiona: dict = dataclasses.field(default_factory=dict) + collections: dict[str, InstanceStateCollectionInfo] = dataclasses.field(default_factory=dict) tokens: dict = dataclasses.field(default_factory=dict) auth_sources: dict[str, list] = dataclasses.field(default_factory=dict) - audit_backends: dict = dataclasses.field(default_factory=dict) - - model_info: dict = dataclasses.field(default_factory=dict) - schemas: dict = dataclasses.field(default_factory=dict) - conversion_objects: dict = dataclasses.field(default_factory=dict) - - order_by: list[str] = dataclasses.field(default_factory=list) - collections: dict = dataclasses.field(default_factory=dict) - all_stores: dict = dataclasses.field(default_factory=dict) + audit_backends: dict[str, list] = 
dataclasses.field(default_factory=dict) curated_stores: dict = dataclasses.field(default_factory=dict) - incoming: dict = dataclasses.field(default_factory=dict) - zones: dict = dataclasses.field(default_factory=dict) - permissions: dict = dataclasses.field(default_factory=dict) - #model_info: dict = dataclasses.field(default_factory=dict) - token_stores: dict = dataclasses.field(default_factory=dict) + incoming_stores: dict = dataclasses.field(default_factory=dict) + schema_info: dict[str, SchemaInfo] = dataclasses.field(default_factory=dict) + validators: dict = dataclasses.field(default_factory=dict) + order_by: list[str] = dataclasses.field(default_factory=list) + all_stores: dict = dataclasses.field(default_factory=dict) + + # OLD STUFF #schemas: dict = dataclasses.field(default_factory=dict) #conversion_objects: dict = dataclasses.field(default_factory=dict) - backend: dict = dataclasses.field(default_factory=dict) + #model_info: dict = dataclasses.field(default_factory=dict) + + #order_by: list[str] = dataclasses.field(default_factory=list) + #collections: dict = dataclasses.field(default_factory=dict) + #curated_stores: dict = dataclasses.field(default_factory=dict) + #incoming_stores: dict = dataclasses.field(default_factory=dict) + #incoming: dict = dataclasses.field(default_factory=dict) + #zones: dict = dataclasses.field(default_factory=dict) + #permissions: dict = dataclasses.field(default_factory=dict) + #model_info: dict = dataclasses.field(default_factory=dict) + # token_stores: dict = dataclasses.field(default_factory=dict) + #schemas: dict = dataclasses.field(default_factory=dict) + #conversion_objects: dict = dataclasses.field(default_factory=dict) + #backend: dict = dataclasses.field(default_factory=dict) #tokens: dict = dataclasses.field(default_factory=dict) - hashed_tokens: dict = dataclasses.field(default_factory=dict) - validators: dict = dataclasses.field(default_factory=dict) - use_classes: dict = dataclasses.field(default_factory=dict) + 
#hashed_tokens: dict = dataclasses.field(default_factory=dict) + #validators: dict = dataclasses.field(default_factory=dict) + #use_classes: dict = dataclasses.field(default_factory=dict) #maintenance_mode: set = dataclasses.field(default_factory=set) #audit_backends: dict = dataclasses.field(default_factory=dict) - xxx_tokens: dict = dataclasses.field(default_factory=dict) - xxx_collections: dict = dataclasses.field(default_factory=dict) + #xxx_tokens: dict = dataclasses.field(default_factory=dict) + #xxx_collections: dict = dataclasses.field(default_factory=dict) g_instance_state:InstanceState | None = None @@ -141,7 +147,7 @@ def create_instance_state( return cast(InstanceState, g_instance_state) -def get_instance_state() -> InstanceState | None: +def get_instance_state() -> InstanceState: global g_instance_state if not g_instance_state: @@ -150,16 +156,16 @@ def get_instance_state() -> InstanceState | None: return g_instance_state -def get_collection_dir_config( +def get_record_dir_config( path: Path, - file_name: str = config_file_name, -) -> CollectionDirConfig: + file_name: str = record_dir_config_file_name, +) -> RecordDirConfigFileContent: config_path = path / file_name if not config_path.exists(): msg = f'Config file does not exist: {config_path}' raise ConfigError(msg) try: - return CollectionDirConfig( + return RecordDirConfigFileContent( **yaml.load(config_path.read_text(), Loader=yaml.SafeLoader) ) except ScannerError as e: @@ -170,6 +176,14 @@ def get_collection_dir_config( raise ConfigError(msg) from e +def get_mapping_function_by_name(mapping_function_name: str) -> Callable: + return mapping_functions[MappingMethod(mapping_function_name)] + + +def get_mapping_function(collection_config: RecordDirConfigFileContent): + return mapping_functions[collection_config.idfx] + + x = """ class StrictModel(BaseModel): model_config = ConfigDict(extra='forbid') @@ -390,7 +404,7 @@ class Config: return Config.get_config_from_file(path / file_name) @staticmethod - 
def get_collection_dir_config( + def get_record_dir_config( path: Path, file_name: str = config_file_name, ) -> CollectionDirConfig: @@ -513,7 +527,7 @@ def process_config_object( backend_name, extension = get_backend_and_extension(backend.type) if backend_name == 'record_dir': # Get the config from the curated directory - collection_config = Config.get_collection_dir_config( + collection_config = Config.get_record_dir_config( store_path / collection_info.curated ) schema = collection_config.schema diff --git a/dump_things_service/main.py b/dump_things_service/main.py index acd29c0..3ba768e 100644 --- a/dump_things_service/main.py +++ b/dump_things_service/main.py @@ -9,6 +9,7 @@ from typing import ( TYPE_CHECKING, ) +from dump_things_service.manifest import manifest_configuration # Perform the patching before importing any third-party libraries from dump_things_service.patches import enabled # noqa: F401 @@ -44,52 +45,33 @@ from dump_things_service import ( HTTP_404_NOT_FOUND, HTTP_422_UNPROCESSABLE_CONTENT, Format, - config_file_name, ) from dump_things_service.__about__ import __version__ -from dump_things_service.api_key import api_key_header_scheme -from dump_things_service.config import ( - get_config, - process_config, +from dump_things_service.abstract_config import ( + check_collection, + get_default_token_name, + read_config, ) +from dump_things_service.api_key import api_key_header_scheme from dump_things_service.converter import ( FormatConverter, ConvertingList, ) -from dump_things_service.curated import ( - create_curated_endpoints, - router as curated_router, - store_curated_record, # noqa F401 -- used by generated code -) +from dump_things_service.curated import router as curated_router from dump_things_service.exceptions import CurieResolutionError -from dump_things_service.incoming import ( - create_incoming_endpoints, - router as incoming_router, - store_incoming_record, # noqa F401 -- used by generated code -) -from 
dump_things_service.dynamic_endpoints import ( - create_store_endpoints, - create_validate_endpoints, -) +from dump_things_service.incoming import router as incoming_router +from dump_things_service.instance_state import create_instance_state from dump_things_service.lazy_list import ( PriorityList, ModifierList, ) -from dump_things_service.model import ( - get_subclasses, -) -from dump_things_service.collection_endpoints import ( - router as collection_router, -) -from dump_things_service.token_endpoints import ( - router as token_router, -) +from dump_things_service.model import get_subclasses +from dump_things_service.collection_endpoints import router as collection_router +from dump_things_service.token_endpoints import router as token_router from dump_things_service.utils import ( authenticate_token, check_bounds, - check_collection, combine_ttl, - get_default_token_name, get_token_store, join_default_token_permissions, process_token, @@ -129,12 +111,12 @@ parser = argparse.ArgumentParser() parser.add_argument('--host', default='0.0.0.0') # noqa S104 parser.add_argument('--port', default=8000, type=int) parser.add_argument('--origins', action='append', default=[]) -parser.add_argument( - '-c', - '--config', - metavar='CONFIG_FILE', - help="Read the configuration from 'CONFIG_FILE' instead of looking for it in the data store root directory. ", -) +#parser.add_argument( +# '-c', +# '--config', +# metavar='CONFIG_FILE', +# help="Read the configuration from 'CONFIG_FILE' instead of looking for it in the data store root directory. 
", +#) parser.add_argument( '--root-path', default='', @@ -241,38 +223,82 @@ if not store_path.exists(): raise SystemExit(1) -config_path = ( - Path(arguments.config).resolve() if arguments.config else store_path / config_file_name -) -if not config_path.exists(): - logger.error(f'Config file does not exist: {config_path}') - raise SystemExit(1) - - -process_config( - store_path=store_path, - config_file=config_path, - order_by=['pid'], - globals_dict=globals(), -) -g_instance_config = get_config() - - disable_installed_extensions_check() + app = FastAPI( title='Dump Things Service', description=description, version=__version__, openapi_tags=tag_info ) -app.include_router(collection_router) + app.include_router(curated_router) app.include_router(incoming_router) app.include_router(token_router) +app.include_router(collection_router) + +# Add CORS origins +app.add_middleware( + CORSMiddleware, + allow_origins=arguments.origins, + allow_credentials=True, + allow_methods=['*'], + allow_headers=['*'], +) + +# Add pagination +add_pagination(app) -def store_record( +#new_process_config( +# store_path=store_path, +# fastapi_app=app, +# order_by=['pid'], +# globals_dict=globals(), +#) +#g_instance_config = get_config() + + +g_instance_state = create_instance_state( + store_path=store_path, + bootstrap_token='admin-1', + fastapi_app=app, +) + + +g_configuration = read_config(store_path) + + +manifest_configuration( + configuration=g_configuration, + instance_state=g_instance_state, +) + +x = """ +create_store_endpoints( + app=app, + configuration=g_configuration, + tag_info=tag_info, + placeholder='placeholder_write', + global_dict=globals(), +) +create_validate_endpoints( + app=app, + configuration=g_configuration, + tag_info=tag_info, + placeholder='placeholder_validate', + global_dict=globals(), +) +""" + + +g_instance_state.fastapi_app.openapi_schema = None +g_instance_state.fastapi_app.setup() +add_pagination(g_instance_state.fastapi_app) + + +def xxx_store_record( 
collection: str, data: BaseModel | str, class_name: str, @@ -290,10 +316,10 @@ def store_record( status_code=HTTP_400_BAD_REQUEST, detail='Invalid ttl data provided.' ) - check_collection(g_instance_config, collection) + check_collection(g_instance_state, collection) token = ( - get_default_token_name(g_instance_config, collection) + get_default_token_name(g_instance_state, collection) if api_key is None else api_key ) @@ -303,13 +329,16 @@ def store_record( # the token, if the token is hashed. This is necessary because we do not # store the plaintext token, so all token-information is associated with # the hashed representation of the token. - store, token, token_permissions, user_id = get_token_store( - g_instance_config, + store, _, token_permissions, user_id = get_token_store( + g_instance_state, collection, token, ) final_permissions = join_default_token_permissions( - g_instance_config, token_permissions, collection + g_configuration, + g_instance_state, + token_permissions, + collection, ) if not final_permissions.incoming_write: raise HTTPException( @@ -320,7 +349,7 @@ def store_record( if input_format == Format.ttl: with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Conversion error'): json_object = FormatConverter( - g_instance_config.schemas[collection], + g_configuration.collections[collection].schema, input_format=Format.ttl, output_format=Format.json, ).convert(data, class_name) @@ -330,14 +359,14 @@ def store_record( record = data with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): - g_instance_config.validators[collection].validate(record) + g_instance_state.validators[collection].validate(record) with wrap_http_exception(CurieResolutionError): stored_records = store.store_object(obj=record, submitter=user_id) if input_format == Format.ttl: format_converter = FormatConverter( - g_instance_config.schemas[collection], + 
g_configuration.collections[collection].schema, input_format=Format.json, output_format=Format.ttl, ) @@ -357,64 +386,6 @@ def store_record( return JSONResponse([record for _, record in stored_records]) -def validate_record( - collection: str, - data: BaseModel | str, - class_name: str, - model: Any, - input_format: Format, - api_key: str | None = Depends(api_key_header_scheme), -) -> JSONResponse: - if input_format == Format.json and isinstance(data, str): - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail='Invalid JSON data provided.' - ) - - if input_format == Format.ttl and not isinstance(data, str): - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail='Invalid ttl data provided.' - ) - - check_collection(g_instance_config, collection) - - token = ( - get_default_token_name(g_instance_config, collection) - if api_key is None - else api_key - ) - - store, token, token_permissions, user_id = get_token_store( - g_instance_config, - collection, - token, - ) - final_permissions = join_default_token_permissions( - g_instance_config, token_permissions, collection - ) - if not final_permissions.incoming_write: - raise HTTPException( - status_code=HTTP_403_FORBIDDEN, - detail=f"Not authorized to validate records for collection '{collection}'.", - ) - - if input_format == Format.ttl: - with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Conversion error'): - json_object = FormatConverter( - g_instance_config.schemas[collection], - input_format=Format.ttl, - output_format=Format.json, - ).convert(data, class_name) - with wrap_http_exception(ValidationError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): - TypeAdapter(getattr(model, class_name)).validate_python(json_object) - else: - # Try to convert it into TTL to detect potential errors before storing - # the record - with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): - 
g_instance_config.validators[collection].validate(data) - - return JSONResponse(True) - - @app.get('/', response_class=RedirectResponse) async def root() -> RedirectResponse: return RedirectResponse('/docs') @@ -431,10 +402,10 @@ async def server() -> ServerResponse: collections = [ ServerCollectionResponse( name=collection_name, - schema=g_instance_config.schemas[collection_name], - classes=g_instance_config.model_info[collection_name][1], + schema=g_configuration.collections[collection_name].schema, + classes=g_instance_state.schema_info[g_configuration.collections[collection_name].schema].classes, ) - for collection_name in g_instance_config.collections + for collection_name in g_configuration.collections ] ) @@ -448,9 +419,6 @@ async def maintenance( body: MaintenanceRequest, api_key: str | None = Depends(api_key_header_scheme), ): - - - if api_key is None: raise HTTPException( status_code=HTTP_400_BAD_REQUEST, @@ -462,8 +430,8 @@ async def maintenance( # Try to authenticate the token with the authentication providers that # are associated with the collection. 
- check_collection(g_instance_config, collection) - auth_info = authenticate_token(g_instance_config, collection, api_key) + check_collection(g_configuration, collection) + auth_info = authenticate_token(g_instance_state, collection, api_key) permissions = auth_info.token_permission if not ( @@ -477,9 +445,9 @@ async def maintenance( ) if active: - g_instance_config.maintenance_mode.add(collection) + g_instance_state.maintenance_mode.add(collection) else: - g_instance_config.maintenance_mode.remove(collection) + g_instance_state.maintenance_mode.remove(collection) return @@ -494,10 +462,10 @@ async def read_record_with_pid( format: Format = Format.json, # noqa A002 api_key: str = Depends(api_key_header_scheme), ): - check_collection(g_instance_config, collection) + check_collection(g_configuration, collection) final_permissions, token_store = await process_token( - g_instance_config, api_key, collection + g_configuration, g_instance_state, api_key, collection ) class_name, json_object = None, None @@ -507,7 +475,7 @@ async def read_record_with_pid( if not json_object and final_permissions.curated_read: with wrap_http_exception(CurieResolutionError, header='CURIE error:'): - class_name, json_object = g_instance_config.curated_stores[ + class_name, json_object = g_instance_state.curated_stores[ collection ].get_object_by_pid(pid) @@ -516,7 +484,7 @@ async def read_record_with_pid( if format == Format.ttl: converter = FormatConverter( - schema=g_instance_config.schemas[collection], + schema=g_configuration.collections[collection].schema, input_format=Format.json, output_format=format, ) @@ -632,9 +600,9 @@ async def _read_all_records( detail=f'Conversion error: {e}', ) from e - check_collection(g_instance_config, collection) + check_collection(g_configuration, collection) final_permissions, token_store = await process_token( - g_instance_config, api_key, collection + g_configuration, g_instance_state, api_key, collection ) result_list = PriorityList() @@ -645,7 +613,7 
@@ async def _read_all_records( result_list.add_list(token_store_list) if final_permissions.curated_read: - curated_store_list = g_instance_config.curated_stores[ + curated_store_list = g_instance_state.curated_stores[ collection ].get_all_objects( matching=matching, @@ -660,7 +628,7 @@ async def _read_all_records( if format == Format.ttl: result_list = ConvertingList( result_list, - g_instance_config.schemas[collection], + g_configuration.collections[collection].schema, input_format=Format.json, output_format=format, exception_handler=convert_to_http_exception, @@ -687,16 +655,17 @@ async def _read_records_of_type( detail=f'Conversion error: {e}', ) from e - check_collection(g_instance_config, collection) - model = g_instance_config.model_info[collection][0] - if class_name not in g_instance_config.use_classes[collection]: + check_collection(g_configuration, collection) + schema_location = g_configuration.collections[collection].schema + model = g_instance_state.schema_info[schema_location].pydantic_module_info.module + if class_name not in g_instance_state.collections[collection].active_classes: raise HTTPException( status_code=HTTP_404_NOT_FOUND, detail=f"No '{class_name}'-class in collection '{collection}'.", ) final_permissions, token_store = await process_token( - g_instance_config, api_key, collection + g_configuration, g_instance_state, api_key, collection ) result_list = PriorityList() @@ -712,7 +681,7 @@ async def _read_records_of_type( if final_permissions.curated_read: for search_class_name in get_subclasses(model, class_name): - curated_store_list = g_instance_config.curated_stores[ + curated_store_list = g_instance_state.curated_stores[ collection ].get_objects_of_class( class_name=search_class_name, @@ -728,7 +697,7 @@ async def _read_records_of_type( if format == Format.ttl: result_list = ConvertingList( result_list, - g_instance_config.schemas[collection], + schema_location, input_format=Format.json, output_format=format, 
exception_handler=convert_to_http_exception, @@ -751,9 +720,9 @@ async def delete_record( pid: str, api_key: str = Depends(api_key_header_scheme), ): - check_collection(g_instance_config, collection) + check_collection(g_configuration, collection) final_permissions, token_store = await process_token( - g_instance_config, api_key, collection + g_configuration, g_instance_state, api_key, collection ) if not final_permissions.incoming_write: @@ -775,25 +744,12 @@ async def delete_record( # Create dynamic endpoints and rebuild the app to include all dynamically # created endpoints. -create_store_endpoints(app, g_instance_config, tag_info, 'placeholder_write', globals()) -create_validate_endpoints(app, g_instance_config, tag_info, 'placeholder_validate', globals()) -create_curated_endpoints(app, tag_info, 'placeholder_curated_write', globals()) -create_incoming_endpoints(app, tag_info, 'placeholder_incoming_write', globals()) -app.openapi_schema = None -app.setup() - - -# Add CORS origins -app.add_middleware( - CORSMiddleware, - allow_origins=arguments.origins, - allow_credentials=True, - allow_methods=['*'], - allow_headers=['*'], -) - -# Add pagination -add_pagination(app) +#create_store_endpoints(app, g_instance_config, tag_info, 'placeholder_write', globals()) +#create_validate_endpoints(app, g_instance_config, tag_info, 'placeholder_validate', globals()) +#create_curated_endpoints(app, tag_info, 'placeholder_curated_write', globals()) +#create_incoming_endpoints(app, tag_info, 'placeholder_incoming_write', globals()) +#app.openapi_schema = None +#app.setup() def main(): diff --git a/dump_things_service/manifest.py b/dump_things_service/manifest.py index d360bdb..f7e9ca4 100644 --- a/dump_things_service/manifest.py +++ b/dump_things_service/manifest.py @@ -1,5 +1,7 @@ import logging +from fastapi_pagination import add_pagination + from dump_things_service.abstract_config import ( Configuration, TokenConfig, @@ -21,7 +23,6 @@ def manifest_configuration( - create a 
`ModelStore`-instance with correct `Backend`-instances and check for compatibility with existing data - - create configured `AuthenticationSource`-instances - create schema-related objects - add schema class-specific http-endpoints to: -- validate records @@ -95,6 +96,16 @@ def manifest_configuration( if new_collection_names or deleted_collection_names: instance_state.fastapi_app.openapi_schema = None instance_state.fastapi_app.setup() + add_pagination(instance_state.fastapi_app) + + # We do not create any incoming areas for configuration-file tokens + # here. The reason is that the configuration does not fully determine + # the possible incoming areas because incoming areas come from + # authentication sources and the configuration-file authentication source + # is just one possible authentication source. Other authentication sources + # have unknown means to create incoming area labels. + # Incoming areas are therefore created when a write request for a token + # is authorized. def create_token( @@ -117,3 +128,7 @@ def delete_collection( collection_name: str, ): instance_state.collections.pop(collection_name) + # TODO: remove all collection-related information from + # instance_state. Maybe all collection-specific information + # should go into the instance_state.collection[x]-object!? + # That would make it easy to remove. 
diff --git a/dump_things_service/model.py b/dump_things_service/model.py index e8fb7c0..9155490 100644 --- a/dump_things_service/model.py +++ b/dump_things_service/model.py @@ -3,8 +3,8 @@ from __future__ import annotations import dataclasses # noqa F401 -- used by generated code import logging import sys +from functools import cache from itertools import count -from types import ModuleType from typing import ( TYPE_CHECKING, Any, @@ -35,10 +35,6 @@ lgr = logging.getLogger('dump_things_service') serial_number = count() _model_counter = count() -_model_cache = {} -_schema_model_cache = {} -_schema_view_cache = {} - # Pydantic module generation might require a higher recursion limit than the # default. Add a mechanism to increase it as needed, up to a maximum. @@ -58,6 +54,9 @@ def get_subclasses( class_name: str, ) -> list[str]: """get names of all subclasses (includes class_name itself)""" + + # TODO: this could also be implemented via SchemaView: + # return schema_view.class_children(class_name, mixins=False) super_class = getattr(model, class_name) return [ name @@ -66,6 +65,21 @@ def get_subclasses( ] +# TODO: shall we use the following code? +# The code below would use schema-definitions to determine classes and not +# go through the pydantic module generation. 
+@cache +def get_subclasses_2( + collection_name: str, + class_name: str, +) -> list[str]: + from dump_things_service.instance_state import get_instance_state + + instance_state = get_instance_state() + schema_view = instance_state.schema_info[collection_name].schema_view + return schema_view.class_children(class_name, mixins=False) + + def compile_module_with_increasing_recursion_limit( pydantic_generator: PydanticGenerator, schema_location: str, @@ -99,33 +113,29 @@ def compile_module_with_increasing_recursion_limit( return module +@cache def get_model_for_schema( schema_location: str, ) -> tuple[ModuleType, list[str], str]: - if schema_location not in _model_cache: - lgr.info(f'Building model for schema {schema_location}.') - pydantic_generator = PydanticGenerator(schema_location) - model = compile_module_with_increasing_recursion_limit( - pydantic_generator, - schema_location, - ) - classes = get_classes(model) - model_var_name = f'model_{next(_model_counter)}' - _model_cache[schema_location] = model, classes, model_var_name - return _model_cache[schema_location] + lgr.info(f'Building pydantic modulr for schema {schema_location}') + pydantic_generator = PydanticGenerator(schema_location) + model = compile_module_with_increasing_recursion_limit( + pydantic_generator, + schema_location, + ) + classes = get_classes(model) + model_var_name = f'model_{next(_model_counter)}' + return model, classes, model_var_name +@cache def get_schema_view(schema_location: str) -> SchemaView: - if schema_location not in _schema_view_cache: - _schema_view_cache[schema_location] = SchemaView(schema_location) - return _schema_view_cache[schema_location] + return SchemaView(schema_location) +@cache def get_schema_model_for_schema( schema_location: str, ) -> ModuleType: - if schema_location not in _schema_model_cache: - _schema_model_cache[schema_location] = PythonGenerator( - schema_location - ).compile_module() - return _schema_model_cache[schema_location] + lgr.info(f'Building python 
module for schema {schema_location}') + return PythonGenerator(schema_location).compile_module() diff --git a/dump_things_service/tests/create_store.py b/dump_things_service/tests/create_store.py index 2248fe6..5f038c0 100644 --- a/dump_things_service/tests/create_store.py +++ b/dump_things_service/tests/create_store.py @@ -4,18 +4,16 @@ from typing import TYPE_CHECKING import yaml +from dump_things_service.backends.record_dir import RecordDirStore from dump_things_service.backends.sqlite import ( SQLiteBackend, -) -from dump_things_service.backends.sqlite import ( record_file_name as sqlite_record_file_name, ) -from dump_things_service.config import ( - BackendConfigRecordDir, +from dump_things_service.abstract_config import ( + RecordDirBackendConfig, CollectionConfig, - GlobalConfig, + Configuration, MappingMethod, - config_file_name, mapping_functions, ) from dump_things_service.model import get_model_for_schema @@ -25,6 +23,8 @@ if TYPE_CHECKING: from pathlib import Path +config_file_name = '.dumpthings.yaml' + collection_config_template = """type: records version: 1 schema: {schema} @@ -58,21 +58,11 @@ faulty_yaml = ': : -: : :' def create_store( root_dir: Path, - config: GlobalConfig, + abstract_config: Configuration, per_collection_info: dict[str, tuple[str, str]], default_entries: dict[str, list[tuple[str, str, str]]] | None = None, ): - # Create the global config file - config_text = yaml.safe_dump( - config.model_dump(mode='json', exclude_none=True), - allow_unicode=True, - sort_keys=False, - ) - with open(root_dir / config_file_name, 'w') as f: - f.write(config_text) - - # Create all collection directories - for collection_name, collection_config in config.collections.items(): + for collection_name, collection_config in abstract_config.collections.items(): create_collection( root_dir=root_dir, collection_config=collection_config, @@ -102,7 +92,7 @@ def create_collection( curated_dir.mkdir(parents=True, exist_ok=True) if collection_config.backend is 
None: - collection_config.backend = BackendConfigRecordDir(type='record_dir+stl') + collection_config.backend = RecordDirBackendConfig(type='record_dir+stl') if collection_config.backend.type == 'record_dir+stl': # Add the collection level config file diff --git a/dump_things_service/tests/fixtures.py b/dump_things_service/tests/fixtures.py index 491553e..8f9530e 100644 --- a/dump_things_service/tests/fixtures.py +++ b/dump_things_service/tests/fixtures.py @@ -1,11 +1,30 @@ import sys -from pathlib import Path +from pathlib import ( + Path, + PurePosixPath, +) +from types import ModuleType import pytest import yaml -from dump_things_service import config_file_name -from dump_things_service.config import GlobalConfig +from dump_things_service.abstract_config import ( + CollectionConfig, + GitAuditBackendConfig, + SQLiteBackendConfig, + TokenCollectionConfig, + TokenModes, +) +from dump_things_service.backends import StorageBackend +from dump_things_service.backends.record_dir import RecordDirStore +from dump_things_service.backends.sqlite import ( + SQLiteBackend, + record_file_name as sqlite_db_filename, +) +from dump_things_service.instance_state import get_mapping_function_by_name +from dump_things_service.model import get_model_for_schema +from dump_things_service.resolve_curie import resolve_curie +from dump_things_service.token_endpoints import TokenRequest from dump_things_service.tests.create_store import ( create_store, pid, @@ -16,258 +35,302 @@ from dump_things_service.tests.create_store import ( test_record_trr, ) + # String representation of curated- and incoming-path curated = 'curated' incoming = 'incoming' # Path to a local simple test schema -schema_path = Path(__file__).parent / 'testschema.yaml' +test_schema_location = str((Path(__file__).parent / 'testschema.yaml').absolute()) +flat_social_schema_location = 'https://concepts.datalad.org/s/flat-social/unreleased.yaml' -# The global configuration file, all collections and -# staging areas share the 
same directories. All tokens -# of the same collection share an "incoming_label". -global_config_text = f""" -type: collections -version: 1 -collections: - collection_1: - default_token: basic_access - curated: {curated}/in_token_1 - incoming: {incoming} - backend: - type: record_dir+stl - auth_sources: - - type: config - submission_tags: - submitter_id_tag: oxo:NCIT_C54269 - submission_time_tag: https://time - audit_backends: - - type: gitaudit - path: {{audit_store_path}} - auto_flush_timeout: 2 - collection_2: - default_token: basic_access - curated: {curated}/collection_2 - incoming: incoming_2 - backend: - type: record_dir+stl - collection_3: - default_token: basic_access - curated: {curated}/collection_3 - incoming: incoming_3 - backend: - type: record_dir+stl - collection_4: - default_token: basic_access - curated: {curated}/collection_4 - incoming: incoming_4 - backend: - type: record_dir+stl - collection_5: - default_token: basic_access - curated: {curated}/collection_5 - incoming: incoming_5 - backend: - type: record_dir+stl - collection_6: - default_token: basic_access - curated: {curated}/collection_6 - incoming: incoming_6 - backend: - type: record_dir+stl - collection_7: - default_token: basic_access - curated: {curated}/collection_7 - incoming: incoming_7 - backend: - type: record_dir+stl - collection_8: - default_token: basic_access - curated: {curated}/collection_8 - incoming: incoming_8 - backend: - type: sqlite - schema: {schema_path} - collection_dlflatsocial-1: - default_token: basic_access - curated: {curated}/collection_dlflatsocial-1 - incoming: {incoming}/collection_dlflatsocial-1 - backend: - type: record_dir+stl - collection_dlflatsocial-2: - default_token: basic_access - curated: {curated}/collection_dlflatsocial-2 - incoming: {incoming}/collection_dlflatsocial-2 - backend: - type: sqlite - schema: https://concepts.datalad.org/s/flat-social/unreleased.yaml - use_classes: - - Organization - - Person - - Project - ignore_classes: - - 
Organization - - Project +# The test store is created empty and collections are added via the admin +# web interface. +g_default_collections = [ + CollectionConfig( + name=f'collection_{i}', + default_token='test_default_token', + curated=PurePosixPath(f'{curated}/collection_{i}'), + schema=test_schema_location, + incoming=PurePosixPath(f'{incoming}/collection_{i}'), + ) + for i in range(1, 8) +] -tokens: - basic_access: - user_id: anonymous - collections: - collection_1: - mode: READ_CURATED - incoming_label: '' - collection_2: - mode: READ_CURATED - incoming_label: '' - collection_3: - mode: READ_CURATED - incoming_label: '' - collection_4: - mode: READ_CURATED - incoming_label: '' - collection_5: - mode: READ_CURATED - incoming_label: '' - collection_6: - mode: READ_CURATED - incoming_label: '' - collection_7: - mode: READ_CURATED - incoming_label: '' - collection_8: - mode: READ_CURATED - incoming_label: '' - collection_dlflatsocial-1: - mode: READ_CURATED - incoming_label: '' - collection_dlflatsocial-2: - mode: READ_CURATED - incoming_label: '' - cmo-33b726a7e2b9eaf1f8f124049822ade31cb6516a4d8221634b01d13d793bfe16: - hashed: True - user_id: cmo - collections: - collection_1: - mode: WRITE_COLLECTION - incoming_label: cmo - # The plaintext of the following is `token-1`: - token-6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b: - hashed: True - user_id: test_user_1 - collections: - collection_1: - mode: WRITE_COLLECTION - incoming_label: in_token_1 - collection_dlflatsocial-1: - mode: WRITE_COLLECTION - incoming_label: in_token_1 - collection_dlflatsocial-2: - mode: WRITE_COLLECTION - incoming_label: in_token_1 - token_1_xxooo: - user_id: test_user_1_read_collection - collections: - collection_1: - mode: READ_COLLECTION - incoming_label: modes - token_1_xxxoo: - user_id: test_user_1_write_collection - collections: - collection_1: - mode: WRITE_COLLECTION - incoming_label: modes - token_1_oxooo: - user_id: test_user_1_read_submissions - 
collections: - collection_1: - mode: READ_SUBMISSIONS - incoming_label: modes - token_1_oxxoo: - user_id: test_user_1_write_submissions - collections: - collection_1: - mode: WRITE_SUBMISSIONS - incoming_label: modes - token_1_xoxoo: - user_id: test_user_1_submit - collections: - collection_1: - mode: SUBMIT - incoming_label: modes - token_1_ooxoo: - user_id: test_user_1_submit_only - collections: - collection_1: - mode: SUBMIT_ONLY - incoming_label: modes - token_1_ooxoo: - user_id: test_user_1_submit_only - collections: - collection_1: - mode: SUBMIT_ONLY - incoming_label: modes - token_1_xoooo: - user_id: test_user_1_read_curated - collections: - collection_1: - mode: READ_CURATED - incoming_label: modes - token_1_ooooo: - user_id: test_user_1_nothing - collections: - collection_1: - mode: NOTHING - incoming_label: modes - token_1_xxxxx: - user_id: test_user_1_curated - collections: - collection_1: - mode: CURATOR - incoming_label: modes - collection_8: - mode: CURATOR - incoming_label: modes - token_admin: - user_id: test_admin - collections: - collection_1: - mode: CURATOR - incoming_label: admin_1 - collection_2: - mode: CURATOR - incoming_label: admin_2 - collection_3: - mode: CURATOR - incoming_label: admin_3 - collection_4: - mode: CURATOR - incoming_label: admin_4 - collection_5: - mode: CURATOR - incoming_label: admin_common - collection_6: - mode: CURATOR - incoming_label: admin_common - collection_7: - mode: CURATOR - incoming_label: admin_common - collection_8: - mode: CURATOR - incoming_label: admin_common - token-2: - user_id: test_user_2 - collections: - collection_2: - mode: WRITE_COLLECTION - incoming_label: in_token-2 - token-8: - user_id: test_user_8 - collections: - collection_8: - mode: WRITE_COLLECTION - incoming_label: test_user_8 -""" +g_default_collections.append( + CollectionConfig( + name=f'collection_8', + default_token='test_default_token', + curated=PurePosixPath(f'{curated}/collection_8'), + schema=test_schema_location, + 
incoming=PurePosixPath(f'{incoming}/collection_8'), + backend=SQLiteBackendConfig( + type='sqlite', + schema=test_schema_location, + ) + ) +) + +g_default_collections.extend([ + CollectionConfig( + name='collection_dlflatsocial-1', + schema=flat_social_schema_location, + default_token='test_default_token', + curated=PurePosixPath(f'{curated}/collection_dlflatsocial-1'), + incoming=PurePosixPath(f'{incoming}/collection_dlflatsocial-1'), + ), + CollectionConfig( + name='collection_dlflatsocial-2', + schema=flat_social_schema_location, + default_token='test_default_token', + curated=PurePosixPath(f'{curated}/collection_dlflatsocial-2'), + incoming=PurePosixPath(f'{incoming}/collection_dlflatsocial-2'), + backend=SQLiteBackendConfig( + type='sqlite', + schema='https://concepts.datalad.org/s/flat-social/unreleased.yaml', + ), + use_classes=[ + 'Organization', + 'Person', + 'Project', + ], + ignore_classes=[ + 'Organization', + 'Project', + ], + ), +]) + +g_default_tokens = [ + TokenRequest( + name='test_default_token', + user_id='basic_access_user', + hashed=False, + representation='basic_access', + collections={ + **{ + f'collection_{i}': TokenCollectionConfig( + mode=TokenModes.READ_CURATED, + ) + for i in range(1, 9) + }, + **{ + f'collection_dlflatsocial-{i}': TokenCollectionConfig( + mode=TokenModes.READ_CURATED, + ) + for i in range(1, 3) + }, + }, + ), + TokenRequest( + name='Test token for some collections', + user_id='test_user_1', + hashed=False, + representation='token-1', + collections={ + collection_name: TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + incoming_label='in_token_1', + ) + for collection_name in ( + 'collection_1', + 'collection_dlflatsocial-1', + 'collection_dlflatsocial-2', + ) + }, + ), + TokenRequest( + name='Test token for collection_2', + user_id='test_user_2', + hashed=False, + representation='token-2', + collections={ + f'collection_2': TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + 
incoming_label='in_token-2', + ) + }, + ), + TokenRequest( + name='Test token for collection_8', + user_id='test_user_8', + hashed=False, + representation='token-8', + collections={ + f'collection_8': TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + incoming_label='test_user_8', + ) + }, + ), + TokenRequest( + name='Test token for all collections', + user_id='user_all', + hashed=False, + representation='token-all', + collections={ + **{ + f'collection_{i}': TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + incoming_label='token-all:user_all', + ) + for i in range(1, 9) + }, + **{ + f'collection_dlflatsocial-{i}': TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + incoming_label='token-all:user_all', + ) + for i in range(1, 3) + }, + }, + ), + TokenRequest( + name='Test Curator Token', + user_id='test_curator', + representation='token_curator', + collections={ + f'collection_{i}': TokenCollectionConfig( + mode=TokenModes.CURATOR, + incoming_label=f'admin_{i}' if i < 5 else 'admin_common', + ) + for i in range(1, 9) + }, + ), + TokenRequest( + name='Test Hashed Token', + user_id='test_hashed', + representation='token-hashed', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + incoming_label='token-hashed-1', + ), + }, + ), + TokenRequest( + name='Test XX000 (READ_COLLECTION)', + user_id='test_user_1_read_collection', + representation='token_1_xxooo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.READ_COLLECTION, + incoming_label='modes', + ), + }, + ), + TokenRequest( + name='Test XXX00 (WRITE_COLLECTION)', + user_id='test_user_1_write_collection', + representation='token_1_xxxoo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + incoming_label='modes', + ), + } + ), + TokenRequest( + name='Test 0X000 (READ_SUBMISSIONS)', + user_id='test_user_1_read_submissions', + representation='token_1_oxooo', + collections={ + 
'collection_1': TokenCollectionConfig( + mode=TokenModes.READ_SUBMISSIONS, + incoming_label='modes', + ), + }, + ), + TokenRequest( + name='Test 0XX00 (WRITE_SUBMISSIONS)', + user_id='test_user_1_write_submissions', + representation='token_1_oxxoo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.WRITE_SUBMISSIONS, + incoming_label='modes', + ), + }, + ), + TokenRequest( + name='Test X0X00 (SUBMIT)', + user_id='test_user_1_submit', + representation='token_1_xoxoo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.SUBMIT, + incoming_label='modes', + ), + }, + ), + TokenRequest( + name='Test 00X00 (SUBMIT_ONLY)', + user_id='test_user_1_submit_only', + representation='token_1_ooxoo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.SUBMIT_ONLY, + incoming_label='modes', + ), + }, + ), + TokenRequest( + name='Test X0000 (READ_CURATED)', + user_id='test_user_1_read_curated', + representation='token_1_xoooo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.READ_CURATED, + incoming_label='modes', + ), + }, + ), + TokenRequest( + name='Test 00000 (NOTHING)', + user_id='test_user_1_nothing', + representation='token_1_ooooo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.NOTHING, + incoming_label='modes', + ), + }, + ), + TokenRequest( + name='Test XXXXX (CURATOR)', + user_id='test_user_1_curator', + representation='token_1_xxxxx', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.CURATOR, + incoming_label='modes', + ), + 'collection_8': TokenCollectionConfig( + mode=TokenModes.CURATOR, + incoming_label='modes', + ), + }, + ), +] + +g_default_entries = { + f'collection_{i}': [('Person', pid, test_record)] for i in range(1, 9) +} +for collection_id in range(1, 9): + g_default_entries[f'collection_{collection_id}'].extend( + [ + ('Person', pid_curated, test_record_curated), + ( + 'Person', + 'abc:mode_test', + 'pid: 
abc:mode_test\ngiven_name: mode_curated\nschema_type: abc:Person\n', + ), + ] + ) + +g_default_entries['collection_dlflatsocial-1'] = [('Person', pid_trr, test_record_trr)] +g_default_entries['collection_dlflatsocial-2'] = [('Person', pid_trr, test_record_trr)] @pytest.fixture(scope='session') @@ -275,8 +338,9 @@ def dump_stores_simple(tmp_path_factory): tmp_path = tmp_path_factory.mktemp('dump_store') audit_store_path = tmp_path_factory.mktemp('audit_store') - final_config_text = global_config_text.format(audit_store_path=str(audit_store_path)) - (tmp_path / config_file_name).write_text(final_config_text) + return tmp_path, audit_store_path + #final_config_text = global_config_text.format(audit_store_path=str(audit_store_path)) + #(tmp_path / config_file_name).write_text(final_config_text) default_entries = { f'collection_{i}': [('Person', pid, test_record)] for i in range(1, 9) @@ -294,18 +358,19 @@ def dump_stores_simple(tmp_path_factory): ) default_entries['collection_dlflatsocial-1'] = [('Person', pid_trr, test_record_trr)] default_entries['collection_dlflatsocial-2'] = [('Person', pid_trr, test_record_trr)] + create_store( root_dir=tmp_path, - config=GlobalConfig(**yaml.safe_load(final_config_text)), + abstract_config=GlobalConfig(**yaml.safe_load(final_config_text)), per_collection_info={ - 'collection_1': (str(schema_path), 'digest-md5'), - 'collection_2': (str(schema_path), 'digest-md5-p3'), - 'collection_3': (str(schema_path), 'digest-sha1'), - 'collection_4': (str(schema_path), 'digest-sha1-p3'), - 'collection_5': (str(schema_path), 'after-last-colon'), - 'collection_6': (str(schema_path), 'digest-md5-p3-p3'), - 'collection_7': (str(schema_path), 'digest-sha1-p3-p3'), - 'collection_8': (str(schema_path), 'digest-md5'), + 'collection_1': (str(test_schema_location), 'digest-md5'), + 'collection_2': (str(test_schema_location), 'digest-md5-p3'), + 'collection_3': (str(test_schema_location), 'digest-sha1'), + 'collection_4': (str(test_schema_location), 
'digest-sha1-p3'), + 'collection_5': (str(test_schema_location), 'after-last-colon'), + 'collection_6': (str(test_schema_location), 'digest-md5-p3-p3'), + 'collection_7': (str(test_schema_location), 'digest-sha1-p3-p3'), + 'collection_8': (str(test_schema_location), 'digest-md5'), 'collection_dlflatsocial-1': ( 'https://concepts.datalad.org/s/flat-social/unreleased.yaml', 'digest-md5', @@ -322,16 +387,85 @@ def dump_stores_simple(tmp_path_factory): @pytest.fixture(scope='session') def fastapi_app_simple(dump_stores_simple): + tmp_path, audit_tmp_path = dump_stores_simple + old_sys_argv = sys.argv - sys.argv = ['test-runner', str(dump_stores_simple)] + sys.argv = ['test-runner', str(tmp_path)] from dump_things_service.main import app sys.argv = old_sys_argv - return app, dump_stores_simple + return app, tmp_path, audit_tmp_path @pytest.fixture(scope='session') def fastapi_client_simple(fastapi_app_simple): from fastapi.testclient import TestClient - return TestClient(fastapi_app_simple[0]), fastapi_app_simple[1] + test_client = TestClient(fastapi_app_simple[0]) + store_path = fastapi_app_simple[1] + audit_path = fastapi_app_simple[2] + + # Add an audit backend to the first collection in g_default_collections + assert g_default_collections[0].name == 'collection_1' + g_default_collections[0].audit_backends = [ + GitAuditBackendConfig( + type='gitaudit', + path=Path(audit_path), + auto_flush_timeout=2, + ) + ] + + # Add collections via the Web-API + for collection_config in g_default_collections: + response = test_client.post( + '/collections', + json=collection_config.model_dump(exclude_unset=True, mode='json'), + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == 201 + + # Add tokens via Web-API + for token_config in g_default_tokens: + response = test_client.post( + '/tokens', + json=token_config.model_dump(exclude_unset=True, mode='json'), + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == 201 + + # Add 
default content via backend instances + for collection_config in g_default_collections: + curated_path = Path(store_path / collection_config.curated) + backend_config = collection_config.backend + if backend_config.type.startswith('sqlite'): + backend = SQLiteBackend(curated_path / sqlite_db_filename) + else: + backend = RecordDirStore( + curated_path, + pid_mapping_function=get_mapping_function_by_name( + backend_config.mapping_method, + ), + suffix='yaml', + ) + pydantic_module = get_model_for_schema(collection_config.schema)[0] + add_records_to_backend( + backend, + pydantic_module, + g_default_entries[collection_config.name], + ) + return test_client, store_path + + +def add_records_to_backend( + backend: StorageBackend, + pydantic_module: ModuleType, + record_infos: list[tuple[str, str, str]], +): + for class_name, record_pid, yaml_stream in record_infos: + json_object = yaml.load(yaml_stream, Loader=yaml.SafeLoader ) + assert record_pid == json_object['pid'] + backend.add_record( + iri=resolve_curie(pydantic_module, json_object['pid']), + class_name=class_name, + json_object=json_object, + ) diff --git a/dump_things_service/tests/test_auth.py b/dump_things_service/tests/test_auth.py index e63438d..efba9be 100644 --- a/dump_things_service/tests/test_auth.py +++ b/dump_things_service/tests/test_auth.py @@ -4,8 +4,8 @@ import json import pytest +from dump_things_service.abstract_config import TokenPermission from dump_things_service.auth.forgejo import ForgejoAuthenticationSource -from dump_things_service.token import TokenPermission user_1 = { 'id': 1, diff --git a/dump_things_service/tests/test_basic.py b/dump_things_service/tests/test_basic.py index 31068b1..02348bd 100644 --- a/dump_things_service/tests/test_basic.py +++ b/dump_things_service/tests/test_basic.py @@ -45,7 +45,7 @@ def test_search_by_pid(fastapi_client_simple): for i in range(1, 9): response = test_client.get( f'/collection_{i}/record?pid={pid}', - headers={'x-dumpthings-token': 
'basic_access'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK assert response.json() == { @@ -116,7 +116,7 @@ def test_hashed_token(fastapi_client_simple): test_client, _ = fastapi_client_simple response = test_client.get( f'/collection_1/record?pid={pid}', - headers={'x-dumpthings-token': 'cmo-cmo'}, + headers={'x-dumpthings-token': 'token-hashed'}, ) assert response.status_code == HTTP_200_OK assert response.json() == { @@ -124,9 +124,10 @@ def test_hashed_token(fastapi_client_simple): 'pid': pid, 'given_name': given_name, } + # Ensure that the hashed token value is not compared verbatim response = test_client.get( f'/collection_1/record?pid={pid}', - headers={'x-dumpthings-token': 'cmo-33b726a7e2b9eaf1f8f124049822ade31cb6516a4d8221634b01d13d793bfe16'}, + headers={'x-dumpthings-token': '25d3fc9469f4971012815cb3ab8f9db3f50c0d63'}, ) assert response.status_code == HTTP_401_UNAUTHORIZED @@ -136,7 +137,7 @@ def test_search_by_class(fastapi_client_simple): for i in range(1, 9): response = test_client.get( f'/collection_{i}/records/Thing', - headers={'x-dumpthings-token': 'basic_access'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK json_result = response.json() @@ -213,13 +214,10 @@ def test_store_record(fastapi_client_simple): f'/collection_{i}/records/Person', headers={'x-dumpthings-token': 'basic_access'}, ) - assert response.json() == [ - { - 'schema_type': 'abc:Person', - 'pid': pid, - 'given_name': given_name, - } - ] + # The following assertion works because the stored records don't contain + # annotations. If they would, the result would have to be cleaned + # before the containment-check. + assert extra_record not in response.json() # Check that subclasses are retrieved for i, token in basic_write_locations: @@ -280,7 +278,8 @@ def test_global_store_write_fails(fastapi_client_simple): # Since we provide no token, the default token will be used. 
This will # only allow reading from curated, not posting. response = test_client.post( - f'/collection_{i}/record/Person', json={'pid': extra_record['pid']} + f'/collection_{i}/record/Person', + json={'pid': extra_record['pid']}, ) assert response.status_code == HTTP_403_FORBIDDEN @@ -438,13 +437,13 @@ def test_ignore_classes(fastapi_client_simple): for class_name in ('Organization', 'Project'): response = test_client.post( f'/collection_dlflatsocial-1/record/{class_name}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json={'pid': f'dlflatsocial:c_{class_name}'}, ) assert response.status_code == HTTP_200_OK response = test_client.post( f'/collection_dlflatsocial-2/record/{class_name}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json={'pid': f'dlflatsocial:c_{class_name}'}, ) assert response.status_code == HTTP_404_NOT_FOUND @@ -464,14 +463,14 @@ def test_maintenance(fastapi_client_simple): # Ensure unknown collections are caught in maintenance mode response = test_client.post( '/maintenance', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, json={'collection': 'collection_x', 'active': True}, ) assert response.status_code == HTTP_404_NOT_FOUND response = test_client.post( '/maintenance', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, json={'collection': 'collection_1', 'active': True}, ) @@ -491,7 +490,7 @@ def test_maintenance(fastapi_client_simple): # Deactivate maintenance mode response = test_client.post( '/maintenance', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, json={'collection': 'collection_1', 'active': False}, ) assert response.status_code == HTTP_200_OK diff --git a/dump_things_service/tests/test_config.py b/dump_things_service/tests/test_config.py index 011ff73..0f08aac 100644 --- 
a/dump_things_service/tests/test_config.py +++ b/dump_things_service/tests/test_config.py @@ -4,15 +4,10 @@ import yaml from pydantic import ValidationError from yaml.scanner import ScannerError -from dump_things_service.config import ( - ConfigError, - GlobalConfig, - process_config, - process_config_object, -) +from dump_things_service.exceptions import ConfigError -def test_scanner_error_detection(tmp_path): +def xxx_test_scanner_error_detection(tmp_path): config_file_path = tmp_path / 'config.yaml' config_file_path.write_text('type: col: le\n:xxx:') global_dict = {} @@ -21,7 +16,7 @@ def test_scanner_error_detection(tmp_path): assert isinstance(e.value.__cause__, ScannerError) -def test_structure_error_detection(tmp_path): +def xxx_test_structure_error_detection(tmp_path): config_file_path = tmp_path / 'config.yaml' config_file_path.write_text('type: colle\n') global_dict = {} @@ -30,7 +25,7 @@ def test_structure_error_detection(tmp_path): assert isinstance(e.value.__cause__, ValidationError) -def test_missing_incoming_detection(tmp_path): +def xxx_test_missing_incoming_detection(tmp_path): config_object = GlobalConfig( **yaml.load( """ @@ -58,7 +53,7 @@ tokens: process_config_object(tmp_path, config_object, [], global_dict) -def test_submission_tags_handling(dump_stores_simple): +def xxx_test_submission_tags_handling(dump_stores_simple): config_object = GlobalConfig( **yaml.load( """ @@ -101,7 +96,7 @@ tokens: assert config.collections['collection_2'].submission_tags.submitter_id_tag == 'http://purl.obolibrary.org/obo/NCIT_C54269' -def test_submission_tags_resolving(dump_stores_simple): +def xxx_test_submission_tags_resolving(dump_stores_simple): config_object = GlobalConfig( **yaml.load( """ @@ -131,7 +126,7 @@ tokens: process_config_object(dump_stores_simple, config_object, [], global_dict) -def test_submission_tags_resolving_error(dump_stores_simple): +def xxx_test_submission_tags_resolving_error(dump_stores_simple): config_object = GlobalConfig( 
**yaml.load( """ diff --git a/dump_things_service/tests/test_curated.py b/dump_things_service/tests/test_curated.py index 7eebee2..5944a3b 100644 --- a/dump_things_service/tests/test_curated.py +++ b/dump_things_service/tests/test_curated.py @@ -9,7 +9,8 @@ from dump_things_service import ( HTTP_200_OK, HTTP_404_NOT_FOUND, ) -from dump_things_service.config import get_config +from dump_things_service.instance_state import get_instance_state + delete_record = { 'schema_type': 'abc:Person', @@ -120,8 +121,8 @@ def test_audit_backend(fastapi_client_simple): record_id = 'abc:audit-trailed' names = 'Frederick', 'Johny' - tokens = 'token_1_xxxxx', 'token_admin' - user_names = 'test_user_1_curated', 'test_admin' + tokens = 'token_1_xxxxx', 'token_curator' + user_names = 'test_user_1_curator', 'test_curator' json_objects = tuple( { 'schema_type': 'abc:Person', @@ -139,8 +140,8 @@ def test_audit_backend(fastapi_client_simple): ) assert response.status_code == HTTP_200_OK - config_instance = get_config() - audit_backend = config_instance.audit_backends['collection_1'][0] + instance_state = get_instance_state() + audit_backend = instance_state.audit_backends['collection_1'][0] changes = audit_backend.get_audit_log(record_id) assert len(changes) == 2 @@ -156,8 +157,7 @@ def test_audit_backend_auto_flush(fastapi_client_simple): record_id = 'abc:audit-trailed' names = 'Robert', 'Anton' - tokens = 'token_1_xxxxx', 'token_admin' - user_names = 'test_user_1_curated', 'test_admin' + tokens = 'token_1_xxxxx', 'token_curator' json_objects = tuple( { 'schema_type': 'abc:Person', @@ -175,8 +175,8 @@ def test_audit_backend_auto_flush(fastapi_client_simple): ) assert response.status_code == HTTP_200_OK - config_instance = get_config() - audit_backend = config_instance.audit_backends['collection_1'][0] + instance_state = get_instance_state() + audit_backend = instance_state.audit_backends['collection_1'][0] assert audit_backend.current_change_set, 'expected unpersisted changes in audit 
log' diff --git a/dump_things_service/tests/test_extract_inline.py b/dump_things_service/tests/test_extract_inline.py index 4928dc7..2f9a128 100644 --- a/dump_things_service/tests/test_extract_inline.py +++ b/dump_things_service/tests/test_extract_inline.py @@ -218,7 +218,7 @@ def test_inline_extraction_on_service(fastapi_client_simple): # Deposit JSON record response = test_client.post( f'/collection_dlflatsocial-{i}/record/Person', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json=inlined_json_record, ) assert response.status_code == HTTP_200_OK, 'Response content:' + response.text @@ -231,7 +231,7 @@ def test_inline_extraction_on_service(fastapi_client_simple): for record_pid in (entry[0] for entry in tree): response = test_client.get( f'/collection_dlflatsocial-{i}/record?pid={record_pid}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK records.append(response.json()) @@ -247,7 +247,7 @@ def test_inline_extraction_on_service(fastapi_client_simple): ): records = test_client.get( f'/collection_dlflatsocial-{i}/records/{class_name}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ).json() for pid in pids: assert any(record['pid'] == pid for record in records) @@ -264,7 +264,7 @@ def test_inline_ttl_processing(fastapi_client_simple): for class_name, ttl_record in ttls_with_inline: response = test_client.post( f'/collection_dlflatsocial-{i}/record/{class_name}?format=ttl', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json=ttl_record, ) assert response.status_code == HTTP_200_OK @@ -275,7 +275,7 @@ def test_inline_ttl_processing(fastapi_client_simple): for record_pid in (entry[0] for entry in ttl_tree): response = test_client.get( f'/collection_dlflatsocial-{i}/record?pid={record_pid}', - headers={'x-dumpthings-token': 'token-1'}, + 
headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK records.append(response.json()) @@ -291,7 +291,7 @@ def test_inline_ttl_processing(fastapi_client_simple): ): records = test_client.get( f'/collection_dlflatsocial-{i}/records/{class_name}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ).json() for pid in pids: assert any(record['pid'] == pid for record in records) @@ -329,7 +329,7 @@ def test_dont_extract_empty_things_on_service(fastapi_client_simple): # Deposit JSON record response = test_client.post( f'/collection_dlflatsocial-{i}/record/Person', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json=empty_inlined_json_record, ) assert response.status_code == HTTP_200_OK diff --git a/dump_things_service/tests/test_incoming.py b/dump_things_service/tests/test_incoming.py index f6efb05..68b3fb0 100644 --- a/dump_things_service/tests/test_incoming.py +++ b/dump_things_service/tests/test_incoming.py @@ -22,7 +22,7 @@ def test_incoming_labels(fastapi_client_simple): for i in range(1, 9): response = test_client.get( f'/collection_{i}/incoming/', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) existing_labels = response.json() assert len(existing_labels) >= 1 @@ -46,7 +46,7 @@ def fill_zones(test_client): (7, 'admin_common'), (8, 'admin_common'), ): - token = 'token_admin' + token = 'token_curator' result = test_client.post( f'/collection_{collection_id}/incoming/{label}/record/Person', headers={'x-dumpthings-token': token}, @@ -63,16 +63,16 @@ def fill_zones(test_client): @pytest.mark.parametrize('paginate', ('', 'p/')) @pytest.mark.parametrize('class_name', ('', 'Person')) def test_read_incoming_records( - fastapi_client_simple, - paginate: str, - class_name: str, + fastapi_client_simple, + paginate: str, + class_name: str, ): test_client, _ = fastapi_client_simple fill_zones(test_client) 
for collection_id, labels in ( - (1, ['modes', 'admin_1', 'cmo', 'in_token_1']), + (1, ['modes', 'admin_1', 'in_token_1']), (2, ['in_token-2', 'admin_2']), (3, ['admin_3']), (4, ['admin_4']), @@ -85,9 +85,9 @@ def test_read_incoming_records( for label in labels: response = test_client.get( f'/collection_{collection_id}/incoming/{label}/records/{paginate}{class_name}', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) - assert response.status_code == HTTP_200_OK + assert response.status_code == HTTP_200_OK, f'failed on collection: {collection_id}, label: {label}, class: {class_name}' # We don't know the exact number of entries in each zone, because # it depends on the tests that ran before. @@ -96,10 +96,10 @@ def test_read_incoming_records( expected_length = 0 if label.startswith('admin_'): expected_length = 1 - pattern = f'abc:test_incoming-collection_{collection_id}-token_admin' + pattern = f'abc:test_incoming-collection_{collection_id}-token_curator' response = test_client.get( f'/collection_{collection_id}/incoming/{label}/records/{paginate}{class_name}?matching={pattern}', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_200_OK json_object = response.json() @@ -134,7 +134,7 @@ def test_incoming_unknown_collection(fastapi_client_simple): response = test_client.get( '/no_such_collection/incoming/no_such_label/records/', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_404_NOT_FOUND @@ -144,7 +144,7 @@ def test_incoming_unknown_label(fastapi_client_simple): response = test_client.get( '/collection_1/incoming/no_such_label/records/', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_404_NOT_FOUND @@ -154,35 +154,35 @@ def 
test_incoming_delete(fastapi_client_simple): response = test_client.post( '/collection_7/incoming/admin_common/record/Person', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, json=delete_record, ) assert response.status_code == HTTP_200_OK response = test_client.get( '/collection_7/incoming/admin_common/record?pid=abc:delete-me', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_200_OK assert response.json()['pid'] == 'abc:delete-me' response = test_client.delete( '/collection_7/incoming/admin_common/record?pid=abc:delete-me', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_200_OK assert response.json() is True response = test_client.get( '/collection_7/incoming/admin_common/record?pid=abc:delete-me', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_200_OK assert response.json() is None response = test_client.delete( '/collection_7/incoming/admin_common/record?pid=abc:delete-me', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_404_NOT_FOUND @@ -193,11 +193,11 @@ def test_incoming_on_disk_only(fastapi_client_simple): # add a random directory to the incoming area of collection_1 random_part = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=6)) dir_name = f'random_{random_part}' - (data_root / 'incoming' / dir_name).mkdir() + (data_root / 'incoming' / 'collection_1' / dir_name).mkdir() response = test_client.get( '/collection_1/incoming/', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_200_OK assert dir_name in response.json() diff --git 
a/dump_things_service/tests/test_pid_resolution.py b/dump_things_service/tests/test_pid_resolution.py index 4fe2a69..05913eb 100644 --- a/dump_things_service/tests/test_pid_resolution.py +++ b/dump_things_service/tests/test_pid_resolution.py @@ -24,7 +24,7 @@ def test_store_record_curated_with_unresolvable_pid(fastapi_client_simple): # Store a record in two collections response = test_client.post( f'/collection_1/curated/record/Person', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, json={'pid': 'unknown_prefix:test_pid'}, ) assert response.status_code == HTTP_422_UNPROCESSABLE_CONTENT @@ -36,7 +36,7 @@ def test_store_record_incoming_with_unresolvable_pid(fastapi_client_simple): # Store a record in two collections response = test_client.post( f'/collection_1/incoming/in_token_1/record/Person', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, json={'pid': 'unknown_prefix:test_pid'}, ) assert response.status_code == HTTP_422_UNPROCESSABLE_CONTENT diff --git a/dump_things_service/tests/test_roundtrip.py b/dump_things_service/tests/test_roundtrip.py index f324fd4..bc54a46 100644 --- a/dump_things_service/tests/test_roundtrip.py +++ b/dump_things_service/tests/test_roundtrip.py @@ -26,7 +26,7 @@ ttl_result_record_a = """@prefix abc: . xyz:HenryAdams a abc:Person ; abc:annotations [ a abc:Annotation ; - abc:annotation_tag ; + abc:annotation_tag ; abc:annotation_value "1970-01-01T00:00:00" ], [ a abc:Annotation ; abc:annotation_tag oxo:NCIT_C54269 ; @@ -44,7 +44,7 @@ xyz:HenryAdams a abc:Person ; abc:annotation_tag oxo:NCIT_C54269 ; abc:annotation_value "test_user_1" ], [ a abc:Annotation ; - abc:annotation_tag ; + abc:annotation_tag ; abc:annotation_value "1970-01-01T00:00:00" ] ; abc:given_name "Henryöäß" ; abc:schema_type "abc:Person" . 
diff --git a/dump_things_service/tests/test_roundtrip_flatsocial.py b/dump_things_service/tests/test_roundtrip_flatsocial.py index 46239cd..9fc4df9 100644 --- a/dump_things_service/tests/test_roundtrip_flatsocial.py +++ b/dump_things_service/tests/test_roundtrip_flatsocial.py @@ -48,7 +48,7 @@ dlflatsocial:another_john_ttl a dlflatsocial:Person ; dlthings:annotation_value "1970-01-01T00:00:00" ], [ a dlflat:FlatAnnotation ; dlthings:annotation_tag obo:NCIT_C54269 ; - dlthings:annotation_value "test_user_1" ] . + dlthings:annotation_value "user_all" ] . """ @@ -62,7 +62,7 @@ dlflatsocial:another_john_ttl a dlflatsocial:Person ; dlsocialmx:given_name "Johnöüß" ; dlthings:annotations [ a dlflat:FlatAnnotation ; dlthings:annotation_tag obo:NCIT_C54269 ; - dlthings:annotation_value "test_user_1" ], + dlthings:annotation_value "user_all" ], [ a dlflat:FlatAnnotation ; dlthings:annotation_tag ; dlthings:annotation_value "1970-01-01T00:00:00" ] . @@ -79,7 +79,7 @@ def test_json_ttl_json_dlflatsocial(fastapi_client_simple): # Deposit JSON records response = test_client.post( f'/collection_dlflatsocial-{i}/record/Person', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json=json_record, ) assert response.status_code == HTTP_200_OK @@ -87,7 +87,7 @@ def test_json_ttl_json_dlflatsocial(fastapi_client_simple): # Retrieve TTL records response = test_client.get( f'/collection_dlflatsocial-{i}/record?pid={json_record["pid"]}&format=ttl', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK ttl = response.text @@ -97,7 +97,7 @@ def test_json_ttl_json_dlflatsocial(fastapi_client_simple): response = test_client.post( f'/collection_dlflatsocial-{i}/record/Person?format=ttl', - headers={'content-type': 'text/turtle', 'x-dumpthings-token': 'token-1'}, + headers={'content-type': 'text/turtle', 'x-dumpthings-token': 'token-all'}, data=ttl, ) assert 
response.status_code == HTTP_200_OK @@ -105,7 +105,7 @@ def test_json_ttl_json_dlflatsocial(fastapi_client_simple): # Retrieve JSON record response = test_client.get( f'/collection_dlflatsocial-{i}/record?pid={new_ttl_pid}&format=json', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK json_object = cleaned_json(response.json(), remove_keys=('annotations',)) @@ -123,7 +123,7 @@ def test_ttl_json_ttl_dlflatsocial(fastapi_client_simple): response = test_client.post( f'/collection_dlflatsocial-{i}/record/Person?format=ttl', headers={ - 'x-dumpthings-token': 'token-1', + 'x-dumpthings-token': 'token-all', 'content-type': 'text/turtle', }, data=ttl_input_record, @@ -133,7 +133,7 @@ def test_ttl_json_ttl_dlflatsocial(fastapi_client_simple): # Retrieve JSON records response = test_client.get( f'/collection_dlflatsocial-{i}/record?pid=dlflatsocial:test_john_ttl&format=json', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK json_object = response.json() @@ -143,7 +143,7 @@ def test_ttl_json_ttl_dlflatsocial(fastapi_client_simple): response = test_client.post( f'/collection_dlflatsocial-{i}/record/Person?format=json', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json=json_object, ) assert response.status_code == HTTP_200_OK @@ -151,7 +151,7 @@ def test_ttl_json_ttl_dlflatsocial(fastapi_client_simple): # Retrieve ttl record response = test_client.get( f'/collection_dlflatsocial-{i}/record?pid={new_json_pid}&format=ttl', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK assert ( diff --git a/dump_things_service/tests/test_token_endpoints.py b/dump_things_service/tests/test_token_endpoints.py index d605d16..35bebbd 100644 --- 
a/dump_things_service/tests/test_token_endpoints.py +++ b/dump_things_service/tests/test_token_endpoints.py @@ -10,7 +10,7 @@ def test_token_creation(fastapi_client_simple): 'name': 'a', 'user_id': 'u_a', 'representation': '8bb6805ff10bcb1c2ca49dcd4bfef94d', - 'collection_info': { + 'collections': { 'collection_1': { 'mode': 'WRITE_COLLECTION', 'incoming_label': 'i_a' @@ -26,7 +26,7 @@ def test_token_creation(fastapi_client_simple): ) assert response.status_code == HTTP_201_CREATED - # Try to create another token eith name 'a', should result in a 4ß9-error + # Try to create another token with name 'a', should result in a 4ß9-error response = test_client.post( '/tokens', headers={'x-dumpthings-token': 'admin-1'}, diff --git a/dump_things_service/tests/test_utils.py b/dump_things_service/tests/test_utils.py index bfd4294..294a7d4 100644 --- a/dump_things_service/tests/test_utils.py +++ b/dump_things_service/tests/test_utils.py @@ -1,7 +1,7 @@ from dump_things_service.utils import cleaned_json basic_write_locations = tuple((x, f'token-{x}') for x in range(1, 3)) -unauthorized_write_locations = tuple((x, 'token-1') for x in range(3, 6)) +unauthorized_write_locations = tuple((x, 'token-1') for x in range(2, 9)) def test_cleaned_json(): diff --git a/dump_things_service/tests/test_web_interface.py b/dump_things_service/tests/test_web_interface.py index 61bdf44..ab63b3c 100644 --- a/dump_things_service/tests/test_web_interface.py +++ b/dump_things_service/tests/test_web_interface.py @@ -28,7 +28,7 @@ def test_web_interface_post_errors( test_client, _ = fastapi_client_simple result = test_client.post( f'/{collection_name}/record/{class_name}?{query}={format_name}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json={'pid': 'xyz:web_interface_test_pid0x123123'}, ) assert result.status_code < HTTP_500_INTERNAL_SERVER_ERROR @@ -54,7 +54,7 @@ def test_web_interface_get_class_errors( result = test_client.get( 
f'/{collection_name}/record/{class_name}?{query}={format_name}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert result.status_code < HTTP_500_INTERNAL_SERVER_ERROR @@ -79,6 +79,6 @@ def test_web_interface_get_pid_errors( result = test_client.get( f'/{collection_name}/records?{pid}&{query}={format_name}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert result.status_code < HTTP_500_INTERNAL_SERVER_ERROR diff --git a/dump_things_service/token_endpoints.py b/dump_things_service/token_endpoints.py index 07d81ec..d974464 100644 --- a/dump_things_service/token_endpoints.py +++ b/dump_things_service/token_endpoints.py @@ -2,6 +2,7 @@ import hashlib import logging import random import sys +from quopri import unhex from urllib.parse import quote from fastapi import ( @@ -19,11 +20,12 @@ from dump_things_service import ( from dump_things_service.abstract_config import ( TokenConfig, read_config, - store_config, + store_config, get_config, get_token_info_by_representation, + hash_token_representation, ) from dump_things_service.admin import authenticate_admin from dump_things_service.api_key import api_key_header_scheme -#from dump_things_service.config import get_config +from dump_things_service.instance_state import get_instance_state from dump_things_service.exceptions import ConfigError from dump_things_service.manifest import manifest_configuration from dump_things_service.utils import wrap_http_exception @@ -64,10 +66,10 @@ async def create_token( api_key: str = Depends(api_key_header_scheme), ) -> TokenRequest: - instance_config = get_config() - authenticate_admin(instance_config, api_key) + instance_state = get_instance_state() + authenticate_admin(instance_state, api_key) - abstract_config = read_config(store_path=instance_config.store_path) + abstract_config = read_config(store_path=instance_state.store_path) # Check for existing token-name if body.name in 
abstract_config.tokens: raise HTTPException( @@ -86,37 +88,42 @@ async def create_token( if body.representation: # We have a specific representation, check that it is not already used - for token in abstract_config.tokens.values(): - if token.representation == body.representation: - detail= f"Representation '{body.representation}' already exists." - raise HTTPException(status_code=HTTP_409_CONFLICT, detail=detail) + existing_token_info = get_token_info_by_representation( + abstract_config=abstract_config, + token_representation=body.representation, + ) + if existing_token_info: + detail= f"Token with identical representation already exists." + raise HTTPException(status_code=HTTP_409_CONFLICT, detail=detail) else: # Generate a random representation that does not yet exist. collision = True while collision: body.representation = random.randbytes(24).hex() - collision = any( - map( - lambda t: t.representation == body.representation, - instance_config.xxx_tokens - ) + existing_token_info = get_token_info_by_representation( + abstract_config=abstract_config, + token_representation=body.representation, ) + collision = existing_token_info is not None # Store the new token in the configuration abstract_config.tokens[body.name] = TokenConfig( user_id=body.user_id, collections=body.collections, - representation=body.representation, - hashed=body.hashed, + representation=( + hash_token_representation(body.representation) + if body.hashed + else body.representation + ), ) - # Manifest the configuration + # Manifest the new configuration with wrap_http_exception(ConfigError): - manifest_configuration(abstract_config, instance_config) + manifest_configuration(abstract_config, instance_state) # Persist the configuration store_config( - store_path=instance_config.store_path, + store_path=instance_state.store_path, config=abstract_config, ) @@ -165,10 +172,10 @@ async def get_token_with_name( api_key: str = Depends(api_key_header_scheme), ) -> TokenRequest: - instance_config = 
get_config() - authenticate_admin(instance_config, api_key) + instance_state = get_instance_state() + authenticate_admin(instance_state, api_key) - abstract_config = read_config(store_path=instance_config.store_path) + abstract_config = read_config(store_path=instance_state.store_path) if token_name not in abstract_config.tokens: detail = f"token with name '{token_name}' does not exist." raise HTTPException(status_code=HTTP_404_NOT_FOUND, detail=detail) diff --git a/dump_things_service/utils.py b/dump_things_service/utils.py index efc535e..f3811b1 100644 --- a/dump_things_service/utils.py +++ b/dump_things_service/utils.py @@ -34,28 +34,27 @@ from dump_things_service import ( ) from dump_things_service.abstract_config import ( Configuration, - TokenConfig, - tokens_for_collection, - CollectionConfig, - TokenCollectionConfig, + TokenModes, + TokenPermission, + mode_mapping, + get_default_token_config, + get_token_config_for_representation_and_collection, ) from dump_things_service.auth import ( AuthenticationError, AuthenticationInfo, ) -from dump_things_service.token import ( - TokenPermission, - get_token_parts, -) +from dump_things_service.api_token import get_token_parts +from dump_things_service.abstract_config import check_collection if TYPE_CHECKING: from pathlib import Path from dump_things_service import JSON - from dump_things_service.backends.record_dir import RecordDirStore + from dump_things_service.backends.record_dir import _RecordDirStore from dump_things_service.backends.sqlite import SQLiteBackend from dump_things_service.instance_state import InstanceState - from dump_things_service.store.model_store import ModelStore + from dump_things_service.store.model_store import _ModelStore logger = logging.getLogger('dump_things_service') @@ -126,13 +125,32 @@ def wrap_http_exception( def join_default_token_permissions( + abstract_configuration: Configuration, instance_state: InstanceState, permissions: TokenPermission, collection: str, ) -> 
TokenPermission: - default_token_name = instance_state.collections[collection].default_token - default_token_permissions = instance_state.tokens[collection][default_token_name]['permissions'] - result = TokenPermission() + + result = permissions.model_copy() + + # Get the default token name. If a default token is not defined, return + # token permissions without any right. A collection might define a default + # token that does not yet exist. We allow this inconsistency to decouple + # token and collection creation, i.e. to allow to create a collection first + # and a token later. + default_token_name = abstract_configuration.collections[collection].default_token + if default_token_name not in abstract_configuration.tokens: + return result + + # We allow inconsistencies in token/collection configuration space. This + # allows an administrator to create tokens and collections in two separate + # steps. Therefore we have to check whether the referred default token + # is actually defined for the collection. + if collection not in abstract_configuration.tokens[default_token_name].collections: + return result + + default_token_mode = abstract_configuration.tokens[default_token_name].collections[collection].mode + default_token_permissions = mode_mapping[TokenModes(default_token_mode)] result.curated_read = ( permissions.curated_read | default_token_permissions.curated_read ) @@ -219,10 +237,11 @@ def authenticate_token( def get_token_store( + abstract_config: Configuration, instance_state: InstanceState, collection_name: str, token_representation: str, -) -> tuple[ModelStore, str, TokenPermission, str] | tuple[None, None, None, None]: +) -> tuple[_ModelStore, TokenPermission, str] | tuple[None, None, None, None]: # Try to authenticate the token with the authentication providers that # are associated with the collection. @@ -233,27 +252,18 @@ def get_token_store( ) permissions = auth_info.token_permission - # If the token is hashed, get the hashed value. 
This is required because - # we associate token info with the hashed version of the token. - hashed_token = resolve_hashed_token( - instance_state, - collection_name, - token_representation, - ) - # If the token has no incoming-read or incoming-write permissions, we do not # need to create a store. if not permissions.incoming_read and not permissions.incoming_write: - instance_state.token_stores[collection_name][token_representation] = ( + instance_state.incoming_stores[collection_name][token_representation] = ( None, - hashed_token, permissions, auth_info.user_id, ) - return instance_state.token_stores[collection_name][token_representation] + return instance_state.incoming_stores[collection_name][token_representation] # Check whether the collection has an incoming definition - incoming = instance_state.incoming.get(collection_name) + incoming = abstract_config.collections[collection_name].incoming if not incoming: raise HTTPException( status_code=HTTP_401_UNAUTHORIZED, @@ -261,36 +271,35 @@ def get_token_store( ) # Check whether a store for this collection and token does already exist. 
- store_info = instance_state.token_stores[collection_name].get(token_representation) + store_info = instance_state.incoming_stores[collection_name].get(token_representation) if store_info: return store_info store_dir = instance_state.store_path / incoming / auth_info.incoming_label token_store = create_token_store( + abstract_configuration=abstract_config, instance_state=instance_state, collection_name=collection_name, store_dir=store_dir, ) - instance_state.token_stores[collection_name][token_representation] = ( + instance_state.incoming_stores[collection_name][token_representation] = ( token_store, - hashed_token, permissions, auth_info.user_id, ) - return instance_state.token_stores[collection_name][token_representation] + return instance_state.incoming_stores[collection_name][token_representation] def create_token_store( + abstract_configuration: Configuration, instance_state: InstanceState, collection_name: str, store_dir: Path, -) -> ModelStore: +) -> _ModelStore: from dump_things_service.backends.schema_type_layer import SchemaTypeLayer - from dump_things_service.config import ( - ConfigError, - get_backend_and_extension, - ) + from dump_things_service.abstract_config import get_backend_and_extension + from dump_things_service.exceptions import ConfigError from dump_things_service.store.model_store import ModelStore # Check if the store was already created and if it was created for the @@ -314,10 +323,10 @@ def create_token_store( store_dir.mkdir(parents=True, exist_ok=True) - schema_uri = instance_state.schemas[collection_name] + schema_uri = abstract_configuration.collections[collection_name].schema - # We get the backend information from the curated store - backend_type = instance_state.backend[collection_name].type + # We get the backend information from the abstract configuration + backend_type = abstract_configuration.collections[collection_name].backend.type backend_name, extension = get_backend_and_extension(backend_type) backend = 
instance_state.curated_stores[collection_name].backend @@ -331,7 +340,7 @@ def create_token_store( token_store = create_record_dir_token_store( store_dir=store_dir, order_by=backend.order_by, - schema_uri=instance_state.schemas[collection_name], + schema_uri=schema_uri, mapping_function=backend.pid_mapping_function, suffix=backend.suffix, ) @@ -349,7 +358,7 @@ def create_token_store( if extension == 'stl': token_store = SchemaTypeLayer(backend=token_store, schema=schema_uri) - submission_tags = instance_state.collections[collection_name].submission_tags + submission_tags = abstract_configuration.collections[collection_name].submission_tags tags = { 'id': submission_tags.submitter_id_tag, 'time': submission_tags.submission_time_tag, @@ -366,7 +375,7 @@ def create_record_dir_token_store( schema_uri: str, mapping_function: Callable, suffix: str, -) -> RecordDirStore: +) -> _RecordDirStore: from dump_things_service.backends.record_dir import RecordDirStore store_backend = RecordDirStore( @@ -410,30 +419,36 @@ def check_bounds( async def process_token( abstract_config: Configuration, + instance_state: InstanceState, api_key: str | None, collection: str, -) -> tuple[TokenPermission, ModelStore]: +) -> tuple[TokenPermission, _ModelStore]: - token_config = ( - get_default_token_config(abstract_config, collection) - if api_key is None - else get_token_config_by_representation(abstract_config, api_key) - ) + if api_key is None: + token_config = get_default_token_config(abstract_config, collection) + else: + token_elements = get_token_config_for_representation_and_collection( + abstract_config, + collection_name=collection, + token_representation=api_key, + ) + token_config = token_elements[1] if token_elements else None if not token_config: - detail = f'invalid token' + detail = f"invalid token for collection '{collection}'" raise HTTPException( status_code=HTTP_401_UNAUTHORIZED, detail=detail, ) - token_store, token, token_permissions, _ = get_token_store( + token_store, 
token_permissions, user_id = get_token_store( + abstract_config, instance_state, collection, - token_config, + token_config.representation, ) final_permissions = join_default_token_permissions( - instance_state, token_permissions, collection + abstract_config, instance_state, token_permissions, collection ) # Check for maintenance mode @@ -454,3 +469,35 @@ async def process_token( detail=f"No read access to curated or incoming data in collection '{collection}'.", ) return final_permissions, token_store + + +def get_required_incoming_labels( + abstract_config: Configuration, + collection_name: str, +) -> set[str]: + return set( + map( + lambda x: x[1], + get_required_incoming_info(abstract_config, collection_name), + ) + ) + + +def get_required_incoming_info( + abstract_config: Configuration, + collection_name: str, +) -> set[tuple[str, str]]: + return { + (token_name, this_collection_info.incoming_label) + for token_name, token_info in abstract_config.tokens.items() + for this_collection_name, this_collection_info in token_info.collections.items() + if this_collection_name == collection_name and mode_mapping[ + TokenModes(this_collection_info.mode) + ].incoming_write is True + } + + +def var_escape( + name: str, +) -> str: + return name.replace('_', '___').replace('-', '_0_') diff --git a/dump_things_service/validate.py b/dump_things_service/validate.py new file mode 100644 index 0000000..601d778 --- /dev/null +++ b/dump_things_service/validate.py @@ -0,0 +1,98 @@ +from typing import Any + +from fastapi import ( + Depends, + HTTPException, +) +from pydantic import ( + BaseModel, + TypeAdapter, + ValidationError, +) +from starlette.responses import JSONResponse + +from dump_things_service import ( + HTTP_400_BAD_REQUEST, + HTTP_403_FORBIDDEN, + HTTP_422_UNPROCESSABLE_CONTENT, + Format, +) +from dump_things_service.abstract_config import ( + check_collection, + get_config, + get_default_token_name, +) +from dump_things_service.api_key import api_key_header_scheme 
+from dump_things_service.converter import FormatConverter +from dump_things_service.instance_state import get_instance_state +from dump_things_service.utils import ( + get_token_store, + join_default_token_permissions, + wrap_http_exception, +) + + +def validate_record( + collection: str, + data: BaseModel | str, + class_name: str, + model: Any, + input_format: Format, + api_key: str | None = Depends(api_key_header_scheme), +) -> JSONResponse: + + instance_state = get_instance_state() + abstract_config = get_config() + + if input_format == Format.json and isinstance(data, str): + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail='Invalid JSON data provided.' + ) + + if input_format == Format.ttl and not isinstance(data, str): + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail='Invalid ttl data provided.' + ) + + check_collection(abstract_config, collection) + + token = ( + get_default_token_name(abstract_config, collection) + if api_key is None + else api_key + ) + + store, token_permissions, user_id = get_token_store( + abstract_config, + instance_state, + collection, + token, + ) + final_permissions = join_default_token_permissions( + abstract_config, + instance_state, + token_permissions, + collection, + ) + if not final_permissions.incoming_write: + raise HTTPException( + status_code=HTTP_403_FORBIDDEN, + detail=f"Not authorized to validate records for collection '{collection}'.", + ) + + if input_format == Format.ttl: + with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Conversion error'): + json_object = FormatConverter( + abstract_config.collections[collection].schema, + input_format=Format.ttl, + output_format=Format.json, + ).convert(data, class_name) + with wrap_http_exception(ValidationError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): + TypeAdapter(getattr(model, class_name)).validate_python(json_object) + else: + # Try to convert it into TTL to detect 
potential errors before storing + # the record + with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): + instance_state.validators[collection].validate(data) + + return JSONResponse(True) diff --git a/dump_things_service/config.py b/dump_things_service/xxx_no_config.py similarity index 99% rename from dump_things_service/config.py rename to dump_things_service/xxx_no_config.py index 6a65ed1..b931ad3 100644 --- a/dump_things_service/config.py +++ b/dump_things_service/xxx_no_config.py @@ -30,6 +30,7 @@ from dump_things_service import ( HTTP_404_NOT_FOUND, Format, ) +from dump_things_service.abstract_config import TokenPermission from dump_things_service.audit.gitaudit import GitAuditBackend from dump_things_service.backends.record_dir import RecordDirStore from dump_things_service.backends.schema_type_layer import SchemaTypeLayer @@ -46,8 +47,7 @@ from dump_things_service.exceptions import ( from dump_things_service.model import get_model_for_schema from dump_things_service.resolve_curie import resolve_curie from dump_things_service.store.model_store import ModelStore -from dump_things_service.token import ( - TokenPermission, +from dump_things_service.api_token import ( get_token_parts, hash_token, ) @@ -103,7 +103,7 @@ class TokenModes(enum.Enum): class TokenCollectionConfig(BaseModel): model_config = ConfigDict(extra='forbid', use_enum_values=True) mode: TokenModes - incoming_label: str = Field(strict=True) + incoming_label: str = None class TokenConfig(StrictModel): @@ -287,7 +287,7 @@ def get_mapping_function(collection_config: CollectionDirConfig): return mapping_functions[collection_config.idfx] -def get_permissions(mode: TokenModes) -> TokenPermission: +def xxx_get_permissions(mode: TokenModes) -> TokenPermission: return mode_mapping[mode] -- 2.52.0 From 392dbf6bed71794bf6ed6b06f83084211a385349 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 13 May 2026 09:45:04 +0200 Subject: [PATCH 06/64] add 
explicit admin-token endpoints Handle admin tokens separately from the traditional token handling --- dump_things_service/abstract_config.py | 25 +- dump_things_service/admin.py | 36 +- dump_things_service/api_token.py | 16 - dump_things_service/collection_endpoints.py | 86 +-- dump_things_service/commands/check_pids.py | 16 +- dump_things_service/incoming.py | 6 +- dump_things_service/instance_state.py | 3 +- dump_things_service/main.py | 50 +- dump_things_service/tests/fixtures.py | 6 +- dump_things_service/token_endpoints.py | 72 +- dump_things_service/utils.py | 21 +- dump_things_service/xxx_no_config.py | 705 -------------------- 12 files changed, 141 insertions(+), 901 deletions(-) delete mode 100644 dump_things_service/xxx_no_config.py diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index 9acf159..a33b04a 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -32,6 +32,13 @@ logger = logging.getLogger('dump_things_service') g_abstract_configuration = None +dump_things_config_iri = 'dump_things:config' +dump_things_private_path = Path('__dump_things__') +config_backend_path = dump_things_private_path / 'config_store' +config_audit_path = dump_things_private_path / 'config_audit' +config_backend = None +config_audit = None + class StrictModel(BaseModel): model_config = ConfigDict( @@ -118,7 +125,6 @@ class TokenPermission(BaseModel): incoming_write: bool = False curated_write: bool = False zones_access: bool = False - admin: bool = False class TokenCollectionConfig(StrictModel): @@ -134,17 +140,14 @@ class TokenConfig(StrictModel): representation: str = '' -dump_things_config_iri = 'dump_things:config' -dump_things_private_path = Path('__dump_things__') -config_backend_path = dump_things_private_path / 'config_store' -config_audit_path = dump_things_private_path / 'config_audit' -config_backend = None -config_audit = None +class AdminTokenConfig(StrictModel): +
representation: str class Configuration(BaseModel): collections: dict[str, CollectionConfig] = {} tokens: dict[str, TokenConfig] = {} + admin_tokens: dict[str, AdminTokenConfig] = {} pid: str = dump_things_config_iri @@ -174,14 +177,6 @@ mode_mapping = { curated_write=True, zones_access=True, ), - TokenModes.ADMIN: TokenPermission( - curated_read=True, - incoming_read=True, - incoming_write=True, - curated_write=True, - zones_access=True, - admin=True, - ) } diff --git a/dump_things_service/admin.py b/dump_things_service/admin.py index 0d12db5..375645b 100644 --- a/dump_things_service/admin.py +++ b/dump_things_service/admin.py @@ -1,21 +1,37 @@ -import sys +import logging from fastapi import HTTPException -from dump_things_service import ( - HTTP_401_UNAUTHORIZED, +from dump_things_service import HTTP_401_UNAUTHORIZED +from dump_things_service.abstract_config import ( + Configuration, + hash_token_representation, ) from dump_things_service.instance_state import InstanceState +logger = logging.getLogger('dump_things_service') + + def authenticate_admin( instance_state: InstanceState, + abstract_config: Configuration, api_key: str, ): - print('IMPLEMENT: authenticate_admin() ', file=sys.stderr, flush=True) - if api_key != 'admin-1': - detail = f'invalid admin token: {api_key}' - raise HTTPException( - status_code=HTTP_401_UNAUTHORIZED, - detail=detail, - ) + if api_key == instance_state.bootstrap_token: + logger.info('authenticate_admin: using bootstrap token') + return + + hashed_token_representation = hash_token_representation(api_key) + for token_name, token_config in abstract_config.admin_tokens.items(): + if token_config.representation == hashed_token_representation: + logger.info( + 'authenticate_admin: using token %s', + token_name, + ) + return + + raise HTTPException( + status_code=HTTP_401_UNAUTHORIZED, + detail='Invalid admin token', + ) diff --git a/dump_things_service/api_token.py b/dump_things_service/api_token.py index da173ee..e69de29 100644 --- 
a/dump_things_service/api_token.py +++ b/dump_things_service/api_token.py @@ -1,16 +0,0 @@ -import hashlib - - -def get_token_parts(token: str) -> list[str]: - parts = token.split('-', 1) - if len(parts) != 2: - msg = 'Invalid token format' - raise ValueError(msg) - return parts - - -def hash_token(token: str) -> str: - parts = get_token_parts(token) - hasher = hashlib.sha256() - hasher.update(parts[1].encode()) - return f'{parts[0]}-{hasher.hexdigest()}' diff --git a/dump_things_service/collection_endpoints.py b/dump_things_service/collection_endpoints.py index b15e2ca..b17be1b 100644 --- a/dump_things_service/collection_endpoints.py +++ b/dump_things_service/collection_endpoints.py @@ -1,14 +1,5 @@ import logging -import random -import sys -from pathlib import ( - Path, - PurePosixPath, -) -from typing import ( - Literal, - cast, -) +from typing import Literal from urllib.parse import quote from fastapi import ( @@ -17,20 +8,16 @@ from fastapi import ( HTTPException, Response, ) -from pydantic import ( - BaseModel, - ConfigDict, -) +from pydantic import BaseModel from dump_things_service import ( HTTP_201_CREATED, HTTP_409_CONFLICT, ) from dump_things_service.abstract_config import ( - read_config, store_config, CollectionConfig, - Configuration, + get_config, ) from dump_things_service.admin import authenticate_admin from dump_things_service.api_key import api_key_header_scheme @@ -39,7 +26,9 @@ from dump_things_service.manifest import manifest_configuration from dump_things_service.exceptions import ConfigError from dump_things_service.utils import wrap_http_exception + logger = logging.getLogger('dump_things_service') + router = APIRouter() @@ -74,37 +63,34 @@ async def create_collection( ): instance_state = get_instance_state() + abstract_config = get_config() # Check admin rights - authenticate_admin(instance_state, api_key) + authenticate_admin(instance_state, abstract_config, api_key) # TODO: read the current abstract configuration, check for a collection 
# of the given name. If it does not exist yet, add a collection # configuration that reflects the `body`. Then try to manifest the # new configuration. If there are no errors, persist the new # configuration. - configuration: Configuration = read_config( - store_path=instance_state.store_path - ) - # Check for existing collection name - if body.name in configuration.collections: + if body.name in abstract_config.collections: raise HTTPException( status_code=HTTP_409_CONFLICT, detail=f"Collection with name '{body.name}' already exists.", ) # Update the abstract configuration - configuration.collections[body.name] = body + abstract_config.collections[body.name] = body # Manifest the abstract configuration with wrap_http_exception(ConfigError): - manifest_configuration(configuration, instance_state) + manifest_configuration(abstract_config, instance_state) - # Persist the configuration + # Persist the abstract configuration store_config( store_path=instance_state.store_path, - config=configuration, + config=abstract_config, ) response.headers['Location'] = f'/collections/{quote(body.name)}' @@ -119,51 +105,9 @@ async def get_collections( api_key: str = Depends(api_key_header_scheme), ) -> list[CollectionConfig]: - instance_config = get_config() + instance_state = get_instance_state() + abstract_config = get_config() # Check admin rights - authenticate_admin(instance_config, api_key) - - abstract_config = read_config(store_path=instance_config.store_path) + authenticate_admin(instance_state, abstract_config, api_key) return list(abstract_config.collections.values()) - - -x = """ -def create_or_reuse( - instance_config: InstanceConfig, - local_path: PurePosixPath, - schema_location: str, - backend_spec: str, -): - full_path = Path(instance_config.store_path / local_path) - if full_path.exists(): - ensure_backend_type(full_path, schema_location, backend_spec) - else: - full_path.mkdir(parents=True, exist_ok=False) - create_backend(instance_config, full_path, 
schema_location, backend_spec) - - -def ensure_backend_type( - path: Path, - schema_location: str, - backend_spec: str, -): - backend_name, extension = get_backend_and_extension(backend_spec) - if backend_name is 'record_dir': - try: - config = Config.get_record_dir_config(path) - except ConfigError as e: - raise HTTPException( - status_code=HTTP_409_CONFLICT, - detail=f"did not find record_dir store in '{path}', reason: {e}", - ) from e - if config.schema != schema_location: - raise HTTPException( - status_code=HTTP_409_CONFLICT, - detail=f"existing record_dir store has different schema: '{config.schema}'", - ) - elif backend_name is 'sqlite': - pass - else: - raise ValueError(f"unknown backend type: '{backend_spec}'") -""" diff --git a/dump_things_service/commands/check_pids.py b/dump_things_service/commands/check_pids.py index fccf65c..764ca44 100644 --- a/dump_things_service/commands/check_pids.py +++ b/dump_things_service/commands/check_pids.py @@ -65,23 +65,23 @@ def check_pids_in_stores( def check_pids(): - instance_config = get_config() - abstract_config = read_config(instance_config.store_path) + instance_state = get_config() + abstract_config = read_config(instance_state.store_path) result = 0 # Check pids in curated stores - result += check_pids_in_stores(instance_config.curated_stores.values()) + result += check_pids_in_stores(instance_state.curated_stores.values()) # Check pids in incoming stores. Incoming stores can be defined in the # configuration, or can be generated by external authentication sources. # In the latter case, they are manifest as directories in the incoming area # of a collection. 
- for collection, collection_info in instance_config.collections.items(): + for collection, collection_info in instance_state.collections.items(): - configured_labels = get_config_labels(instance_config, collection) + configured_labels = get_config_labels(instance_state, collection) on_disk_labels = get_on_disk_labels( - store_path=instance_config.store_path, + store_path=instance_state.store_path, abstract_config=abstract_config, collection=collection, ) @@ -89,9 +89,9 @@ def check_pids(): token_stores = [ create_token_store( - instance_config, + instance_state, collection, - instance_config.store_path / collection_info.incoming / label + instance_state.store_path / collection_info.incoming / label ) for label in all_labels ] diff --git a/dump_things_service/incoming.py b/dump_things_service/incoming.py index 318108c..bffd745 100644 --- a/dump_things_service/incoming.py +++ b/dump_things_service/incoming.py @@ -394,21 +394,21 @@ def xxx_create_incoming_endpoints( logger.info('Creating dynamic incoming endpoints...') serial_number = count() - instance_config = get_config() + instance_state = get_instance_state() generated_tags = [] for collection, ( model, classes, model_var_name, - ) in instance_config.model_info.items(): + ) in instance_state.model_info.items(): tag_name = f'Incoming area: write records to the given incoming area of collection "{collection}"' if model_var_name not in global_dict: global_dict[model_var_name] = model - for class_name in instance_config.use_classes[collection]: + for class_name in instance_state.use_classes[collection]: # Create an endpoint to dump data of type `class_name` of schema # `model`. 
diff --git a/dump_things_service/instance_state.py b/dump_things_service/instance_state.py index d277ba3..4cd9942 100644 --- a/dump_things_service/instance_state.py +++ b/dump_things_service/instance_state.py @@ -22,6 +22,7 @@ from dump_things_service.abstract_config import ( MappingMethod, mapping_functions, ) +from dump_things_service.auth.config import ConfigAuthenticationSource from dump_things_service.converter import get_conversion_objects from dump_things_service.exceptions import ( @@ -144,7 +145,7 @@ def create_instance_state( bootstrap_token=bootstrap_token, fastapi_app=fastapi_app, ) - return cast(InstanceState, g_instance_state) + return g_instance_state def get_instance_state() -> InstanceState: diff --git a/dump_things_service/main.py b/dump_things_service/main.py index 3ba768e..4a80ee6 100644 --- a/dump_things_service/main.py +++ b/dump_things_service/main.py @@ -2,6 +2,7 @@ from __future__ import annotations # noqa: I001 -- the patches have to be impor import argparse import logging +import os from pathlib import Path from typing import ( Annotated, # noqa F401 -- used by generated code @@ -111,12 +112,12 @@ parser = argparse.ArgumentParser() parser.add_argument('--host', default='0.0.0.0') # noqa S104 parser.add_argument('--port', default=8000, type=int) parser.add_argument('--origins', action='append', default=[]) -#parser.add_argument( -# '-c', -# '--config', -# metavar='CONFIG_FILE', -# help="Read the configuration from 'CONFIG_FILE' instead of looking for it in the data store root directory. 
", -#) +parser.add_argument( + '--admin-token', + type=str, + default='', + help='An initial admin token that will allow to add or remove tokens and collections', +) parser.add_argument( '--root-path', default='', @@ -207,16 +208,21 @@ tag_info = [ arguments = parser.parse_args() + +# Get bootstrap token from environment +if not arguments.admin_token: + arguments.admin_token = os.environ.get('DTS_ADMIN_TOKEN', '') + # Set the log level numeric_level = getattr(logging, arguments.log_level.upper(), None) if not isinstance(numeric_level, int): logger.error( - 'Invalid log level: %s, defaulting to level "WARNING"', arguments.log_level + 'Invalid log level: %s, defaulting to level "WARNING"', + arguments.log_level, ) else: logger.setLevel(level=numeric_level) - store_path = Path(arguments.store).resolve() if not store_path.exists(): logger.error(f'Store path does not exist: {store_path}') @@ -251,18 +257,9 @@ app.add_middleware( add_pagination(app) -#new_process_config( -# store_path=store_path, -# fastapi_app=app, -# order_by=['pid'], -# globals_dict=globals(), -#) -#g_instance_config = get_config() - - g_instance_state = create_instance_state( store_path=store_path, - bootstrap_token='admin-1', + bootstrap_token=arguments.admin_token, fastapi_app=app, ) @@ -275,23 +272,6 @@ manifest_configuration( instance_state=g_instance_state, ) -x = """ -create_store_endpoints( - app=app, - configuration=g_configuration, - tag_info=tag_info, - placeholder='placeholder_write', - global_dict=globals(), -) -create_validate_endpoints( - app=app, - configuration=g_configuration, - tag_info=tag_info, - placeholder='placeholder_validate', - global_dict=globals(), -) -""" - g_instance_state.fastapi_app.openapi_schema = None g_instance_state.fastapi_app.setup() diff --git a/dump_things_service/tests/fixtures.py b/dump_things_service/tests/fixtures.py index 8f9530e..31f051e 100644 --- a/dump_things_service/tests/fixtures.py +++ b/dump_things_service/tests/fixtures.py @@ -390,7 +390,11 @@ 
def fastapi_app_simple(dump_stores_simple): tmp_path, audit_tmp_path = dump_stores_simple old_sys_argv = sys.argv - sys.argv = ['test-runner', str(tmp_path)] + sys.argv = [ + 'test-runner', + '--admin-token', 'admin-1', + str(tmp_path), + ] from dump_things_service.main import app sys.argv = old_sys_argv diff --git a/dump_things_service/token_endpoints.py b/dump_things_service/token_endpoints.py index d974464..e775202 100644 --- a/dump_things_service/token_endpoints.py +++ b/dump_things_service/token_endpoints.py @@ -1,8 +1,6 @@ -import hashlib import logging import random import sys -from quopri import unhex from urllib.parse import quote from fastapi import ( @@ -11,6 +9,7 @@ from fastapi import ( HTTPException, Response, ) +from starlette.status import HTTP_406_NOT_ACCEPTABLE from dump_things_service import ( HTTP_201_CREATED, @@ -21,7 +20,7 @@ from dump_things_service.abstract_config import ( TokenConfig, read_config, store_config, get_config, get_token_info_by_representation, - hash_token_representation, + hash_token_representation, AdminTokenConfig, ) from dump_things_service.admin import authenticate_admin from dump_things_service.api_key import api_key_header_scheme @@ -39,6 +38,10 @@ class TokenRequest(TokenConfig): name: str +class AdminTokenRequest(AdminTokenConfig): + name: str + + def get_token_parts(token: str) -> list[str]: parts = token.split('-', 1) if len(parts) != 2: @@ -47,13 +50,6 @@ def get_token_parts(token: str) -> list[str]: return parts -def hash_token(token: str) -> str: - parts = get_token_parts(token) - hasher = hashlib.sha256() - hasher.update(parts[1].encode()) - return f'{parts[0]}-{hasher.hexdigest()}' - - @router.post( '/tokens', tags=['Administration interface'], @@ -67,9 +63,10 @@ async def create_token( ) -> TokenRequest: instance_state = get_instance_state() - authenticate_admin(instance_state, api_key) - abstract_config = read_config(store_path=instance_state.store_path) + + authenticate_admin(instance_state, abstract_config, 
api_key) + # Check for existing token-name if body.name in abstract_config.tokens: raise HTTPException( @@ -146,10 +143,11 @@ async def get_tokens( api_key: str = Depends(api_key_header_scheme), ) -> list[TokenRequest]: - instance_config = get_config() - authenticate_admin(instance_config, api_key) + instance_state = get_instance_state() + abstract_config = read_config(store_path=instance_state.store_path) + + authenticate_admin(instance_state, abstract_config, api_key) - abstract_config = read_config(store_path=instance_config.store_path) return [ TokenRequest( name=n, @@ -173,7 +171,9 @@ async def get_token_with_name( ) -> TokenRequest: instance_state = get_instance_state() - authenticate_admin(instance_state, api_key) + abstract_config = get_config() + + authenticate_admin(instance_state, abstract_config, api_key) abstract_config = read_config(store_path=instance_state.store_path) if token_name not in abstract_config.tokens: @@ -188,3 +188,43 @@ async def get_token_with_name( collections=t.collections, hashed=t.hashed, ) + + +@router.post( + '/admin_tokens', + tags=['Administration interface'], + name='Add a new admin token', +) +async def create_admin_token( + body: AdminTokenRequest, + api_key: str = Depends(api_key_header_scheme), +): + + instance_state = get_instance_state() + abstract_config = read_config(store_path=instance_state.store_path) + + authenticate_admin(instance_state, abstract_config, api_key) + + # Check for token content + if not body.representation: + detail='Empty administrator token is not allowed' + raise HTTPException(status_code=HTTP_406_NOT_ACCEPTABLE, detail=detail) + + # Check for existing token-name + if body.name in abstract_config.admin_tokens: + raise HTTPException( + status_code=HTTP_409_CONFLICT, + detail=f"Admin token with name '{body.name}' already exists.", + ) + + # It is sufficient to add the new admin token to the admin_token dictionary + # in order to manifest the new configuration. 
+ abstract_config.admin_tokens[body.name] = AdminTokenConfig( + representation=body.representation, + ) + + # Persist the configuration. + store_config( + store_path=instance_state.store_path, + config=abstract_config, + ) diff --git a/dump_things_service/utils.py b/dump_things_service/utils.py index f3811b1..a6e77a6 100644 --- a/dump_things_service/utils.py +++ b/dump_things_service/utils.py @@ -8,7 +8,6 @@ To speed up processing, multiple indices could be introduced, e.g.: """ from __future__ import annotations -import hashlib import logging import sys from contextlib import contextmanager @@ -16,7 +15,6 @@ from functools import reduce from typing import ( TYPE_CHECKING, Callable, - Iterable, ) import fsspec @@ -37,6 +35,7 @@ from dump_things_service.abstract_config import ( TokenModes, TokenPermission, mode_mapping, + check_collection, get_default_token_config, get_token_config_for_representation_and_collection, ) @@ -44,8 +43,6 @@ from dump_things_service.auth import ( AuthenticationError, AuthenticationInfo, ) -from dump_things_service.api_token import get_token_parts -from dump_things_service.abstract_config import check_collection if TYPE_CHECKING: from pathlib import Path @@ -183,22 +180,6 @@ def get_on_disk_labels( } -def resolve_hashed_token( - instance_state: InstanceState, - collection_name: str, - token: str, -) -> str: - - # Check for hashed token and return the hashed token value instead - # of the plain text token value if the token is hashed. 
- if '-' in token: - return instance_state.hashed_tokens[collection_name].get( - get_token_parts(token)[0], - token, - ) - return token - - def authenticate_token( instance_state: InstanceState, collection_name: str, diff --git a/dump_things_service/xxx_no_config.py b/dump_things_service/xxx_no_config.py deleted file mode 100644 index b931ad3..0000000 --- a/dump_things_service/xxx_no_config.py +++ /dev/null @@ -1,705 +0,0 @@ -from __future__ import annotations - -import dataclasses -import enum -import hashlib -import logging -from functools import partial -from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Literal, -) - -import yaml -from fastapi import ( - FastAPI, - HTTPException, -) -from pydantic import ( - BaseModel, - ConfigDict, - Field, - ValidationError, -) -from yaml.scanner import ScannerError - -from dump_things_service import ( - HTTP_404_NOT_FOUND, - Format, -) -from dump_things_service.abstract_config import TokenPermission -from dump_things_service.audit.gitaudit import GitAuditBackend -from dump_things_service.backends.record_dir import RecordDirStore -from dump_things_service.backends.schema_type_layer import SchemaTypeLayer -from dump_things_service.backends.sqlite import SQLiteBackend -from dump_things_service.backends.sqlite import ( - record_file_name as sqlite_record_file_name, -) - -from dump_things_service.converter import FormatConverter, get_conversion_objects -from dump_things_service.exceptions import ( - ConfigError, - CurieResolutionError, -) -from dump_things_service.model import get_model_for_schema -from dump_things_service.resolve_curie import resolve_curie -from dump_things_service.store.model_store import ModelStore -from dump_things_service.api_token import ( - get_token_parts, - hash_token, -) -from dump_things_service.abstract_config import check_collection - -if TYPE_CHECKING: - import types - -logger = logging.getLogger('dump_things_service') - -config_file_name = '.dumpthings.yaml' 
-ignored_files = {'.', '..', config_file_name} - - -_global_config_instance = None - - -class StrictModel(BaseModel): - model_config = ConfigDict(extra='forbid') - - -class MappingMethod(enum.Enum): - digest_md5 = 'digest-md5' - digest_md5_p3 = 'digest-md5-p3' - digest_md5_p3_p3 = 'digest-md5-p3-p3' - digest_sha1 = 'digest-sha1' - digest_sha1_p3 = 'digest-sha1-p3' - digest_sha1_p3_p3 = 'digest-sha1-p3-p3' - after_last_colon = 'after-last-colon' - - -class CollectionDirConfig(StrictModel): - type: Literal['records'] - version: Literal[1] - schema: str - format: Literal['yaml'] - idfx: MappingMethod - - -class TokenModes(enum.Enum): - READ_CURATED = 'READ_CURATED' - READ_COLLECTION = 'READ_COLLECTION' - WRITE_COLLECTION = 'WRITE_COLLECTION' - READ_SUBMISSIONS = 'READ_SUBMISSIONS' - WRITE_SUBMISSIONS = 'WRITE_SUBMISSIONS' - SUBMIT = 'SUBMIT' - SUBMIT_ONLY = 'SUBMIT_ONLY' - NOTHING = 'NOTHING' - CURATOR = 'CURATOR' - ADMIN = 'ADMIN' - - -class TokenCollectionConfig(BaseModel): - model_config = ConfigDict(extra='forbid', use_enum_values=True) - mode: TokenModes - incoming_label: str = None - - -class TokenConfig(StrictModel): - user_id: str - collections: dict[str, TokenCollectionConfig] - hashed: bool = False - - -class BackendConfigRecordDir(StrictModel): - type: Literal['record_dir', 'record_dir+stl'] - - -class BackendConfigSQLite(StrictModel): - type: Literal['sqlite', 'sqlite+stl'] - schema: str - - -class ForgejoAuthConfig(StrictModel): - type: Literal['forgejo'] - url: str - organization: str - team: str - label_type: Literal['team', 'user'] - instance_id: str | None = None - repository: str | None = None - - -class ConfigAuthConfig(StrictModel): - type: Literal['config'] = 'config' - - -class GitAuditBackendConfig(StrictModel): - type: Literal['gitaudit'] - path: Path - auto_flush_timeout: int = 60 - - -class TagConfig(StrictModel): - submitter_id_tag: str = 'http://purl.obolibrary.org/obo/NCIT_C54269' - submission_time_tag: str = 
'http://semanticscience.org/resource/SIO_001083' - - -class CollectionConfig(StrictModel): - default_token: str - curated: Path - incoming: Path | None = None - backend: BackendConfigRecordDir | BackendConfigSQLite | None = None - auth_sources: list[ForgejoAuthConfig | ConfigAuthConfig] = [ConfigAuthConfig()] - submission_tags: TagConfig = TagConfig() - use_classes: list[str] = dataclasses.field(default_factory=list) - ignore_classes: list[str] = dataclasses.field(default_factory=list) - audit_backends: list[GitAuditBackendConfig] = dataclasses.field(default_factory=list) - - -class GlobalConfig(StrictModel): - model_config = ConfigDict(strict=True) - - type: Literal['collections'] - version: Literal[1] - collections: dict[str, CollectionConfig] - tokens: dict[str, TokenConfig] - - -@dataclasses.dataclass -class XXXInstanceConfig: - store_path: Path - fastapi_app: FastAPI - order_by: list[str] = dataclasses.field(default_factory=list) - collections: dict = dataclasses.field(default_factory=dict) - all_stores: dict = dataclasses.field(default_factory=dict) - curated_stores: dict = dataclasses.field(default_factory=dict) - incoming: dict = dataclasses.field(default_factory=dict) - zones: dict = dataclasses.field(default_factory=dict) - permissions: dict = dataclasses.field(default_factory=dict) - model_info: dict = dataclasses.field(default_factory=dict) - token_stores: dict = dataclasses.field(default_factory=dict) - schemas: dict = dataclasses.field(default_factory=dict) - conversion_objects: dict = dataclasses.field(default_factory=dict) - backend: dict = dataclasses.field(default_factory=dict) - auth_providers: dict = dataclasses.field(default_factory=dict) - tokens: dict = dataclasses.field(default_factory=dict) - hashed_tokens: dict = dataclasses.field(default_factory=dict) - validators: dict = dataclasses.field(default_factory=dict) - use_classes: dict = dataclasses.field(default_factory=dict) - maintenance_mode: set = dataclasses.field(default_factory=set) - 
audit_backends: dict = dataclasses.field(default_factory=dict) - xxx_tokens: dict = dataclasses.field(default_factory=dict) - xxx_collections: dict = dataclasses.field(default_factory=dict) - - -mode_mapping = { - TokenModes.READ_CURATED: TokenPermission(curated_read=True), - TokenModes.READ_COLLECTION: TokenPermission( - curated_read=True, - incoming_read=True, - ), - TokenModes.WRITE_COLLECTION: TokenPermission( - curated_read=True, - incoming_read=True, - incoming_write=True, - ), - TokenModes.READ_SUBMISSIONS: TokenPermission(incoming_read=True), - TokenModes.WRITE_SUBMISSIONS: TokenPermission( - incoming_read=True, - incoming_write=True, - ), - TokenModes.SUBMIT: TokenPermission(curated_read=True, incoming_write=True), - TokenModes.SUBMIT_ONLY: TokenPermission(incoming_write=True), - TokenModes.NOTHING: TokenPermission(), - TokenModes.CURATOR: TokenPermission( - curated_read=True, - incoming_read=True, - incoming_write=True, - curated_write=True, - zones_access=True, - ), - TokenModes.ADMIN: TokenPermission( - curated_read=True, - incoming_read=True, - incoming_write=True, - curated_write=True, - zones_access=True, - admin=True, - ) -} - - -def get_hex_digest(hasher: Callable, data: str) -> str: - hash_context = hasher(data.encode()) - return hash_context.hexdigest() - - -def mapping_digest_p3( - hasher: Callable, - pid: str, - suffix: str, -) -> Path: - hex_digest = get_hex_digest(hasher, pid) - return Path(hex_digest[:3]) / (hex_digest[3:] + '.' + suffix) - - -def mapping_digest_p3_p3( - hasher: Callable, - pid: str, - suffix: str, -) -> Path: - hex_digest = get_hex_digest(hasher, pid) - return Path(hex_digest[:3]) / hex_digest[3:6] / (hex_digest[6:] + '.' + suffix) - - -def mapping_digest(hasher: Callable, pid: str, suffix: str) -> Path: - hex_digest = get_hex_digest(hasher, pid) - return Path(hex_digest + '.' 
+ suffix) - - -def mapping_after_last_colon(pid: str, suffix: str) -> Path: - plain_result = pid.split(':')[-1] - # Escape any colons and slashes in the pid - escaped_result = ( - plain_result.replace('_', '__').replace('/', '_s').replace('.', '_d') - ) - return Path(escaped_result + '.' + suffix) - - -mapping_functions = { - MappingMethod.digest_md5: partial(mapping_digest, hashlib.md5), - MappingMethod.digest_md5_p3: partial(mapping_digest_p3, hashlib.md5), - MappingMethod.digest_md5_p3_p3: partial(mapping_digest_p3_p3, hashlib.md5), - MappingMethod.digest_sha1: partial(mapping_digest, hashlib.sha1), - MappingMethod.digest_sha1_p3: partial(mapping_digest_p3, hashlib.sha1), - MappingMethod.digest_sha1_p3_p3: partial(mapping_digest_p3_p3, hashlib.sha1), - MappingMethod.after_last_colon: mapping_after_last_colon, -} - - -def get_mapping_function_by_name(mapping_function_name: str) -> Callable: - return mapping_functions[MappingMethod(mapping_function_name)] - - -def get_mapping_function(collection_config: CollectionDirConfig): - return mapping_functions[collection_config.idfx] - - -def xxx_get_permissions(mode: TokenModes) -> TokenPermission: - return mode_mapping[mode] - - -class Config: - @staticmethod - def get_config_from_file(path: Path) -> GlobalConfig: - try: - return GlobalConfig(**yaml.load(path.read_text(), Loader=yaml.SafeLoader)) - except ScannerError as e: - msg = f'YAML-error while reading config file {path}: {e}' - raise ConfigError(msg) from e - except TypeError: - msg = f'Error in yaml file {path}: content is not a mapping' - raise ConfigError(msg) from None - except ValidationError as e: - msg = f'Pydantic-error reading config file {path}: {e}' - raise ConfigError(msg) from e - - @staticmethod - def get_config(path: Path, file_name=config_file_name) -> GlobalConfig: - return Config.get_config_from_file(path / file_name) - - @staticmethod - def get_collection_dir_config( - path: Path, - file_name: str = config_file_name, - ) -> CollectionDirConfig: 
- config_path = path / file_name - if not config_path.exists(): - msg = f'Config file does not exist: {config_path}' - raise ConfigError(msg) - try: - return CollectionDirConfig( - **yaml.load(config_path.read_text(), Loader=yaml.SafeLoader) - ) - except ScannerError as e: - msg = f'YAML-error while reading config file {config_path}: {e}' - raise ConfigError(msg) from e - except ValidationError as e: - msg = f'Pydantic-error reading config file {config_path}: {e}' - raise ConfigError(msg) from e - - -def new_process_config( - store_path: Path, - fastapi_app: FastAPI, - order_by: list[str], - globals_dict: dict[str, Any], -) -> XXXInstanceConfig: - global global_config_instance - - from dump_things_service.abstract_config import read_config - from dump_things_service.manifest import manifest_configuration - - global_config_instance = XXXInstanceConfig( - store_path=store_path, - fastapi_app=fastapi_app, - order_by=order_by, - ) - abstract_configuration = read_config(global_config_instance.store_path) - manifest_configuration(abstract_configuration, global_config_instance) - return global_config_instance - - -def get_config(): - return global_config_instance - - -def process_config( - store_path: Path, - config_file: Path, - order_by: list[str], - globals_dict: dict[str, Any], -) -> XXXInstanceConfig: - global global_config_instance - - config_object = Config.get_config_from_file(config_file) - global_config_instance = process_config_object( - store_path=store_path, - config_object=config_object, - order_by=order_by, - globals_dict=globals_dict, - ) - return global_config_instance - - -def process_config_object( - store_path: Path, - config_object: GlobalConfig, - order_by: list[str], - globals_dict: dict[str, Any], -): - from dump_things_service.auth.config import ConfigAuthenticationSource - from dump_things_service.auth.forgejo import ForgejoAuthenticationSource - - instance_config = XXXInstanceConfig(store_path=store_path) - instance_config.collections = 
config_object.collections - - for collection_name, collection_info in config_object.collections.items(): - # Create the authentication providers - instance_config.auth_providers[collection_name] = [] - - auth_provider_list = [] - # Check for multiple providers - for auth_provider in collection_info.auth_sources: - if auth_provider.type == 'config': - key = ('config',) - elif auth_provider.type == 'forgejo': - key = ( - 'forgejo', - auth_provider.url, - auth_provider.organization, - auth_provider.team, - auth_provider.label_type, - auth_provider.repository, - ) - else: - msg = f'Unknown authentication provider type: {auth_provider.type}' - raise ConfigError(msg) - if key in auth_provider_list: - logger.warning('Ignoring duplicated authentication provider: %s', key) - continue - auth_provider_list.append(key) - - for auth_provider in auth_provider_list: - if auth_provider[0] == 'config': - instance_config.auth_providers[collection_name].append( - ConfigAuthenticationSource( - instance_config=instance_config, - collection=collection_name, - ) - ) - else: - instance_config.auth_providers[collection_name].append( - ForgejoAuthenticationSource(*auth_provider[1:]) - ) - - # Set the default backend if not specified - backend = collection_info.backend or BackendConfigRecordDir( - type='record_dir+stl' - ) - - instance_config.backend[collection_name] = backend - backend_name, extension = get_backend_and_extension(backend.type) - if backend_name == 'record_dir': - # Get the config from the curated directory - collection_config = Config.get_collection_dir_config( - store_path / collection_info.curated - ) - schema = collection_config.schema - elif backend.type == 'sqlite': - schema = backend.schema - else: - msg = f'Unsupported backend `{collection_info.backend}` for collection `{collection_name}`.' 
- raise ConfigError(msg) - - # Generate the collection model - model, classes, model_var_name = get_model_for_schema(schema) - instance_config.model_info[collection_name] = model, classes, model_var_name - globals_dict[model_var_name] = model - - # Generate the curated stores - if backend_name == 'record_dir': - curated_store_backend = RecordDirStore( - root=store_path / collection_info.curated, - pid_mapping_function=get_mapping_function(collection_config), - suffix=collection_config.format, - order_by=order_by, - ) - curated_store_backend.build_index_if_needed(schema=schema) - elif backend.type == 'sqlite': - curated_store_backend = SQLiteBackend( - db_path=store_path / collection_info.curated / sqlite_record_file_name, - ) - else: - msg = f'Unsupported backend `{collection_info.backend}` for collection `{collection_name}`.' - raise ConfigError(msg) - - if extension == 'stl': - curated_store_backend = SchemaTypeLayer( - backend=curated_store_backend, - schema=schema, - ) - - curated_store = ModelStore( - schema=schema, - backend=curated_store_backend, - tags={ - 'id': collection_info.submission_tags.submitter_id_tag, - 'time': collection_info.submission_tags.submission_time_tag, - } - ) - - instance_config.curated_stores[collection_name] = curated_store - - if collection_info.incoming: - instance_config.incoming[collection_name] = collection_info.incoming - - instance_config.schemas[collection_name] = schema - if schema not in instance_config.conversion_objects: - instance_config.conversion_objects[schema] = get_conversion_objects(schema) - - # We do not create stores for tokens here, but leave it to the token - # authentication routine. 
- instance_config.token_stores[collection_name] = {} - - # Generate audit backends - instance_config.audit_backends[collection_name] = [] - for audit_backend in collection_info.audit_backends: - instance_config.audit_backends[collection_name].append( - GitAuditBackend(audit_backend.path, audit_backend.auto_flush_timeout) - ) - - # Create validator for each collection - for collection_name, _ in config_object.collections.items(): - instance_config.validators[collection_name] = FormatConverter( - schema=instance_config.schemas[collection_name], - input_format=Format.json, - output_format=Format.ttl, - ) - - # Resolve classes-blacklist and -whitelist - for collection_name, collection_info in config_object.collections.items(): - - model_info = instance_config.model_info[collection_name] - - # If the whitelist is present, get all whitelisted classes - if collection_info.use_classes: - # Check that the whitelisted classes exist - undefined = [ - name - for name in collection_info.use_classes - if name not in model_info[1] - ] - if undefined: - msg = ( - 'used class(es): ' - + ', '.join(undefined) - + ' not defined in schema: ' - + model_info[0].linkml_meta.root['id'] - ) - raise ConfigError(msg) - use_classes = collection_info.use_classes - else: - use_classes = model_info[1] - - # Check for blacklisted classes - undefined = [ - name - for name in collection_info.ignore_classes - if name not in use_classes - ] - if undefined: - msg = ( - 'ignored class(es): ' - + ', '.join(undefined) - + ' not defined in schema or in `used_classes`: ' - + model_info[0].linkml_meta.root['id'] - ) - raise ConfigError(msg) - - instance_config.use_classes[collection_name] = [ - name - for name in use_classes - if name not in collection_info.ignore_classes - ] - - # Read info for tokens from the configuration - for token_name, token_info in config_object.tokens.items(): - for collection_name, token_collection_info in token_info.collections.items(): - - if collection_name not in 
instance_config.hashed_tokens: - instance_config.hashed_tokens[collection_name] = {} - - if token_info.hashed: - token_id, _ = get_token_parts(token_name) - if token_id == '': - msg = 'empty ID in hashed token' - raise ConfigError(msg) - if token_id in instance_config.hashed_tokens[collection_name]: - msg = f'duplicated ID in hashed token: {token_id}' - raise ConfigError(msg) - instance_config.hashed_tokens[collection_name][token_id] = token_name - - if collection_name not in instance_config.tokens: - instance_config.tokens[collection_name] = {} - - permissions = get_permissions(token_collection_info.mode) - instance_config.tokens[collection_name][token_name] = { - 'permissions': permissions, - 'user_id': token_info.user_id, - 'incoming_label': token_collection_info.incoming_label, - } - - # There is only a token store if the token has incoming read- or - # incoming write-permissions. If a token store exists, we ensure - # that an incoming path is set and an incoming label exists. - if permissions.incoming_read or permissions.incoming_write: - # Check that the incoming label is set for a token that has - # access rights to incoming records. 
- if not token_collection_info.incoming_label: - msg = f'Token `{token_name}` with mode {token_collection_info.mode} must not have an empty `incoming_label`' - raise ConfigError(msg) - - if any(c in token_collection_info.incoming_label for c in ('\\', '/')): - msg = ( - f'Incoming label for token `...` on collection ' - f'`{collection_name}` must not contain slashes or ' - f'backslashes: `{token_collection_info.incoming_label}`' - ) - raise ConfigError(msg) - - if collection_name not in instance_config.incoming: - msg = ( - 'Incoming location not defined for collection ' - f'`{collection_name}`, which has at least one token ' - f'with write access' - ) - raise ConfigError(msg) - - # Create all incoming zones - incoming_location = ( - store_path - / instance_config.collections[collection_name].incoming - / token_collection_info.incoming_label - ) - incoming_location.mkdir(parents=True, exist_ok=True) - - # Check that default tokens are defined - for collection_name, collection_info in config_object.collections.items(): - if collection_info.default_token not in instance_config.tokens[collection_name]: - msg = f'Unknown default token: `{collection_info.default_token}`' - raise ConfigError(msg) - - # Check that config authentication source is present if tokens are defined - # in the config file - for collection_name, _ in config_object.collections.items(): - config_tokens = instance_config.tokens.get(collection_name, {}) - if config_tokens: - if not any( - isinstance(auth_source, ConfigAuthenticationSource) - for auth_source in instance_config.auth_providers[collection_name] - ): - msg = ( - f'Collection `{collection_name}` has tokens defined in ' - 'configuration file, but no `config` authentication source' - ) - raise ConfigError(msg) - - # Check that hashed plain tokens do not clash with hashed tokens: - hashed_plain_tokens = { - hash_token(token) - for collection in instance_config.collections - for token in instance_config.tokens[collection] - if '-' in token - } 
- hashed_tokens = { - value - for token_dict in instance_config.hashed_tokens.values() - for value in token_dict.values() - } - if hashed_plain_tokens.intersection(hashed_tokens): - msg = 'plain tokens clash with hashed tokens' - raise ConfigError(msg) - - # Check tags - for collection_name, collection_info in config_object.collections.items(): - module = instance_config.model_info[collection_name][0] - try: - resolve_curie(module, collection_info.submission_tags.submission_time_tag) - except CurieResolutionError as e: - raise ConfigError(str(e)) from e - - return instance_config - - -def get_backend_and_extension(backend_type: str) -> tuple[str, str]: - elements = backend_type.split('+') - return (elements[0], elements[1]) if len(elements) > 1 else (elements[0], '') - - -def get_zone( - instance_config: XXXInstanceConfig, - collection: str, - token: str, -) -> str | None: - """Get the zone for the given collection and token.""" - if collection not in instance_config.zones: - raise HTTPException( - status_code=HTTP_404_NOT_FOUND, - detail=f'No incoming zone defined for collection: {collection}', - ) - if token not in instance_config.zones[collection]: - raise HTTPException( - status_code=HTTP_404_NOT_FOUND, - detail=f'Missing incoming_label for given token in collection: {collection}', - ) - return instance_config.zones[collection][token] - - -def get_conversion_objects_for_collection( - instance_config: XXXInstanceConfig, - collection_name: str, -) -> dict: - """Get the conversion objects for the given collection.""" - check_collection(instance_config, collection_name) - return instance_config.conversion_objects[instance_config.schemas[collection_name]] - - -def get_model_info_for_collection( - instance_config: XXXInstanceConfig, - collection_name: str, -) -> tuple[types.ModuleType, dict[str, Any], str]: - check_collection(instance_config, collection_name) - return instance_config.model_info[collection_name] -- 2.52.0 From 0edba2a2fb14ae428542fc0142b5e915c99b43ba 
Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 13 May 2026 09:51:48 +0200 Subject: [PATCH 07/64] add token- and collection deletion --- dump_things_service/abstract_config.py | 26 +- dump_things_service/admin.py | 23 +- dump_things_service/collection.py | 18 - dump_things_service/collection_endpoints.py | 61 ++ dump_things_service/curated.py | 64 -- dump_things_service/dynamic_endpoints.py | 136 ---- dump_things_service/incoming.py | 87 --- dump_things_service/instance_state.py | 627 ------------------ dump_things_service/main.py | 98 --- .../tests/test_collection_administration.py | 210 ++++++ .../tests/test_pid_resolution.py | 38 -- dump_things_service/token_endpoints.py | 87 ++- 12 files changed, 370 insertions(+), 1105 deletions(-) delete mode 100644 dump_things_service/dynamic_endpoints.py create mode 100644 dump_things_service/tests/test_collection_administration.py diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index a33b04a..ba4f4df 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -8,6 +8,7 @@ from pathlib import ( from typing import ( Iterable, Literal, + cast, ) from fastapi import HTTPException @@ -116,7 +117,6 @@ class TokenModes(enum.Enum): SUBMIT_ONLY = 'SUBMIT_ONLY' NOTHING = 'NOTHING' CURATOR = 'CURATOR' - ADMIN = 'ADMIN' class TokenPermission(BaseModel): @@ -232,7 +232,7 @@ def get_config() -> Configuration: if not g_abstract_configuration: msg = 'Configuration not yet loaded' raise RuntimeError(msg) - return g_abstract_configuration + return cast(Configuration, g_abstract_configuration) def store_config( @@ -267,28 +267,6 @@ def tokens_for_collection( ) -def get_zone( - configuration: Configuration, - collection: str, - token: str, -) -> str | None: - """Get the zone for the given collection and token.""" - check_collection(configuration, collection) - - assert False - if collection not in configuration.collections: - raise HTTPException( - 
status_code=HTTP_404_NOT_FOUND, - detail=f'No incoming zone defined for collection: {collection}', - ) - if token not in instance_config.zones[collection]: - raise HTTPException( - status_code=HTTP_404_NOT_FOUND, - detail=f'Missing incoming_label for given token in collection: {collection}', - ) - return instance_config.zones[collection][token] - - def check_collection( abstract_config: Configuration, collection: str, diff --git a/dump_things_service/admin.py b/dump_things_service/admin.py index 375645b..2f95b59 100644 --- a/dump_things_service/admin.py +++ b/dump_things_service/admin.py @@ -18,19 +18,20 @@ def authenticate_admin( abstract_config: Configuration, api_key: str, ): - if api_key == instance_state.bootstrap_token: - logger.info('authenticate_admin: using bootstrap token') - return - - hashed_token_representation = hash_token_representation(api_key) - for token_name, token_config in abstract_config.admin_tokens.items(): - if token_config.representation == hashed_token_representation: - logger.info( - 'authenticate_admin: using token %s', - token_name, - ) + if api_key: + if api_key == instance_state.bootstrap_token: + logger.info('authenticate_admin: using bootstrap token') return + hashed_token_representation = hash_token_representation(api_key) + for token_name, token_config in abstract_config.admin_tokens.items(): + if token_config.representation == hashed_token_representation: + logger.info( + "authenticate_admin: using token '%s'", + token_name, + ) + return + raise HTTPException( status_code=HTTP_401_UNAUTHORIZED, detail='Invalid admin token', diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index 99c08fa..5ceba17 100644 --- a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -245,24 +245,6 @@ def create_collection( output_format=Format.ttl, ) - x = """ - if incoming_path: - if collection_name not in instance_state.incoming_stores: - instance_state.incoming_stores[collection_name] = {} - 
- # Create a store for each incoming area label in this collection - for token_name, path in active_incoming_store_info: - label_path = incoming_path / path - incoming_store = create_store( - instance_state, - label_path, - collection_configuration.backend, - collection_configuration.schema, - collection_configuration.submission_tags, - ) - instance_state.incoming_stores[collection_name][token_name] = incoming_store - """ - # Create the authentication sources for authentication_spec in collection_configuration.auth_sources: create_authentication_source( diff --git a/dump_things_service/collection_endpoints.py b/dump_things_service/collection_endpoints.py index b17be1b..a727234 100644 --- a/dump_things_service/collection_endpoints.py +++ b/dump_things_service/collection_endpoints.py @@ -12,6 +12,7 @@ from pydantic import BaseModel from dump_things_service import ( HTTP_201_CREATED, + HTTP_404_NOT_FOUND, HTTP_409_CONFLICT, ) from dump_things_service.abstract_config import ( @@ -111,3 +112,63 @@ async def get_collections( # Check admin rights authenticate_admin(instance_state, abstract_config, api_key) return list(abstract_config.collections.values()) + + +@router.get( + '/collections/{collection_name}', + tags=['Administration interface'], + name='Get existing collection by name', +) +async def get_collections( + collection_name: str, + api_key: str = Depends(api_key_header_scheme), +) -> CollectionConfig: + + instance_state = get_instance_state() + abstract_config = get_config() + + # Check admin rights + authenticate_admin(instance_state, abstract_config, api_key) + + if collection_name not in abstract_config.collections: + raise HTTPException( + status_code=HTTP_404_NOT_FOUND, + detail=f"Collection with name '{collection_name}' does not exist.", + ) + return abstract_config.collections[collection_name] + + +@router.delete( + '/collections/{collection_name}', + tags=['Administration interface'], + name='Delete collection with name', +) +async def 
delete_collections( + collection_name: str, + api_key: str = Depends(api_key_header_scheme), +): + + instance_state = get_instance_state() + abstract_config = get_config() + + # Check admin rights + authenticate_admin(instance_state, abstract_config, api_key) + + if collection_name not in abstract_config.collections: + raise HTTPException( + status_code=HTTP_404_NOT_FOUND, + detail=f"Collection with name '{collection_name}' does not exist.", + ) + + # Update the abstract configuration + del abstract_config.collections[collection_name] + + # Manifest the abstract configuration + with wrap_http_exception(ConfigError): + manifest_configuration(abstract_config, instance_state) + + # Persist the abstract configuration + store_config( + store_path=instance_state.store_path, + config=abstract_config, + ) diff --git a/dump_things_service/curated.py b/dump_things_service/curated.py index 2f6ef0b..dc79197 100644 --- a/dump_things_service/curated.py +++ b/dump_things_service/curated.py @@ -292,70 +292,6 @@ def _get_store_and_backend( return model_store, backend -def xxx_create_curated_endpoints( - app: FastAPI, - tag_info: list[dict[str, str]], - placeholder: str, - global_dict: dict, -): - # Create endpoints for all classes in all collections - logger.info('Creating dynamic curated endpoints...') - serial_number = count() - - instance_state = get_instance_state() - abstract_config = read_config(instance_state.store_path) - generated_tags = [] - - for collection, ( - model, - classes, - model_var_name, - ) in instance_config.model_info.items(): - - tag_name = f'Curated area: write records to curated area of collection "{collection}"' - - if model_var_name not in global_dict: - global_dict[model_var_name] = model - - for class_name in instance_state.collections[collection].active_classes: - - # Create an endpoint to dump data of type `class_name` of schema - # `application`. 
- endpoint_name = f'_endpoint_curated_{next(serial_number)}' - - endpoint_source = _endpoint_curated_template.format( - name=endpoint_name, - model_var_name=model_var_name, - class_name=class_name, - collection=collection, - info=f"'store {collection}/{class_name} objects'", - ) - exec(endpoint_source, global_dict) # noqa S102 - - # Create an API route for the endpoint - app.add_api_route( - path=f'/{collection}/curated/record/{class_name}', - endpoint=global_dict[endpoint_name], - methods=['POST'], - name=f'curated area: store "{class_name}" object (schema: {model.linkml_meta["id"]})', - response_model=None, - tags=[tag_name] - ) - - generated_tags.append({ - 'name': tag_name, - 'description': f'(requires **curator token**)', - }) - - index = tag_info.index({'name': placeholder, 'description': ''}) - tag_info[index:index + 1] = generated_tags - - logger.info( - 'Creation of %d curated endpoints completed.', - next(serial_number), - ) - - def store_curated_record( collection: str, data: BaseModel, diff --git a/dump_things_service/dynamic_endpoints.py b/dump_things_service/dynamic_endpoints.py deleted file mode 100644 index 0c01c53..0000000 --- a/dump_things_service/dynamic_endpoints.py +++ /dev/null @@ -1,136 +0,0 @@ -import logging -from itertools import count - -from fastapi import FastAPI - -from dump_things_service import config -from dump_things_service.abstract_config import Configuration -from dump_things_service.instance_state import InstanceState - - -logger = logging.getLogger('dump_things_service') - - -_endpoint_template = """ -async def {name}( - data: {model_var_name}.{class_name} | Annotated[str, Body(media_type='text/plain')], - api_key: str = Depends(api_key_header_scheme), - format: Format = Format.json, -) -> JSONResponse | PlainTextResponse: - logger.info('{name}(%s, %s, %s, %s)', repr(data), repr('{class_name}'), repr({model_var_name}), repr(format)) - return {handler}('{collection}', data, '{class_name}', {model_var_name}, format, api_key) 
-""" - - -def xxx_create_store_endpoints( - app: FastAPI, - configuration: Configuration, - tag_info: list[dict[str, str]], - placeholder: str, - global_dict: dict, -): - # Create endpoints for all classes in all collections - logger.info('Creating dynamic store_record endpoints...') - serial_number = count() - - generated_tags = [] - - for collection_name, collection in configuration.collections.items(): - - tag_name = f'Write records to collection "{collection}"' - - model, model_var_name = None, 'model_' + collection_name - - global_dict[model_var_name] = model - for class_name in instance_state.use_classes[collection]: - - # Create an endpoint to dump data of type `class_name` in version - # `version` of schema `application`. - endpoint_name = f'_endpoint_{next(serial_number)}' - - endpoint_source = _endpoint_template.format( - name=endpoint_name, - model_var_name=model_var_name, - class_name=class_name, - collection=collection, - info=f"'store {collection}/{class_name} objects'", - handler='store_record', - ) - exec(endpoint_source, global_dict) # noqa S102 - - # Create an API route for the endpoint - app.add_api_route( - path=f'/{collection}/record/{class_name}', - endpoint=global_dict[endpoint_name], - methods=['POST'], - name=f'store "{class_name}" object (schema: {model.linkml_meta["id"]})', - response_model=None, - tags=[tag_name] - ) - - generated_tags.append({ - 'name': tag_name, - 'description': '', - }) - - index = tag_info.index({'name': placeholder, 'description': ''}) - tag_info[index:index + 1] = generated_tags - - logger.info('Creation of %d endpoints completed.', next(serial_number)) - - -def xxx_create_validate_endpoints( - app: FastAPI, - configuration: Configuration, - tag_info: list[dict[str, str]], - placeholder: str, - global_dict: dict, -): - # Create endpoints for all classes in all collections - logger.info('Creating dynamic validate_record endpoints...') - serial_number = count() - - generated_tags = [] - - for collection_name, 
collection in configuration.collections.items(): - - tag_name = f'Validate records for collection "{collection}"' - - model, model_var_name = None, 'model_' + collection_name - - global_dict[model_var_name] = model - for class_name in instance_state.use_classes[collection]: - - # Create an endpoint to dump data of type `class_name` in version - # `version` of schema `application`. - endpoint_name = f'_endpoint_validate_{next(serial_number)}' - - endpoint_source = _endpoint_template.format( - name=endpoint_name, - model_var_name=model_var_name, - class_name=class_name, - collection=collection, - info=f"'validate {collection}/{class_name} objects'", - handler='validate_record', - ) - exec(endpoint_source, global_dict) # noqa S102 - - # Create an API route for the endpoint - instance_state.fastapi_app.add_api_route( - path=f'/{collection}/validate/record/{class_name}', - endpoint=global_dict[endpoint_name], - methods=['POST'], - name=f'Validate "{class_name}" object (schema: {model.linkml_meta["id"]})', - response_model=None, - tags=[tag_name] - ) - - generated_tags.append({ - 'name': tag_name, - 'description': '', - }) - - index = tag_info.index({'name': placeholder, 'description': ''}) - tag_info[index:index + 1] = generated_tags - - logger.info('Creation of %d endpoints completed.', next(serial_number)) diff --git a/dump_things_service/incoming.py b/dump_things_service/incoming.py index bffd745..2d888cd 100644 --- a/dump_things_service/incoming.py +++ b/dump_things_service/incoming.py @@ -1,13 +1,11 @@ from __future__ import annotations import logging -from itertools import count from typing import TYPE_CHECKING from fastapi import ( APIRouter, Depends, - FastAPI, HTTPException, ) from fastapi_pagination import ( @@ -49,28 +47,6 @@ if TYPE_CHECKING: from dump_things_service.store.model_store import _ModelStore -_endpoint_incoming_template = """ -async def {name}( - data: {model_var_name}.{class_name}, - label: str, - api_key: str = Depends(api_key_header_scheme), 
-) -> JSONResponse: - logger.info( - '{name}(%s, %s, %s)', - repr(data), - repr(label), - repr({model_var_name}), - ) - return await store_incoming_record( - '{collection}', - label, - data, - '{class_name}', - api_key, - ) -""" - - logger = logging.getLogger('dump_things_service') router = APIRouter() add_pagination(router) @@ -384,69 +360,6 @@ async def authorize_zones( ) -def xxx_create_incoming_endpoints( - app: FastAPI, - tag_info: list[dict[str, str]], - placeholder: str, - global_dict: dict, -): - # Create endpoints for all classes in all collections - logger.info('Creating dynamic incoming endpoints...') - serial_number = count() - - instance_state = get_instance_state() - generated_tags = [] - - for collection, ( - model, - classes, - model_var_name, - ) in instance_state.model_info.items(): - - tag_name = f'Incoming area: write records to the given incoming area of collection "{collection}"' - - if model_var_name not in global_dict: - global_dict[model_var_name] = model - - for class_name in instance_state.use_classes[collection]: - - # Create an endpoint to dump data of type `class_name` of schema - # `model`. 
- endpoint_name = f'_endpoint_incoming_{next(serial_number)}' - - endpoint_source = _endpoint_incoming_template.format( - name=endpoint_name, - model_var_name=model_var_name, - class_name=class_name, - collection=collection, - info=f"'store {collection}/{class_name} objects'", - ) - exec(endpoint_source, global_dict) # noqa S102 - - # Create an API route for the endpoint - app.add_api_route( - path=f'/{collection}/incoming/{{label}}/record/{class_name}', - endpoint=global_dict[endpoint_name], - methods=['POST'], - name=f'incoming area: store "{class_name}" object (schema: {model.linkml_meta["id"]})', - response_model=None, - tags=[tag_name] - ) - - generated_tags.append({ - 'name': tag_name, - 'description': f'(requires **curator token**)', - }) - - index = tag_info.index({'name': placeholder, 'description': ''}) - tag_info[index:index + 1] = generated_tags - - logger.info( - 'Creation of %d incoming endpoints completed.', - next(serial_number), - ) - - async def store_incoming_record( collection: str, label: str, diff --git a/dump_things_service/instance_state.py b/dump_things_service/instance_state.py index 4cd9942..90c49e4 100644 --- a/dump_things_service/instance_state.py +++ b/dump_things_service/instance_state.py @@ -8,7 +8,6 @@ from types import ModuleType from typing import ( Any, Callable, - cast, ) import yaml @@ -22,7 +21,6 @@ from dump_things_service.abstract_config import ( MappingMethod, mapping_functions, ) -from dump_things_service.auth.config import ConfigAuthenticationSource from dump_things_service.converter import get_conversion_objects from dump_things_service.exceptions import ( @@ -100,32 +98,6 @@ class InstanceState: order_by: list[str] = dataclasses.field(default_factory=list) all_stores: dict = dataclasses.field(default_factory=dict) - # OLD STUFF - #schemas: dict = dataclasses.field(default_factory=dict) - #conversion_objects: dict = dataclasses.field(default_factory=dict) - #model_info: dict = dataclasses.field(default_factory=dict) - - 
#order_by: list[str] = dataclasses.field(default_factory=list) - #collections: dict = dataclasses.field(default_factory=dict) - #curated_stores: dict = dataclasses.field(default_factory=dict) - #incoming_stores: dict = dataclasses.field(default_factory=dict) - #incoming: dict = dataclasses.field(default_factory=dict) - #zones: dict = dataclasses.field(default_factory=dict) - #permissions: dict = dataclasses.field(default_factory=dict) - #model_info: dict = dataclasses.field(default_factory=dict) - # token_stores: dict = dataclasses.field(default_factory=dict) - #schemas: dict = dataclasses.field(default_factory=dict) - #conversion_objects: dict = dataclasses.field(default_factory=dict) - #backend: dict = dataclasses.field(default_factory=dict) - #tokens: dict = dataclasses.field(default_factory=dict) - #hashed_tokens: dict = dataclasses.field(default_factory=dict) - #validators: dict = dataclasses.field(default_factory=dict) - #use_classes: dict = dataclasses.field(default_factory=dict) - #maintenance_mode: set = dataclasses.field(default_factory=set) - #audit_backends: dict = dataclasses.field(default_factory=dict) - #xxx_tokens: dict = dataclasses.field(default_factory=dict) - #xxx_collections: dict = dataclasses.field(default_factory=dict) - g_instance_state:InstanceState | None = None @@ -183,602 +155,3 @@ def get_mapping_function_by_name(mapping_function_name: str) -> Callable: def get_mapping_function(collection_config: RecordDirConfigFileContent): return mapping_functions[collection_config.idfx] - - -x = """ -class StrictModel(BaseModel): - model_config = ConfigDict(extra='forbid') - - -class MappingMethod(enum.Enum): - digest_md5 = 'digest-md5' - digest_md5_p3 = 'digest-md5-p3' - digest_md5_p3_p3 = 'digest-md5-p3-p3' - digest_sha1 = 'digest-sha1' - digest_sha1_p3 = 'digest-sha1-p3' - digest_sha1_p3_p3 = 'digest-sha1-p3-p3' - after_last_colon = 'after-last-colon' - - -class CollectionDirConfig(StrictModel): - type: Literal['records'] - version: Literal[1] - 
schema: str - format: Literal['yaml'] - idfx: MappingMethod - - -class TokenModes(enum.Enum): - READ_CURATED = 'READ_CURATED' - READ_COLLECTION = 'READ_COLLECTION' - WRITE_COLLECTION = 'WRITE_COLLECTION' - READ_SUBMISSIONS = 'READ_SUBMISSIONS' - WRITE_SUBMISSIONS = 'WRITE_SUBMISSIONS' - SUBMIT = 'SUBMIT' - SUBMIT_ONLY = 'SUBMIT_ONLY' - NOTHING = 'NOTHING' - CURATOR = 'CURATOR' - ADMIN = 'ADMIN' - - -class TokenCollectionConfig(BaseModel): - model_config = ConfigDict(extra='forbid', use_enum_values=True) - mode: TokenModes - incoming_label: str = Field(strict=True) - - -class TokenConfig(StrictModel): - user_id: str - collections: dict[str, TokenCollectionConfig] - hashed: bool = False - - -class BackendConfigRecordDir(StrictModel): - type: Literal['record_dir', 'record_dir+stl'] - - -class BackendConfigSQLite(StrictModel): - type: Literal['sqlite', 'sqlite+stl'] - schema: str - - -class ForgejoAuthConfig(StrictModel): - type: Literal['forgejo'] - url: str - organization: str - team: str - label_type: Literal['team', 'user'] - instance_id: str | None = None - repository: str | None = None - - -class ConfigAuthConfig(StrictModel): - type: Literal['config'] = 'config' - - -class GitAuditBackendConfig(StrictModel): - type: Literal['gitaudit'] - path: Path - auto_flush_timeout: int = 60 - - -class TagConfig(StrictModel): - submitter_id_tag: str = 'http://purl.obolibrary.org/obo/NCIT_C54269' - submission_time_tag: str = 'http://semanticscience.org/resource/SIO_001083' - - -class CollectionConfig(StrictModel): - default_token: str - curated: Path - incoming: Path | None = None - backend: BackendConfigRecordDir | BackendConfigSQLite | None = None - auth_sources: list[ForgejoAuthConfig | ConfigAuthConfig] = [ConfigAuthConfig()] - submission_tags: TagConfig = TagConfig() - use_classes: list[str] = dataclasses.field(default_factory=list) - ignore_classes: list[str] = dataclasses.field(default_factory=list) - audit_backends: list[GitAuditBackendConfig] = 
dataclasses.field(default_factory=list) - - -class GlobalConfig(StrictModel): - model_config = ConfigDict(strict=True) - - type: Literal['collections'] - version: Literal[1] - collections: dict[str, CollectionConfig] - tokens: dict[str, TokenConfig] - - -mode_mapping = { - TokenModes.READ_CURATED: TokenPermission(curated_read=True), - TokenModes.READ_COLLECTION: TokenPermission( - curated_read=True, - incoming_read=True, - ), - TokenModes.WRITE_COLLECTION: TokenPermission( - curated_read=True, - incoming_read=True, - incoming_write=True, - ), - TokenModes.READ_SUBMISSIONS: TokenPermission(incoming_read=True), - TokenModes.WRITE_SUBMISSIONS: TokenPermission( - incoming_read=True, - incoming_write=True, - ), - TokenModes.SUBMIT: TokenPermission(curated_read=True, incoming_write=True), - TokenModes.SUBMIT_ONLY: TokenPermission(incoming_write=True), - TokenModes.NOTHING: TokenPermission(), - TokenModes.CURATOR: TokenPermission( - curated_read=True, - incoming_read=True, - incoming_write=True, - curated_write=True, - zones_access=True, - ), - TokenModes.ADMIN: TokenPermission( - curated_read=True, - incoming_read=True, - incoming_write=True, - curated_write=True, - zones_access=True, - admin=True, - ) -} - - -def get_hex_digest(hasher: Callable, data: str) -> str: - hash_context = hasher(data.encode()) - return hash_context.hexdigest() - - -def mapping_digest_p3( - hasher: Callable, - pid: str, - suffix: str, -) -> Path: - hex_digest = get_hex_digest(hasher, pid) - return Path(hex_digest[:3]) / (hex_digest[3:] + '.' + suffix) - - -def mapping_digest_p3_p3( - hasher: Callable, - pid: str, - suffix: str, -) -> Path: - hex_digest = get_hex_digest(hasher, pid) - return Path(hex_digest[:3]) / hex_digest[3:6] / (hex_digest[6:] + '.' + suffix) - - -def mapping_digest(hasher: Callable, pid: str, suffix: str) -> Path: - hex_digest = get_hex_digest(hasher, pid) - return Path(hex_digest + '.' 
+ suffix) - - -def mapping_after_last_colon(pid: str, suffix: str) -> Path: - plain_result = pid.split(':')[-1] - # Escape any colons and slashes in the pid - escaped_result = ( - plain_result.replace('_', '__').replace('/', '_s').replace('.', '_d') - ) - return Path(escaped_result + '.' + suffix) - - -mapping_functions = { - MappingMethod.digest_md5: partial(mapping_digest, hashlib.md5), - MappingMethod.digest_md5_p3: partial(mapping_digest_p3, hashlib.md5), - MappingMethod.digest_md5_p3_p3: partial(mapping_digest_p3_p3, hashlib.md5), - MappingMethod.digest_sha1: partial(mapping_digest, hashlib.sha1), - MappingMethod.digest_sha1_p3: partial(mapping_digest_p3, hashlib.sha1), - MappingMethod.digest_sha1_p3_p3: partial(mapping_digest_p3_p3, hashlib.sha1), - MappingMethod.after_last_colon: mapping_after_last_colon, -} - - -def get_mapping_function_by_name(mapping_function_name: str) -> Callable: - return mapping_functions[MappingMethod(mapping_function_name)] - - -def get_mapping_function(collection_config: CollectionDirConfig): - return mapping_functions[collection_config.idfx] - - -def get_permissions(mode: TokenModes) -> TokenPermission: - return mode_mapping[mode] - - -class Config: - @staticmethod - def get_config_from_file(path: Path) -> GlobalConfig: - try: - return GlobalConfig(**yaml.load(path.read_text(), Loader=yaml.SafeLoader)) - except ScannerError as e: - msg = f'YAML-error while reading config file {path}: {e}' - raise ConfigError(msg) from e - except TypeError: - msg = f'Error in yaml file {path}: content is not a mapping' - raise ConfigError(msg) from None - except ValidationError as e: - msg = f'Pydantic-error reading config file {path}: {e}' - raise ConfigError(msg) from e - - @staticmethod - def get_config(path: Path, file_name=config_file_name) -> GlobalConfig: - return Config.get_config_from_file(path / file_name) - - @staticmethod - def get_record_dir_config( - path: Path, - file_name: str = config_file_name, - ) -> CollectionDirConfig: - 
config_path = path / file_name - if not config_path.exists(): - msg = f'Config file does not exist: {config_path}' - raise ConfigError(msg) - try: - return CollectionDirConfig( - **yaml.load(config_path.read_text(), Loader=yaml.SafeLoader) - ) - except ScannerError as e: - msg = f'YAML-error while reading config file {config_path}: {e}' - raise ConfigError(msg) from e - except ValidationError as e: - msg = f'Pydantic-error reading config file {config_path}: {e}' - raise ConfigError(msg) from e - - -def new_process_config( - store_path: Path, - fastapi_app: FastAPI, - order_by: list[str], - globals_dict: dict[str, Any], -) -> InstanceConfig: - global global_config_instance - - from dump_things_service.abstract_config import read_config - from dump_things_service.manifest import manifest_configuration - - global_config_instance = InstanceConfig( - store_path=store_path, - fastapi_app=fastapi_app, - order_by=order_by, - ) - abstract_configuration = read_config(global_config_instance.store_path) - manifest_configuration(abstract_configuration, global_config_instance) - return global_config_instance - - -def get_config(): - return global_config_instance - - -def process_config( - store_path: Path, - config_file: Path, - order_by: list[str], - globals_dict: dict[str, Any], -) -> InstanceConfig: - global global_config_instance - - config_object = Config.get_config_from_file(config_file) - global_config_instance = process_config_object( - store_path=store_path, - config_object=config_object, - order_by=order_by, - globals_dict=globals_dict, - ) - return global_config_instance - - -def process_config_object( - store_path: Path, - config_object: GlobalConfig, - order_by: list[str], - globals_dict: dict[str, Any], -): - from dump_things_service.auth.config import ConfigAuthenticationSource - from dump_things_service.auth.forgejo import ForgejoAuthenticationSource - - instance_config = InstanceConfig(store_path=store_path) - instance_config.collections = 
config_object.collections - - for collection_name, collection_info in config_object.collections.items(): - # Create the authentication providers - instance_config.auth_providers[collection_name] = [] - - auth_provider_list = [] - # Check for multiple providers - for auth_provider in collection_info.auth_sources: - if auth_provider.type == 'config': - key = ('config',) - elif auth_provider.type == 'forgejo': - key = ( - 'forgejo', - auth_provider.url, - auth_provider.organization, - auth_provider.team, - auth_provider.label_type, - auth_provider.repository, - ) - else: - msg = f'Unknown authentication provider type: {auth_provider.type}' - raise ConfigError(msg) - if key in auth_provider_list: - logger.warning('Ignoring duplicated authentication provider: %s', key) - continue - auth_provider_list.append(key) - - for auth_provider in auth_provider_list: - if auth_provider[0] == 'config': - instance_config.auth_providers[collection_name].append( - ConfigAuthenticationSource( - instance_config=instance_config, - collection=collection_name, - ) - ) - else: - instance_config.auth_providers[collection_name].append( - ForgejoAuthenticationSource(*auth_provider[1:]) - ) - - # Set the default backend if not specified - backend = collection_info.backend or BackendConfigRecordDir( - type='record_dir+stl' - ) - - instance_config.backend[collection_name] = backend - backend_name, extension = get_backend_and_extension(backend.type) - if backend_name == 'record_dir': - # Get the config from the curated directory - collection_config = Config.get_record_dir_config( - store_path / collection_info.curated - ) - schema = collection_config.schema - elif backend.type == 'sqlite': - schema = backend.schema - else: - msg = f'Unsupported backend `{collection_info.backend}` for collection `{collection_name}`.' 
- raise ConfigError(msg) - - # Generate the collection model - model, classes, model_var_name = get_model_for_schema(schema) - instance_config.model_info[collection_name] = model, classes, model_var_name - globals_dict[model_var_name] = model - - # Generate the curated stores - if backend_name == 'record_dir': - curated_store_backend = RecordDirStore( - root=store_path / collection_info.curated, - pid_mapping_function=get_mapping_function(collection_config), - suffix=collection_config.format, - order_by=order_by, - ) - curated_store_backend.build_index_if_needed(schema=schema) - elif backend.type == 'sqlite': - curated_store_backend = SQLiteBackend( - db_path=store_path / collection_info.curated / sqlite_record_file_name, - ) - else: - msg = f'Unsupported backend `{collection_info.backend}` for collection `{collection_name}`.' - raise ConfigError(msg) - - if extension == 'stl': - curated_store_backend = SchemaTypeLayer( - backend=curated_store_backend, - schema=schema, - ) - - curated_store = ModelStore( - schema=schema, - backend=curated_store_backend, - tags={ - 'id': collection_info.submission_tags.submitter_id_tag, - 'time': collection_info.submission_tags.submission_time_tag, - } - ) - - instance_config.curated_stores[collection_name] = curated_store - - if collection_info.incoming: - instance_config.incoming[collection_name] = collection_info.incoming - - instance_config.schemas[collection_name] = schema - if schema not in instance_config.conversion_objects: - instance_config.conversion_objects[schema] = get_conversion_objects(schema) - - # We do not create stores for tokens here, but leave it to the token - # authentication routine. 
- instance_config.token_stores[collection_name] = {} - - # Generate audit backends - instance_config.audit_backends[collection_name] = [] - for audit_backend in collection_info.audit_backends: - instance_config.audit_backends[collection_name].append( - GitAuditBackend(audit_backend.path, audit_backend.auto_flush_timeout) - ) - - # Create validator for each collection - for collection_name, _ in config_object.collections.items(): - instance_config.validators[collection_name] = FormatConverter( - schema=instance_config.schemas[collection_name], - input_format=Format.json, - output_format=Format.ttl, - ) - - # Resolve classes-blacklist and -whitelist - for collection_name, collection_info in config_object.collections.items(): - - model_info = instance_config.model_info[collection_name] - - # If the whitelist is present, get all whitelisted classes - if collection_info.use_classes: - # Check that the whitelisted classes exist - undefined = [ - name - for name in collection_info.use_classes - if name not in model_info[1] - ] - if undefined: - msg = ( - 'used class(es): ' - + ', '.join(undefined) - + ' not defined in schema: ' - + model_info[0].linkml_meta.root['id'] - ) - raise ConfigError(msg) - use_classes = collection_info.use_classes - else: - use_classes = model_info[1] - - # Check for blacklisted classes - undefined = [ - name - for name in collection_info.ignore_classes - if name not in use_classes - ] - if undefined: - msg = ( - 'ignored class(es): ' - + ', '.join(undefined) - + ' not defined in schema or in `used_classes`: ' - + model_info[0].linkml_meta.root['id'] - ) - raise ConfigError(msg) - - instance_config.use_classes[collection_name] = [ - name - for name in use_classes - if name not in collection_info.ignore_classes - ] - - # Read info for tokens from the configuration - for token_name, token_info in config_object.tokens.items(): - for collection_name, token_collection_info in token_info.collections.items(): - - if collection_name not in 
instance_config.hashed_tokens: - instance_config.hashed_tokens[collection_name] = {} - - if token_info.hashed: - token_id, _ = get_token_parts(token_name) - if token_id == '': - msg = 'empty ID in hashed token' - raise ConfigError(msg) - if token_id in instance_config.hashed_tokens[collection_name]: - msg = f'duplicated ID in hashed token: {token_id}' - raise ConfigError(msg) - instance_config.hashed_tokens[collection_name][token_id] = token_name - - if collection_name not in instance_config.tokens: - instance_config.tokens[collection_name] = {} - - permissions = get_permissions(token_collection_info.mode) - instance_config.tokens[collection_name][token_name] = { - 'permissions': permissions, - 'user_id': token_info.user_id, - 'incoming_label': token_collection_info.incoming_label, - } - - # There is only a token store if the token has incoming read- or - # incoming write-permissions. If a token store exists, we ensure - # that an incoming path is set and an incoming label exists. - if permissions.incoming_read or permissions.incoming_write: - # Check that the incoming label is set for a token that has - # access rights to incoming records. 
- if not token_collection_info.incoming_label: - msg = f'Token `{token_name}` with mode {token_collection_info.mode} must not have an empty `incoming_label`' - raise ConfigError(msg) - - if any(c in token_collection_info.incoming_label for c in ('\\', '/')): - msg = ( - f'Incoming label for token `...` on collection ' - f'`{collection_name}` must not contain slashes or ' - f'backslashes: `{token_collection_info.incoming_label}`' - ) - raise ConfigError(msg) - - if collection_name not in instance_config.incoming: - msg = ( - 'Incoming location not defined for collection ' - f'`{collection_name}`, which has at least one token ' - f'with write access' - ) - raise ConfigError(msg) - - # Create all incoming zones - incoming_location = ( - store_path - / instance_config.collections[collection_name].incoming - / token_collection_info.incoming_label - ) - incoming_location.mkdir(parents=True, exist_ok=True) - - # Check that default tokens are defined - for collection_name, collection_info in config_object.collections.items(): - if collection_info.default_token not in instance_config.tokens[collection_name]: - msg = f'Unknown default token: `{collection_info.default_token}`' - raise ConfigError(msg) - - # Check that config authentication source is present if tokens are defined - # in the config file - for collection_name, _ in config_object.collections.items(): - config_tokens = instance_config.tokens.get(collection_name, {}) - if config_tokens: - if not any( - isinstance(auth_source, ConfigAuthenticationSource) - for auth_source in instance_config.auth_providers[collection_name] - ): - msg = ( - f'Collection `{collection_name}` has tokens defined in ' - 'configuration file, but no `config` authentication source' - ) - raise ConfigError(msg) - - # Check that hashed plain tokens do not clash with hashed tokens: - hashed_plain_tokens = { - hash_token(token) - for collection in instance_config.collections - for token in instance_config.tokens[collection] - if '-' in token - } 
- hashed_tokens = { - value - for token_dict in instance_config.hashed_tokens.values() - for value in token_dict.values() - } - if hashed_plain_tokens.intersection(hashed_tokens): - msg = 'plain tokens clash with hashed tokens' - raise ConfigError(msg) - - # Check tags - for collection_name, collection_info in config_object.collections.items(): - module = instance_config.model_info[collection_name][0] - try: - resolve_curie(module, collection_info.submission_tags.submission_time_tag) - except CurieResolutionError as e: - raise ConfigError(str(e)) from e - - return instance_config - - -def get_backend_and_extension(backend_type: str) -> tuple[str, str]: - elements = backend_type.split('+') - return (elements[0], elements[1]) if len(elements) > 1 else (elements[0], '') - - -def get_conversion_objects_for_collection( - configuration: Configuration, - instance_state: InstanceState, - collection: str, -) -> dict: - '''Get the conversion objects for the given collection.''' - check_collection(configuration, collection) - return instance_state.conversion_objects[collection] - - -def get_model_info_for_collection( - configuration: Configuration, - instance_state: InstanceState, - collection: str, -) -> tuple[types.ModuleType, dict[str, Any], str]: - '''Get the conversion objects for the given collection.''' - check_collection(configuration, collection) - return instance_state.model_info[collection] -""" diff --git a/dump_things_service/main.py b/dump_things_service/main.py index 4a80ee6..c8175a9 100644 --- a/dump_things_service/main.py +++ b/dump_things_service/main.py @@ -278,94 +278,6 @@ g_instance_state.fastapi_app.setup() add_pagination(g_instance_state.fastapi_app) -def xxx_store_record( - collection: str, - data: BaseModel | str, - class_name: str, - model: Any, - input_format: Format, - api_key: str | None = Depends(api_key_header_scheme), -) -> JSONResponse | PlainTextResponse: - if input_format == Format.json and isinstance(data, str): - raise HTTPException( - 
status_code=HTTP_400_BAD_REQUEST, detail='Invalid JSON data provided.' - ) - - if input_format == Format.ttl and not isinstance(data, str): - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail='Invalid ttl data provided.' - ) - - check_collection(g_instance_state, collection) - - token = ( - get_default_token_name(g_instance_state, collection) - if api_key is None - else api_key - ) - - # Get the token permissions and extend them by the default permissions. - # This call will also convert plaintext tokens into the hashed version of - # the token, if the token is hashed. This is necessary because we do not - # store the plaintext token, so all token-information is associated with - # the hashed representation of the token. - store, _, token_permissions, user_id = get_token_store( - g_instance_state, - collection, - token, - ) - final_permissions = join_default_token_permissions( - g_configuration, - g_instance_state, - token_permissions, - collection, - ) - if not final_permissions.incoming_write: - raise HTTPException( - status_code=HTTP_403_FORBIDDEN, - detail=f"Not authorized to submit to collection '{collection}'.", - ) - - if input_format == Format.ttl: - with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Conversion error'): - json_object = FormatConverter( - g_configuration.collections[collection].schema, - input_format=Format.ttl, - output_format=Format.json, - ).convert(data, class_name) - with wrap_http_exception(ValidationError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): - record = TypeAdapter(getattr(model, class_name)).validate_python(json_object) - else: - record = data - - with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): - g_instance_state.validators[collection].validate(record) - - with wrap_http_exception(CurieResolutionError): - stored_records = store.store_object(obj=record, submitter=user_id) - - if 
input_format == Format.ttl: - format_converter = FormatConverter( - g_configuration.collections[collection].schema, - input_format=Format.json, - output_format=Format.ttl, - ) - with wrap_http_exception(ValueError, header='Conversion error'): - return PlainTextResponse( - combine_ttl( - [ - format_converter.convert( - record, - class_name, - ) - for class_name, record in stored_records - ] - ), - media_type='text/turtle', - ) - return JSONResponse([record for _, record in stored_records]) - - @app.get('/', response_class=RedirectResponse) async def root() -> RedirectResponse: return RedirectResponse('/docs') @@ -722,16 +634,6 @@ async def delete_record( return True -# Create dynamic endpoints and rebuild the app to include all dynamically -# created endpoints. -#create_store_endpoints(app, g_instance_config, tag_info, 'placeholder_write', globals()) -#create_validate_endpoints(app, g_instance_config, tag_info, 'placeholder_validate', globals()) -#create_curated_endpoints(app, tag_info, 'placeholder_curated_write', globals()) -#create_incoming_endpoints(app, tag_info, 'placeholder_incoming_write', globals()) -#app.openapi_schema = None -#app.setup() - - def main(): uvicorn.run( app, diff --git a/dump_things_service/tests/test_collection_administration.py b/dump_things_service/tests/test_collection_administration.py new file mode 100644 index 0000000..b9e93e8 --- /dev/null +++ b/dump_things_service/tests/test_collection_administration.py @@ -0,0 +1,210 @@ +from pathlib import ( + Path, + PurePosixPath, +) + +from starlette.testclient import TestClient + +from dump_things_service import ( + HTTP_201_CREATED, + HTTP_200_OK, + HTTP_404_NOT_FOUND, + HTTP_401_UNAUTHORIZED, +) +from dump_things_service.abstract_config import ( + CollectionConfig, + TokenCollectionConfig, + TokenModes, + hash_token_representation, +) +from dump_things_service.token_endpoints import ( + TokenRequest, + AdminTokenRequest, +) +from dump_things_service.utils import cleaned_json + +# String 
representation of curated- and incoming-path +curated = 'admin_test_curated' +incoming = 'admin_test_incoming' + +# Path to a local simple test schema +test_schema_location = str((Path(__file__).parent / 'testschema.yaml').absolute()) + +new_collection_name = 'admin_test_collection' +new_token_name = 'admin_test_token' +new_token_representation = 'admin_test_token' +new_collection_config = CollectionConfig( + name=new_collection_name, + default_token='test_default_token', + curated=PurePosixPath(f'{curated}/admin_test_collection'), + schema=test_schema_location, + incoming=PurePosixPath(f'{incoming}/admin_test_collection'), +) + +new_token_request = TokenRequest( + name=new_token_name, + user_id='admin_test_token_user', + hashed=False, + representation=new_token_representation, + collections={ + new_collection_name: TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + ) + }, +) + +new_admin_token_name='New_Admin_Token' +plain_new_admin_token = 'admin-XXX' +new_admin_token_request = AdminTokenRequest( + name=new_admin_token_name, + representation=hash_token_representation(plain_new_admin_token) +) + + +def _name_in_openapi_paths( + test_client: TestClient, + name: str, +) -> bool: + response = test_client.get('/openapi.json') + open_api = response.json() + for path in open_api['paths'].keys(): + if name in path: + return True + return False + + +def test_collection_adding(fastapi_client_simple): + test_client, _ = fastapi_client_simple + + # Check the the collection does not yet exist + response = test_client.get( + f'/collections/{new_collection_name}', + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_404_NOT_FOUND + assert not _name_in_openapi_paths(test_client, new_collection_name) + + # Add a new collection + response = test_client.post( + '/collections', + headers={'x-dumpthings-token': 'admin-1'}, + json=new_collection_config.model_dump(mode='json'), + ) + assert response.status_code == HTTP_201_CREATED + assert 
_name_in_openapi_paths(test_client, new_collection_name) + + response = test_client.get( + f'/collections/{new_collection_name}', + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_200_OK + assert response.json() == new_collection_config.model_dump(mode='json') + + # Add a token to the collection + response = test_client.post( + '/tokens', + headers={'x-dumpthings-token': 'admin-1'}, + json=new_token_request.model_dump(mode='json'), + ) + assert response.status_code == HTTP_201_CREATED + + # Read the token back + response = test_client.get( + f'/tokens/{new_token_name}', + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_200_OK + assert response.json() == new_token_request.model_dump(mode='json') + + new_record = { + 'pid': 'http://example.com/admin-test-1', + 'given_name': 'Admin Test 1', + 'schema_type': 'abc:Person', + } + + # Add a record to the collection + response = test_client.post( + f'/{new_collection_name}/record/Person', + headers={'x-dumpthings-token': new_token_representation}, + json=new_record, + ) + assert response.status_code == HTTP_200_OK + + # Read the record back + response = test_client.get( + f'/{new_collection_name}/records/Person', + headers={'x-dumpthings-token': new_token_representation}, + ) + assert response.status_code == HTTP_200_OK + assert cleaned_json(response.json()[0], ('annotations',)) == new_record + + # Remove the token + response = test_client.delete( + f'/tokens/{new_token_name}', + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_200_OK + + # Check that posting is not possible with the removed token + response = test_client.post( + f'/{new_collection_name}/record/Person', + headers={'x-dumpthings-token': new_token_representation}, + json=new_record, + ) + assert response.status_code == HTTP_401_UNAUTHORIZED + + # Remove the collection + response = test_client.delete( + f'/collections/{new_collection_name}', + 
headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_200_OK + #assert not _name_in_openapi_paths(test_client, new_collection_name) + + +def test_admin_token_management(fastapi_client_simple): + test_client, _ = fastapi_client_simple + + # Check that the new admin token is not yet working + response = test_client.get( + f'/collections/collection_1', + headers={'x-dumpthings-token': plain_new_admin_token}, + ) + assert response.status_code == HTTP_401_UNAUTHORIZED + + # Add a new admin token + response = test_client.post( + '/admin_tokens', + headers={'x-dumpthings-token': 'admin-1'}, + json=new_admin_token_request.model_dump(mode='json'), + ) + assert response.status_code == HTTP_201_CREATED + + # Try the new token + response = test_client.get( + f'/collections/collection_1', + headers={'x-dumpthings-token': plain_new_admin_token}, + ) + assert response.status_code == HTTP_200_OK + + # Check that the token shows up in the token list + response = test_client.get( + f'/admin_tokens', + headers={'x-dumpthings-token': plain_new_admin_token}, + ) + assert response.status_code == HTTP_200_OK + assert new_admin_token_name in response.json() + + # Delete the new admin token + response = test_client.delete( + f'/admin_tokens/{new_admin_token_name}', + headers={'x-dumpthings-token': plain_new_admin_token}, + ) + assert response.status_code == HTTP_200_OK + + response = test_client.get( + f'/admin_tokens', + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_200_OK + assert new_admin_token_name not in response.json() diff --git a/dump_things_service/tests/test_pid_resolution.py b/dump_things_service/tests/test_pid_resolution.py index 05913eb..b6cb49a 100644 --- a/dump_things_service/tests/test_pid_resolution.py +++ b/dump_things_service/tests/test_pid_resolution.py @@ -15,41 +15,3 @@ def test_store_record_validation(fastapi_client_simple, pid, url_part): json={'pid': pid}, ) assert response.status_code == 
HTTP_422_UNPROCESSABLE_CONTENT - - -x = """ -def test_store_record_curated_with_unresolvable_pid(fastapi_client_simple): - test_client, _ = fastapi_client_simple - - # Store a record in two collections - response = test_client.post( - f'/collection_1/curated/record/Person', - headers={'x-dumpthings-token': 'token_curator'}, - json={'pid': 'unknown_prefix:test_pid'}, - ) - assert response.status_code == HTTP_422_UNPROCESSABLE_CONTENT - - -def test_store_record_incoming_with_unresolvable_pid(fastapi_client_simple): - test_client, _ = fastapi_client_simple - - # Store a record in two collections - response = test_client.post( - f'/collection_1/incoming/in_token_1/record/Person', - headers={'x-dumpthings-token': 'token_curator'}, - json={'pid': 'unknown_prefix:test_pid'}, - ) - assert response.status_code == HTTP_422_UNPROCESSABLE_CONTENT - - -def test_store_record_with_non_ascii_pid(fastapi_client_simple): - test_client, _ = fastapi_client_simple - - # Store a record in two collections - response = test_client.post( - f'/collection_1/record/Person', - headers={'x-dumpthings-token': 'token-1'}, - json={'pid': 'abc:test_pid'}, - ) - assert response.status_code == HTTP_422_UNPROCESSABLE_CONTENT -""" diff --git a/dump_things_service/token_endpoints.py b/dump_things_service/token_endpoints.py index e775202..a0fad99 100644 --- a/dump_things_service/token_endpoints.py +++ b/dump_things_service/token_endpoints.py @@ -17,10 +17,13 @@ from dump_things_service import ( HTTP_409_CONFLICT, ) from dump_things_service.abstract_config import ( + AdminTokenConfig, TokenConfig, + get_config, + get_token_info_by_representation, + hash_token_representation, read_config, - store_config, get_config, get_token_info_by_representation, - hash_token_representation, AdminTokenConfig, + store_config, ) from dump_things_service.admin import authenticate_admin from dump_things_service.api_key import api_key_header_scheme @@ -190,10 +193,39 @@ async def get_token_with_name( ) +@router.delete( + 
'/tokens/{token_name}', + tags=['Administration interface'], + name='Delete token with name', +) +async def get_token_with_name( + token_name: str, + api_key: str = Depends(api_key_header_scheme), +): + + instance_state = get_instance_state() + abstract_config = get_config() + + authenticate_admin(instance_state, abstract_config, api_key) + + abstract_config = read_config(store_path=instance_state.store_path) + if token_name not in abstract_config.tokens: + detail = f"token with name '{token_name}' does not exist." + raise HTTPException(status_code=HTTP_404_NOT_FOUND, detail=detail) + + # Store the new token in the configuration + del abstract_config.tokens[token_name] + + # Manifest the new configuration + with wrap_http_exception(ConfigError): + manifest_configuration(abstract_config, instance_state) + + @router.post( '/admin_tokens', tags=['Administration interface'], name='Add a new admin token', + status_code=HTTP_201_CREATED, ) async def create_admin_token( body: AdminTokenRequest, @@ -228,3 +260,54 @@ async def create_admin_token( store_path=instance_state.store_path, config=abstract_config, ) + + +@router.get( + '/admin_tokens', + tags=['Administration interface'], + name='Get admin token names', +) +async def create_admin_token( + api_key: str = Depends(api_key_header_scheme), +) -> list[str]: + instance_state = get_instance_state() + abstract_config = read_config(store_path=instance_state.store_path) + + authenticate_admin(instance_state, abstract_config, api_key) + + return list(abstract_config.admin_tokens) + ( + [] + if instance_state.bootstrap_token is None + else ['__bootstrap__'] + ) + + +@router.delete( + '/admin_tokens/{token_name}', + tags=['Administration interface'], + name='Delete admin token with name', +) +async def create_admin_token( + token_name: str, + api_key: str = Depends(api_key_header_scheme), +): + + instance_state = get_instance_state() + abstract_config = read_config(store_path=instance_state.store_path) + + 
authenticate_admin(instance_state, abstract_config, api_key) + + # Check for token existence + if token_name not in abstract_config.admin_tokens: + raise HTTPException( + status_code=HTTP_404_NOT_FOUND, + detail=f"Admin token with name '{token_name}' does not exist.", + ) + + del abstract_config.admin_tokens[token_name] + + # Persist the configuration. + store_config( + store_path=instance_state.store_path, + config=abstract_config, + ) -- 2.52.0 From fe0781a2edf743a6e1107952031f3eaf9573c12a Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sun, 17 May 2026 21:16:26 +0200 Subject: [PATCH 08/64] add tag_info generation Ensure that generated endpoints are tagged and sorted correctly in the openapi docs. --- dump_things_service/collection.py | 18 ++++++--- dump_things_service/instance_state.py | 1 + dump_things_service/main.py | 53 --------------------------- dump_things_service/manifest.py | 52 ++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 58 deletions(-) diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index 5ceba17..a0f95cf 100644 --- a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -54,10 +54,11 @@ from dump_things_service.backends.sqlite import ( from dump_things_service.backends.schema_type_layer import SchemaTypeLayer from dump_things_service.instance_state import ( InstanceState, + InstanceStateCollectionInfo, get_record_dir_config, get_instance_state, get_schema_info, - record_dir_config_file_name, InstanceStateCollectionInfo, + record_dir_config_file_name, ) from dump_things_service.store.model_store import ( _ModelStore, @@ -456,6 +457,7 @@ def create_endpoint( collection_config: CollectionConfig, template: str, handler: str, + tag_group: str, tag_name: str, app: FastAPI, ): @@ -492,6 +494,10 @@ def create_endpoint( tags=[tag_name] ) + if tag_group not in instance_state.tags: + instance_state.tags[tag_group] = list() + instance_state.tags[tag_group].append({'name': tag_name, 
'description': ''}) + logger.info( 'Creation of %d %s-endpoints completed.', len(active_classes), @@ -509,12 +515,13 @@ def create_endpoints_for_collection( operation_path, template, handler, + tag_group, tag_name, ) in ( - ('store', 'record', _endpoint_template, 'store_record', f'Write records to collection "{collection_config.name}"'), - ('validate', 'validate/record', _endpoint_template, 'validate_record', f'Validate records for collection "{collection_config.name}"'), - ('curated', 'curated/record', _endpoint_curated_template, 'store_curated_record', f'Store records in curated area of collection "{collection_config.name}"'), - ('incoming', 'incoming/{label}/record', _endpoint_incoming_template, 'store_incoming_record', f'Store records in incoming area "{{label}}" of collection "{collection_config.name}"'), + ('store', 'record', _endpoint_template, 'store_record', 'write', f'Write records to collection "{collection_config.name}"'), + ('validate', 'validate/record', _endpoint_template, 'validate_record', 'validate', f'Validate records for collection "{collection_config.name}"'), + ('curated', 'curated/record', _endpoint_curated_template, 'store_curated_record', 'curated_write', f'Curated area: store records in curated area of collection "{collection_config.name}"'), + ('incoming', 'incoming/{label}/record', _endpoint_incoming_template, 'store_incoming_record', 'incoming_write', f'Incoming area: store records in incoming area "{{label}}" of collection "{collection_config.name}"'), ): create_endpoint( operation_name=operation_name, @@ -523,6 +530,7 @@ def create_endpoints_for_collection( collection_config=collection_config, template=template, handler=handler, + tag_group=tag_group, tag_name=tag_name, app=app, ) diff --git a/dump_things_service/instance_state.py b/dump_things_service/instance_state.py index 90c49e4..9214ec1 100644 --- a/dump_things_service/instance_state.py +++ b/dump_things_service/instance_state.py @@ -97,6 +97,7 @@ class InstanceState: 
validators: dict = dataclasses.field(default_factory=dict) order_by: list[str] = dataclasses.field(default_factory=list) all_stores: dict = dataclasses.field(default_factory=dict) + tags: dict[str, list[dict]] = dataclasses.field(default_factory=dict) g_instance_state:InstanceState | None = None diff --git a/dump_things_service/main.py b/dump_things_service/main.py index c8175a9..f89d557 100644 --- a/dump_things_service/main.py +++ b/dump_things_service/main.py @@ -154,58 +154,6 @@ For more information refer to the [README-file](https://github.com/christian-mon of the project. """ -tag_info = [ - { - 'name': 'Server management', - 'description': 'General server operations', - }, - { - 'name': 'Read records', - 'description': 'Read records from the given collection', - }, - { - 'name': 'placeholder_write', - 'description': '', - }, - { - 'name': 'placeholder_validate', - 'description': '', - }, - { - 'name': 'Delete records', - 'description': 'Delete records from the incoming area associated with the authorization token', - }, - { - 'name': 'Curated area: read records', - 'description': 'Read records only from the curated area of the given collection (requires **curator token**)', - }, - { - 'name': 'placeholder_curated_write', - 'description': '', - }, - { - 'name': 'Curated area: delete records', - 'description': 'Delete records from the curated area of the given collection (requires **curator token**)', - }, - { - 'name': 'Incoming area: read labels', - 'description': 'Read labels of all incoming areas for the given collection (requires **curator token**)', - }, - { - 'name': 'Incoming area: read records', - 'description': 'Read records from the given incoming area of the given collection (requires **curator token**)', - }, - { - 'name': 'placeholder_incoming_write', - 'description': '', - }, - { - 'name': 'Incoming area: delete records', - 'description': 'Delete records from the given incoming area of the given collection (requires **curator token**)', - }, -] - 
- arguments = parser.parse_args() @@ -236,7 +184,6 @@ app = FastAPI( title='Dump Things Service', description=description, version=__version__, - openapi_tags=tag_info ) app.include_router(curated_router) diff --git a/dump_things_service/manifest.py b/dump_things_service/manifest.py index f7e9ca4..50e3963 100644 --- a/dump_things_service/manifest.py +++ b/dump_things_service/manifest.py @@ -13,6 +13,43 @@ from dump_things_service.instance_state import InstanceState logger = logging.getLogger('dump_things_service') +openapi_tags_template = [ + { + 'name': 'Server management', + 'description': 'General server operations', + }, + { + 'name': 'Read records', + 'description': 'Read records from the given collection', + }, + 'write', + 'validate', + { + 'name': 'Delete records', + 'description': 'Delete records from the incoming area associated with the authorization token', + }, + 'curated_write', + { + 'name': 'Curated area: delete records', + 'description': 'Delete records from the curated area of the given collection (requires **curator token**)', + }, + { + 'name': 'Incoming area: read labels', + 'description': 'Read labels of all incoming areas for the given collection (requires **curator token**)', + }, + { + 'name': 'Incoming area: read records', + 'description': 'Read records from the given incoming area of the given collection (requires **curator token**)', + }, + 'incoming_write', + { + 'name': 'Incoming area: delete records', + 'description': 'Delete records from the given incoming area of the given collection (requires **curator token**)', + }, +] + + + def manifest_configuration( configuration: Configuration, instance_state: InstanceState, @@ -95,6 +132,10 @@ def manifest_configuration( if new_collection_names or deleted_collection_names: instance_state.fastapi_app.openapi_schema = None + instance_state.fastapi_app.openapi_tags = create_openapi_tags( + instance_state, + openapi_tags_template, + ) instance_state.fastapi_app.setup() 
add_pagination(instance_state.fastapi_app) @@ -132,3 +173,14 @@ def delete_collection( # instance_state. Maybe all collection-specific information # should go into the instance_state.collection[x]-object!? # That would allow to remove it easily. + + +def create_openapi_tags( + instance_state: InstanceState, + openapi_tags_template: list[dict | str], +) -> list[dict]: + result = openapi_tags_template.copy() + for identifier, tag_list in instance_state.tags.items(): + index = result.index(identifier) + result[index:index + 1] = tag_list + return result -- 2.52.0 From effcbbca2fb8f7ce2ac91f25bef980ea378edbe9 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 18 May 2026 06:53:25 +0200 Subject: [PATCH 09/64] ensure that api-doc tags are deleted When a collection is deleted, the collection specific openapi doc tags are deleted as well. --- dump_things_service/collection.py | 20 ++++++++++---------- dump_things_service/instance_state.py | 2 +- dump_things_service/manifest.py | 26 +++++++++++++++++++++++--- 3 files changed, 34 insertions(+), 14 deletions(-) diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index a0f95cf..ead1134 100644 --- a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -236,7 +236,8 @@ def create_collection( if collection_configuration.ignore_classes: active_classes -= set(collection_configuration.ignore_classes) instance_state.collections[collection_name] = InstanceStateCollectionInfo( - active_classes=active_classes + active_classes=active_classes, + tag_info=dict(), ) # Create a validator for the collection @@ -467,26 +468,29 @@ def create_endpoint( collection_config.name, ) + collection_name = collection_config.name + instance_state.collections[collection_name].tag_info[tag_group] = tag_name + # TODO: get schema_info from instance_state!? 
model, classes, model_var_name = get_model_for_schema(collection_config.schema) globals()[model_var_name] = model - active_classes = instance_state.collections[collection_config.name].active_classes + active_classes = instance_state.collections[collection_name].active_classes for class_name in active_classes: - endpoint_name = f'_endpoint_{var_escape(collection_config.name)}_{operation_name}_{class_name}' + endpoint_name = f'_endpoint_{var_escape(collection_name)}_{operation_name}_{class_name}' endpoint_source = template.format( name=endpoint_name, model_var_name=model_var_name, class_name=class_name, - collection=collection_config.name, - info=f"'{operation_name} {collection_config.name}/{class_name} objects'", + collection=collection_name, + info=f"'{operation_name} {collection_name}/{class_name} objects'", handler=handler, ) exec(endpoint_source, globals()) # noqa S102 # Create an API route for the endpoint app.add_api_route( - path=f'/{collection_config.name}/{operation_path}/{class_name}', + path=f'/{collection_name}/{operation_path}/{class_name}', endpoint=globals()[endpoint_name], methods=['POST'], name=f'{operation_name} "{class_name}" object (schema: {model.linkml_meta["id"]})', @@ -494,10 +498,6 @@ def create_endpoint( tags=[tag_name] ) - if tag_group not in instance_state.tags: - instance_state.tags[tag_group] = list() - instance_state.tags[tag_group].append({'name': tag_name, 'description': ''}) - logger.info( 'Creation of %d %s-endpoints completed.', len(active_classes), diff --git a/dump_things_service/instance_state.py b/dump_things_service/instance_state.py index 9214ec1..436d276 100644 --- a/dump_things_service/instance_state.py +++ b/dump_things_service/instance_state.py @@ -57,6 +57,7 @@ class SchemaInfo: @dataclasses.dataclass class InstanceStateCollectionInfo: active_classes: set[str] + tag_info: dict[str, str] @cache @@ -97,7 +98,6 @@ class InstanceState: validators: dict = dataclasses.field(default_factory=dict) order_by: list[str] = 
dataclasses.field(default_factory=list) all_stores: dict = dataclasses.field(default_factory=dict) - tags: dict[str, list[dict]] = dataclasses.field(default_factory=dict) g_instance_state:InstanceState | None = None diff --git a/dump_things_service/manifest.py b/dump_things_service/manifest.py index 50e3963..dfbf76e 100644 --- a/dump_things_service/manifest.py +++ b/dump_things_service/manifest.py @@ -12,6 +12,12 @@ from dump_things_service.instance_state import InstanceState logger = logging.getLogger('dump_things_service') +tag_groups = [ + 'write', + 'validate', + 'curated_write', + 'incoming_write', +] openapi_tags_template = [ { @@ -169,7 +175,9 @@ def delete_collection( collection_name: str, ): instance_state.collections.pop(collection_name) - # TODO: remove all collection-related information from + instance_state.tags.pop(collection_name) + + # TODO: remove further collection-related information from # instance_state. Maybe all collection-specific information # should go into the instance_state.collection[x]-object!? # That would allow to remove it easily. @@ -179,8 +187,20 @@ def create_openapi_tags( instance_state: InstanceState, openapi_tags_template: list[dict | str], ) -> list[dict]: + + # Collect tag name lists for all tag groups that we have defined. 
+ tag_group_info = { + tag_group: sorted( + [ + {'name': collection_info.tag_info[tag_group]} + for collection_info in instance_state.collections.values() + ], + key=lambda x: x['name'] + ) + for tag_group in tag_groups + } result = openapi_tags_template.copy() - for identifier, tag_list in instance_state.tags.items(): - index = result.index(identifier) + for tag_group, tag_list in tag_group_info.items(): + index = result.index(tag_group) result[index:index + 1] = tag_list return result -- 2.52.0 From 9fb7704bd35f3472169237eb97d39cf26acfec2f Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 18 May 2026 09:39:39 +0200 Subject: [PATCH 10/64] hash admin-token representation by default By default the admin-token representation is hashed. That means the request should contain the plain token, unless the request-attribute `hashed` is set. In this case, the `POST /admin-tokens` endpoint assumes that the representation is already hashed. It will perform a simple syntax verification to assure that the representation is a 40-digit hex number. 
--- dump_things_service/token_endpoints.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/dump_things_service/token_endpoints.py b/dump_things_service/token_endpoints.py index a0fad99..794ed10 100644 --- a/dump_things_service/token_endpoints.py +++ b/dump_things_service/token_endpoints.py @@ -1,5 +1,6 @@ import logging import random +import re import sys from urllib.parse import quote @@ -36,6 +37,8 @@ from dump_things_service.utils import wrap_http_exception logger = logging.getLogger('dump_things_service') router = APIRouter() +hash_matcher = re.compile(r'^[a-f0-9A-F]{40}$') + class TokenRequest(TokenConfig): name: str @@ -43,6 +46,7 @@ class TokenRequest(TokenConfig): class AdminTokenRequest(AdminTokenConfig): name: str + hashed: bool = False def get_token_parts(token: str) -> list[str]: @@ -242,6 +246,13 @@ async def create_admin_token( detail='Empty administrator token is not allowed' raise HTTPException(status_code=HTTP_406_NOT_ACCEPTABLE, detail=detail) + if not body.hashed: + body.representation = hash_token_representation(body.representation) + else: + if not hash_matcher.match(body.representation.strip()): + detail='Hashed token is not a 40-digits hex-number' + raise HTTPException(status_code=HTTP_406_NOT_ACCEPTABLE, detail=detail) + # Check for existing token-name if body.name in abstract_config.admin_tokens: raise HTTPException( -- 2.52.0 From feded97346b55f15d9ba394420e5fc69b8d55279 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 18 May 2026 09:58:54 +0200 Subject: [PATCH 11/64] remove token representation from `GET /tokens`-response --- dump_things_service/token_endpoints.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/dump_things_service/token_endpoints.py b/dump_things_service/token_endpoints.py index 794ed10..d8ab4e7 100644 --- a/dump_things_service/token_endpoints.py +++ b/dump_things_service/token_endpoints.py @@ -19,12 +19,13 @@ from dump_things_service import ( ) from 
dump_things_service.abstract_config import ( AdminTokenConfig, + StrictModel, TokenConfig, get_config, get_token_info_by_representation, hash_token_representation, read_config, - store_config, + store_config, TokenCollectionConfig, ) from dump_things_service.admin import authenticate_admin from dump_things_service.api_key import api_key_header_scheme @@ -44,6 +45,12 @@ class TokenRequest(TokenConfig): name: str +class TokenResponse(StrictModel): + name: str + user_id: str + collections: dict[str, TokenCollectionConfig] + + class AdminTokenRequest(AdminTokenConfig): name: str hashed: bool = False @@ -148,7 +155,7 @@ async def create_token( ) async def get_tokens( api_key: str = Depends(api_key_header_scheme), -) -> list[TokenRequest]: +) -> list[TokenResponse]: instance_state = get_instance_state() abstract_config = read_config(store_path=instance_state.store_path) @@ -156,12 +163,10 @@ async def get_tokens( authenticate_admin(instance_state, abstract_config, api_key) return [ - TokenRequest( + TokenResponse( name=n, user_id=t.user_id, - representation=t.representation, collections=t.collections, - hashed=t.hashed, ) for n, t in abstract_config.tokens.items() ] @@ -175,7 +180,7 @@ async def get_tokens( async def get_token_with_name( token_name: str, api_key: str = Depends(api_key_header_scheme), -) -> TokenRequest: +) -> TokenResponse: instance_state = get_instance_state() abstract_config = get_config() @@ -188,12 +193,10 @@ async def get_token_with_name( raise HTTPException(status_code=HTTP_404_NOT_FOUND, detail=detail) t = abstract_config.tokens[token_name] - return TokenRequest( + return TokenResponse( name=token_name, user_id=t.user_id, - representation=t.representation, collections=t.collections, - hashed=t.hashed, ) -- 2.52.0 From 355efa4f2093bd610e7f35b3b34a0772f272798d Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 18 May 2026 12:47:54 +0200 Subject: [PATCH 12/64] remove endpoints when deleting collections --- dump_things_service/collection.py | 
42 +++++++++++++++++++++++++++++++ dump_things_service/manifest.py | 7 ++++-- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index ead1134..064ebf2 100644 --- a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -536,6 +536,48 @@ def create_endpoints_for_collection( ) +def delete_endpoints_for_collection( + instance_state: InstanceState, + collection_name: str, +): + + active_classes = instance_state.collections[collection_name].active_classes + + for operation_path in ( + 'record', + 'validate/record', + 'curated/record', + 'incoming/{label}/record' + ): + delete_endpoint( + collection_name=collection_name, + active_classes=active_classes, + operation_path=operation_path, + app=instance_state.fastapi_app, + ) + + +def delete_endpoint( + collection_name: str, + active_classes: set[str], + operation_path: str, + app: FastAPI, +): + + remove_paths_set = set( + f'/{collection_name}/{operation_path}/{class_name}' + for class_name in active_classes + ) + + remove_indices = [ + index + for index, api_route in enumerate(app.router.routes) + if api_route.path in remove_paths_set + ] + for index in sorted(remove_indices, reverse=True): + del app.router.routes[index] + + def store_record( collection: str, data: BaseModel | str, diff --git a/dump_things_service/manifest.py b/dump_things_service/manifest.py index dfbf76e..70fa2a9 100644 --- a/dump_things_service/manifest.py +++ b/dump_things_service/manifest.py @@ -6,7 +6,10 @@ from dump_things_service.abstract_config import ( Configuration, TokenConfig, ) -from dump_things_service.collection import create_collection +from dump_things_service.collection import ( + create_collection, + delete_endpoints_for_collection, +) from dump_things_service.instance_state import InstanceState @@ -113,6 +116,7 @@ def manifest_configuration( # configuration (we do not delete the collection from token-objects here # because 
token-objects are all re-created below). for collection_name in deleted_collection_names: + delete_endpoints_for_collection(instance_state, collection_name) delete_collection(instance_state, collection_name) # Create the internal representation objects for collections that have been @@ -175,7 +179,6 @@ def delete_collection( collection_name: str, ): instance_state.collections.pop(collection_name) - instance_state.tags.pop(collection_name) # TODO: remove further collection-related information from # instance_state. Maybe all collection-specific information -- 2.52.0 From e1d51eca8aeea3111655ea0b87da697abf9d53d1 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 19 May 2026 08:28:54 +0200 Subject: [PATCH 13/64] adapt collection- and token-managing tests Test that endpoints of deleted collections are removed from the openapi documentation and generate 404-errors when accessed. Adapt token listing test to the modified token result structure, which does not contain the token representation anymore. 
--- .../tests/test_collection_administration.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/dump_things_service/tests/test_collection_administration.py b/dump_things_service/tests/test_collection_administration.py index b9e93e8..dc9b03b 100644 --- a/dump_things_service/tests/test_collection_administration.py +++ b/dump_things_service/tests/test_collection_administration.py @@ -114,7 +114,11 @@ def test_collection_adding(fastapi_client_simple): headers={'x-dumpthings-token': 'admin-1'}, ) assert response.status_code == HTTP_200_OK - assert response.json() == new_token_request.model_dump(mode='json') + assert response.json() == { + 'name': new_token_request.name, + 'user_id': new_token_request.user_id, + 'collections': new_token_request.model_dump(mode='json')['collections'], + } new_record = { 'pid': 'http://example.com/admin-test-1', @@ -159,7 +163,16 @@ def test_collection_adding(fastapi_client_simple): headers={'x-dumpthings-token': 'admin-1'}, ) assert response.status_code == HTTP_200_OK - #assert not _name_in_openapi_paths(test_client, new_collection_name) + + # Check that the collection endpoints are not found + response = test_client.get( + f'/{new_collection_name}/records/Person', + headers={'x-dumpthings-token': new_token_representation}, + ) + assert response.status_code == HTTP_404_NOT_FOUND + + # Check that the openapi document is adjusted + assert not _name_in_openapi_paths(test_client, new_collection_name) def test_admin_token_management(fastapi_client_simple): -- 2.52.0 From d76a33698be2733cc5803efb2d9450abe800f5ba Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 19 May 2026 09:07:45 +0200 Subject: [PATCH 14/64] use hashed admin token representations When adding the plain admin token `t` via `/admin_tokens`, `AdminTokenRequest.representation` must contain `hash_token_representation(t)`. 
The value of `AdminTokenRequest.representation` is checked for the correct pattern (a 40-digit hex number) and stored verbatim in the configuration. To authenticate an administrator, the token provided in the `x-dumpthings-token`-header is hashed and compared to the stored values of `AdminTokenRequest.representation`. --- dump_things_service/token_endpoints.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/dump_things_service/token_endpoints.py b/dump_things_service/token_endpoints.py index d8ab4e7..ad067ce 100644 --- a/dump_things_service/token_endpoints.py +++ b/dump_things_service/token_endpoints.py @@ -53,7 +53,6 @@ class TokenResponse(StrictModel): class AdminTokenRequest(AdminTokenConfig): name: str - hashed: bool = False def get_token_parts(token: str) -> list[str]: @@ -249,12 +248,9 @@ async def create_admin_token( detail='Empty administrator token is not allowed' raise HTTPException(status_code=HTTP_406_NOT_ACCEPTABLE, detail=detail) - if not body.hashed: - body.representation = hash_token_representation(body.representation) - else: - if not hash_matcher.match(body.representation.strip()): - detail='Hashed token is not a 40-digits hex-number' - raise HTTPException(status_code=HTTP_406_NOT_ACCEPTABLE, detail=detail) + if not hash_matcher.match(body.representation.strip()): + detail='Hashed token is not a 40-digits hex-number' + raise HTTPException(status_code=HTTP_406_NOT_ACCEPTABLE, detail=detail) # Check for existing token-name if body.name in abstract_config.admin_tokens: -- 2.52.0 From 938b76b28188e55233c99309a0bb228c5477f07d Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 19 May 2026 09:20:02 +0200 Subject: [PATCH 15/64] fix code formatting --- dump_things_service/tests/test_collection_administration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dump_things_service/tests/test_collection_administration.py b/dump_things_service/tests/test_collection_administration.py index dc9b03b..b178c5b 
100644 --- a/dump_things_service/tests/test_collection_administration.py +++ b/dump_things_service/tests/test_collection_administration.py @@ -57,7 +57,7 @@ new_admin_token_name='New_Admin_Token' plain_new_admin_token = 'admin-XXX' new_admin_token_request = AdminTokenRequest( name=new_admin_token_name, - representation=hash_token_representation(plain_new_admin_token) + representation=hash_token_representation(plain_new_admin_token), ) -- 2.52.0 From b63eb4af9a780e2230a1dc53c6b1d2ea71015e09 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 19 May 2026 12:45:52 +0200 Subject: [PATCH 16/64] remove unused code and files --- dump_things_service/api_token.py | 0 dump_things_service/auth/__init__.py | 3 +-- dump_things_service/backends/mongo.py | 0 dump_things_service/collection.py | 2 +- dump_things_service/main.py | 18 ++---------------- 5 files changed, 4 insertions(+), 19 deletions(-) delete mode 100644 dump_things_service/api_token.py delete mode 100644 dump_things_service/backends/mongo.py diff --git a/dump_things_service/api_token.py b/dump_things_service/api_token.py deleted file mode 100644 index e69de29..0000000 diff --git a/dump_things_service/auth/__init__.py b/dump_things_service/auth/__init__.py index 56231dd..051720f 100644 --- a/dump_things_service/auth/__init__.py +++ b/dump_things_service/auth/__init__.py @@ -14,9 +14,8 @@ import abc import dataclasses from typing import TYPE_CHECKING - if TYPE_CHECKING: - from dump_things_service.api_token import TokenPermission + from dump_things_service.abstract_config import TokenPermission class AuthenticationError(Exception): diff --git a/dump_things_service/backends/mongo.py b/dump_things_service/backends/mongo.py deleted file mode 100644 index e69de29..0000000 diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index 064ebf2..632ebc7 100644 --- a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -81,7 +81,7 @@ from dump_things_service.utils import 
( # This following lines are required for dynamic endpoint generation -from typing import Annotated +from typing import Annotated #noqa 401 from fastapi import ( Body, Depends, diff --git a/dump_things_service/main.py b/dump_things_service/main.py index f89d557..308bcac 100644 --- a/dump_things_service/main.py +++ b/dump_things_service/main.py @@ -4,23 +4,17 @@ import argparse import logging import os from pathlib import Path -from typing import ( - Annotated, # noqa F401 -- used by generated code - Any, - TYPE_CHECKING, -) +from typing import TYPE_CHECKING from dump_things_service.manifest import manifest_configuration # Perform the patching before importing any third-party libraries -from dump_things_service.patches import enabled # noqa: F401 +from dump_things_service.patches import enabled # noqa F401 -- used by generated code import uvicorn from fastapi import ( - Body, # noqa F401 -- used by generated code Depends, FastAPI, HTTPException, - Response, # noqa F401 -- used by generated code ) from fastapi.middleware.cors import CORSMiddleware from fastapi_pagination import ( @@ -31,11 +25,8 @@ from fastapi_pagination import ( from fastapi_pagination.utils import disable_installed_extensions_check from pydantic import ( BaseModel, - TypeAdapter, - ValidationError, ) from starlette.responses import ( - JSONResponse, PlainTextResponse, RedirectResponse, ) @@ -44,13 +35,11 @@ from dump_things_service import ( HTTP_400_BAD_REQUEST, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND, - HTTP_422_UNPROCESSABLE_CONTENT, Format, ) from dump_things_service.__about__ import __version__ from dump_things_service.abstract_config import ( check_collection, - get_default_token_name, read_config, ) from dump_things_service.api_key import api_key_header_scheme @@ -72,9 +61,6 @@ from dump_things_service.token_endpoints import router as token_router from dump_things_service.utils import ( authenticate_token, check_bounds, - combine_ttl, - get_token_store, - join_default_token_permissions, 
process_token, wrap_http_exception, ) -- 2.52.0 From 3c8df4e5805516369d1455fe30ab2f57faf3fb19 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 20 May 2026 11:38:06 +0200 Subject: [PATCH 17/64] clean up backend code Format backend code, add __all__ variables. --- dump_things_service/backends/record_dir.py | 75 +++++------ .../backends/record_dir_index.py | 63 ++++----- .../backends/schema_type_layer.py | 59 ++++----- dump_things_service/backends/sqlite.py | 74 ++++++----- dump_things_service/utils.py | 120 ++++++++++++++++++ 5 files changed, 257 insertions(+), 134 deletions(-) diff --git a/dump_things_service/backends/record_dir.py b/dump_things_service/backends/record_dir.py index 320ec59..16e0258 100644 --- a/dump_things_service/backends/record_dir.py +++ b/dump_things_service/backends/record_dir.py @@ -27,10 +27,10 @@ from dump_things_service.backends.record_dir_index import RecordDirIndex if TYPE_CHECKING: from collections.abc import Iterable - from types import ModuleType __all__ = [ + '_RecordDirStore', 'RecordDirStore', ] @@ -45,12 +45,12 @@ class RecordDirResultList(BackendResultList): """ def generate_result( - self, - _: int, - iri: str, - class_name: str, - sort_key: str, - path: Path, + self, + _: int, + iri: str, + class_name: str, + sort_key: str, + path: Path, ) -> RecordInfo: """ Generate a JSON representation of the record at index `index`. 
@@ -76,11 +76,11 @@ class _RecordDirStore(StorageBackend): """Store records in a directory structure""" def __init__( - self, - root: Path, - pid_mapping_function: Callable, - suffix: str, - order_by: Iterable[str] | None = None, + self, + root: Path, + pid_mapping_function: Callable, + suffix: str, + order_by: Iterable[str] | None = None, ): super().__init__(order_by=order_by) if not root.is_absolute(): @@ -92,27 +92,27 @@ class _RecordDirStore(StorageBackend): self.index = RecordDirIndex(root, suffix) def get_uri( - self + self ) -> str: return f'file://{self.root!s}' def build_index( - self, - schema: str, + self, + schema: str, ): self.index.rebuild_index(schema, self.order_by) def build_index_if_needed( - self, - schema: str, + self, + schema: str, ): self.index.rebuild_if_needed(schema, self.order_by) def add_record( - self, - iri: str, - class_name: str, - json_object: dict, + self, + iri: str, + class_name: str, + json_object: dict, ): pid = json_object['pid'] @@ -148,8 +148,8 @@ class _RecordDirStore(StorageBackend): self.index.add_iri_info(iri, class_name, str(storage_path), sort_string) def get_record_by_iri( - self, - iri: str, + self, + iri: str, ) -> RecordInfo | None: index_entry = self.index.get_info_for_iri(iri) if index_entry is None: @@ -165,9 +165,9 @@ class _RecordDirStore(StorageBackend): ) def get_records_of_classes( - self, - class_names: list[str], - pattern: str | None = None, + self, + class_names: list[str], + pattern: str | None = None, ) -> RecordDirResultList: return RecordDirResultList().add_info( sorted( @@ -186,8 +186,8 @@ class _RecordDirStore(StorageBackend): ) def get_all_records( - self, - pattern: str | None = None, + self, + pattern: str | None = None, ) -> RecordDirResultList: return RecordDirResultList().add_info( sorted( @@ -205,8 +205,8 @@ class _RecordDirStore(StorageBackend): ) def remove_record( - self, - iri: str, + self, + iri: str, ) -> bool: index_entry = self.index.get_info_for_iri(iri) if index_entry is None: @@ 
-226,10 +226,10 @@ _existing_stores = {} def RecordDirStore( # noqa: N802 - root: Path, - pid_mapping_function: Callable, - suffix: str, - order_by: Iterable[str] | None = None, + root: Path, + pid_mapping_function: Callable, + suffix: str, + order_by: Iterable[str] | None = None, ) -> _RecordDirStore: """Get a record directory store for the given root directory.""" existing_store = _existing_stores.get(root) @@ -255,10 +255,3 @@ def RecordDirStore( # noqa: N802 raise ValueError(msg) return existing_store - - -def _get_schema_type( - class_name: str, - schema_module: ModuleType, -) -> str: - return getattr(schema_module, class_name).class_class_curie diff --git a/dump_things_service/backends/record_dir_index.py b/dump_things_service/backends/record_dir_index.py index 416c0ef..d872dd6 100644 --- a/dump_things_service/backends/record_dir_index.py +++ b/dump_things_service/backends/record_dir_index.py @@ -65,11 +65,11 @@ class IndexEntry(Base): class RecordDirIndex: def __init__( - self, - store_dir: Path, - suffix: str, - *, - echo: bool = False, + self, + store_dir: Path, + suffix: str, + *, + echo: bool = False, ): if not store_dir.is_absolute(): msg = f'Not an absolute path: {store_dir}' @@ -91,11 +91,11 @@ class RecordDirIndex: Base.metadata.create_all(self.engine) def add_iri_info( - self, - iri: str, - class_name: str, - path: str, - sort_key: str, + self, + iri: str, + class_name: str, + path: str, + sort_key: str, ): with Session(self.engine) as session, session.begin(): self.add_iri_info_with_session( @@ -107,12 +107,12 @@ class RecordDirIndex: ) def add_iri_info_with_session( - self, - session: Session, - iri: str, - class_name: str, - path: str, - sort_key: str, + self, + session: Session, + iri: str, + class_name: str, + path: str, + sort_key: str, ): existing_record = session.query(IndexEntry).filter_by(iri=iri).first() if existing_record: @@ -131,8 +131,8 @@ class RecordDirIndex: ) def get_info_for_iri( - self, - iri: str, + self, + iri: str, ) -> tuple 
| None: with Session(self.engine) as session, session.begin(): statement = select(IndexEntry).filter_by(iri=iri) @@ -142,8 +142,8 @@ class RecordDirIndex: return None def get_info_for_class( - self, - class_name: str, + self, + class_name: str, ) -> Generator[IndexEntry]: with Session(self.engine) as session, session.begin(): statement = select(IndexEntry).filter_by(class_name=class_name) @@ -152,7 +152,7 @@ class RecordDirIndex: yield row[0] def get_info_for_all_classes( - self, + self, ) -> Generator[IndexEntry]: statement = select(IndexEntry) with Session(self.engine) as session, session.begin(): @@ -161,8 +161,8 @@ class RecordDirIndex: yield row[0] def remove_iri_info( - self, - iri: str, + self, + iri: str, ) -> bool: statement = delete(IndexEntry).where(IndexEntry.iri == iri) with Session(self.engine) as session, session.begin(): @@ -170,9 +170,9 @@ class RecordDirIndex: return result.rowcount == 1 def rebuild_index( - self, - schema: str, - order_by: Iterable[str] | None = None, + self, + schema: str, + order_by: Iterable[str] | None = None, ): """Rebuild the index from the records in the directory.""" lgr.info('Building IRI index for records in %s', self.store_dir) @@ -223,15 +223,18 @@ class RecordDirIndex: self.needs_rebuild = False def rebuild_if_needed( - self, - schema: str, - order_by: Iterable[str] | None = None, + self, + schema: str, + order_by: Iterable[str] | None = None, ): if self.needs_rebuild: self.rebuild_index(schema=schema, order_by=order_by) self.needs_rebuild = False - def _get_class_name(self, path: Path) -> str: + def _get_class_name( + self, + path: Path, + ) -> str: """Get the class name from the path.""" rel_path = path.absolute().relative_to(self.store_dir) return rel_path.parts[0] diff --git a/dump_things_service/backends/schema_type_layer.py b/dump_things_service/backends/schema_type_layer.py index bc992c9..640519a 100644 --- a/dump_things_service/backends/schema_type_layer.py +++ 
b/dump_things_service/backends/schema_type_layer.py @@ -34,15 +34,16 @@ if TYPE_CHECKING: __all__ = [ + '_SchemaTypeLayer', 'SchemaTypeLayer', ] class SchemaTypeLayerResultList(BackendResultList): def __init__( - self, - origin_list: BackendResultList, - schema_model: ModuleType, + self, + origin_list: BackendResultList, + schema_model: ModuleType, ): super().__init__() self.schema_model = schema_model @@ -50,12 +51,12 @@ class SchemaTypeLayerResultList(BackendResultList): self.list_info = self.origin_list.list_info def generate_result( - self, - index: int, - iri: str, - class_name: str, - sort_key: str, - private: Any, + self, + index: int, + iri: str, + class_name: str, + sort_key: str, + private: Any, ) -> RecordInfo: origin_element = self.origin_list.generate_result( index, iri, class_name, sort_key, private @@ -72,9 +73,9 @@ class _SchemaTypeLayer(StorageBackend): """Proxy backend that removes `schema_type` from stored records""" def __init__( - self, - backend: StorageBackend, - schema: str, + self, + backend: StorageBackend, + schema: str, ): super().__init__() self.backend = backend @@ -86,10 +87,10 @@ class _SchemaTypeLayer(StorageBackend): return self.backend.get_uri() def add_record( - self, - iri: str, - class_name: str, - json_object: dict, + self, + iri: str, + class_name: str, + json_object: dict, ): # Remove the top level `schema_type` from the JSON object because we # don't want to store it in the files. 
We add `schema_type` after @@ -104,14 +105,14 @@ class _SchemaTypeLayer(StorageBackend): ) def remove_record( - self, - iri: str, + self, + iri: str, ) -> bool: return self.backend.remove_record(iri=iri) def get_record_by_iri( - self, - iri: str, + self, + iri: str, ) -> RecordInfo | None: origin_result = self.backend.get_record_by_iri(iri) if origin_result and 'schema_type' not in origin_result.json_object: @@ -122,9 +123,9 @@ class _SchemaTypeLayer(StorageBackend): return origin_result def get_records_of_classes( - self, - class_names: list[str], - pattern: str | None = None, + self, + class_names: list[str], + pattern: str | None = None, ) -> BackendResultList: return SchemaTypeLayerResultList( origin_list=self.backend.get_records_of_classes( @@ -135,8 +136,8 @@ class _SchemaTypeLayer(StorageBackend): ) def get_all_records( - self, - pattern: str | None = None, + self, + pattern: str | None = None, ) -> BackendResultList: return SchemaTypeLayerResultList( origin_list=self.backend.get_all_records(pattern), @@ -149,8 +150,8 @@ class _SchemaTypeLayer(StorageBackend): def _get_schema_type( - class_name: str, - schema_module: ModuleType, + class_name: str, + schema_module: ModuleType, ) -> str: return getattr(schema_module, class_name).class_class_curie @@ -160,8 +161,8 @@ _existing_layers = {} def SchemaTypeLayer( # noqa: N802 - backend: StorageBackend, - schema: str, + backend: StorageBackend, + schema: str, ) -> _SchemaTypeLayer: existing_layer, _ = _existing_layers.get(id(backend), (None, None)) if not existing_layer: diff --git a/dump_things_service/backends/sqlite.py b/dump_things_service/backends/sqlite.py index be89d8f..5dd8523 100644 --- a/dump_things_service/backends/sqlite.py +++ b/dump_things_service/backends/sqlite.py @@ -60,6 +60,12 @@ if TYPE_CHECKING: from collections.abc import Iterable from pathlib import Path + +__all__ = [ + '_SQLiteBackend', + 'SQLiteBackend', +] + logger = logging.getLogger('dump_things_service') old_record_file_name = 
'.sqlite-records.db' @@ -82,19 +88,19 @@ class Thing(Base): class SQLResultList(BackendResultList): def __init__( - self, - engine: Any, + self, + engine: Any, ): super().__init__() self.engine = engine def generate_result( - self, - _: int, - iri: str, - class_name: str, - sort_key: str, - db_id: int, + self, + _: int, + iri: str, + class_name: str, + sort_key: str, + db_id: int, ) -> RecordInfo: """ Generate a JSON representation of the record at index `index`. @@ -118,11 +124,11 @@ class SQLResultList(BackendResultList): class _SQLiteBackend(StorageBackend): def __init__( - self, - db_path: Path, - *, - order_by: Iterable[str] | None = None, - echo: bool = False, + self, + db_path: Path, + *, + order_by: Iterable[str] | None = None, + echo: bool = False, ) -> None: assert db_path.is_absolute(), f'db_path not absolute {db_path}' if db_path.exists(): @@ -155,10 +161,10 @@ class _SQLiteBackend(StorageBackend): shutil.move(str(old_path), str(self.db_path)) def add_record( - self, - iri: str, - class_name: str, - json_object: dict, + self, + iri: str, + class_name: str, + json_object: dict, ): with Session(self.engine) as session, session.begin(): self._add_record_with_session( @@ -169,8 +175,8 @@ class _SQLiteBackend(StorageBackend): ) def add_records_bulk( - self, - record_infos: Iterable[RecordInfo], + self, + record_infos: Iterable[RecordInfo], ): with Session(self.engine) as session, session.begin(): for record_info in record_infos: @@ -182,8 +188,8 @@ class _SQLiteBackend(StorageBackend): ) def remove_record( - self, - iri: str, + self, + iri: str, ) -> bool: statement = delete(Thing).where(Thing.iri == iri) with Session(self.engine) as session, session.begin(): @@ -191,11 +197,11 @@ class _SQLiteBackend(StorageBackend): return result.rowcount == 1 def _add_record_with_session( - self, - session: Session, - iri: str, - class_name: str, - json_object: dict, + self, + session: Session, + iri: str, + class_name: str, + json_object: dict, ): sort_key = 
create_sort_key(json_object, self.order_by) existing_record = session.query(Thing).filter_by(iri=iri).first() @@ -214,8 +220,8 @@ class _SQLiteBackend(StorageBackend): ) def get_record_by_iri( - self, - iri: str, + self, + iri: str, ) -> RecordInfo | None: with Session(self.engine) as session, session.begin(): statement = select(Thing).filter_by(iri=iri) @@ -230,9 +236,9 @@ class _SQLiteBackend(StorageBackend): return None def get_records_of_classes( - self, - class_names: Iterable[str], - pattern: str | None = None, + self, + class_names: Iterable[str], + pattern: str | None = None, ) -> SQLResultList: class_list = ', '.join(f"'{cn}'" for cn in class_names) @@ -265,8 +271,8 @@ class _SQLiteBackend(StorageBackend): ) def get_all_records( - self, - pattern: str | None = None, + self, + pattern: str | None = None, ) -> SQLResultList: if pattern is None: statement = text( @@ -300,7 +306,7 @@ _existing_sqlite_backends = {} def SQLiteBackend( # noqa: N802 - db_path: Path, *, order_by: Iterable[str] | None = None, echo: bool = False + db_path: Path, *, order_by: Iterable[str] | None = None, echo: bool = False ) -> _SQLiteBackend: existing_backend = _existing_sqlite_backends.get(db_path) if not existing_backend: diff --git a/dump_things_service/utils.py b/dump_things_service/utils.py index a6e77a6..9b93f1e 100644 --- a/dump_things_service/utils.py +++ b/dump_things_service/utils.py @@ -283,6 +283,27 @@ def create_token_store( from dump_things_service.exceptions import ConfigError from dump_things_service.store.model_store import ModelStore + # One early requirement for the service was to be able to specify + # arbitrary directories for curated stores and incoming stores. This + # explicitly included the use case where an incoming store and a + # curated store are identical. This has the following consequences: + # + # 1. Any collection might have multiple incoming stores that use the same + # directory as the curated store. + # + # 2. 
Multiple collections might share curated or incoming directories with + # other stores. + # + # From 1. follows that, for efficiency and consistency reasons, existing + # backends for a directory should be reused. With 2. one has to check that + # the collections that specify the backend have matching schemas. Schemas + # must match if the same backend, i.e., the same directory and basic backend + # type (basic backend types are `record_dir` or `sqlite`) are used. + # If different backend types are used (which is possible in the same + # directory), the schemas could in principle be different. For consistency + # reasons, we issue a warning on different schemas, but allow the + # configuration. + # Check if the store was already created and if it was created for the # same schema. if store_dir in instance_state.all_stores: @@ -350,6 +371,105 @@ def create_token_store( return model_store +def get_existing_store( + abstract_configuration: Configuration, + instance_state: InstanceState, + collection_name: str, + store_dir: Path, +) -> _ModelStore: + from dump_things_service.backends.schema_type_layer import SchemaTypeLayer + from dump_things_service.abstract_config import get_backend_and_extension + from dump_things_service.exceptions import ConfigError + from dump_things_service.store.model_store import ModelStore + + # One early requirement for the service was to be able to specify + # arbitrary directories for curated stores and incoming stores. This + # explicitly included the use case where an incoming store and a + # curated store are identical. This has the following consequences: + # + # 1. Any collection might have multiple incoming stores that use the same + # directory as the curated store. + # + # 2. Multiple collections might share curated or incoming directories with + # other stores. + # + # From 1. follows that, for efficiency and consistency reasons, existing + # backends for a directory should be reused. With 2. 
one has to check that + # the collections that specify the backend have matching schemas. Schemas + # must match if the same backend, i.e., the same directory and basic backend + # type (basic backend types are `record_dir` or `sqlite`) are used. + # If different backend types are used (which is possible in the same + # directory), the schemas could in principle be different. For consistency + # reasons, we issue a warning on different schemas, but allow the + # configuration. + + # Check if the store was already created and if it was created for the + # same schema. + if store_dir in instance_state.all_stores: + existing_collection_name, existing_model_store = instance_state.all_stores[store_dir] + if ( + existing_collection_name != collection_name + and instance_state.schemas[existing_collection_name] != instance_state.schemas[collection_name] + ): + msg = ( + f"collections '{existing_collection_name}' and " + f"'{collection_name}' with different schemas map onto the same" + f" storage directory: '/{store_dir.name}'" + ) + raise HTTPException( + status_code=HTTP_500_INTERNAL_SERVER_ERROR, + detail=msg, + ) + return existing_model_store + + store_dir.mkdir(parents=True, exist_ok=True) + + schema_uri = abstract_configuration.collections[collection_name].schema + + # We get the backend information from the abstract configuration + backend_type = abstract_configuration.collections[collection_name].backend.type + backend_name, extension = get_backend_and_extension(backend_type) + + backend = instance_state.curated_stores[collection_name].backend + if backend_name == 'record_dir': + # The configuration routines have read the backend configuration of the + # curated store from disk and stored it in `instance_state`. We fetch + # it from there. 
+ if extension == 'stl': + backend = backend.backend + + token_store = create_record_dir_token_store( + store_dir=store_dir, + order_by=backend.order_by, + schema_uri=schema_uri, + mapping_function=backend.pid_mapping_function, + suffix=backend.suffix, + ) + elif backend_name == 'sqlite': + token_store = create_sqlite_token_store( + store_dir=store_dir, + order_by=backend.order_by, + ) + else: + # This should not happen because we base our decision on already + # existing backends. + msg = f'Unsupported backend type: `{backend_type}`.' + raise ConfigError(msg) + + if extension == 'stl': + token_store = SchemaTypeLayer(backend=token_store, schema=schema_uri) + + submission_tags = abstract_configuration.collections[collection_name].submission_tags + tags = { + 'id': submission_tags.submitter_id_tag, + 'time': submission_tags.submission_time_tag, + } + model_store = ModelStore(backend=token_store, schema=schema_uri, tags=tags) + instance_state.all_stores[store_dir] = (collection_name, model_store) + + return model_store + + def create_record_dir_token_store( store_dir: Path, order_by: list[str], -- 2.52.0 From 2ee8782a703cdfd9ce8760f64c49ad3832b159a2 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 20 May 2026 14:07:26 +0200 Subject: [PATCH 18/64] update format --- dump_things_service/backends/__init__.py | 48 ++++++++++++------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/dump_things_service/backends/__init__.py b/dump_things_service/backends/__init__.py index 37d013e..901b9e8 100644 --- a/dump_things_service/backends/__init__.py +++ b/dump_things_service/backends/__init__.py @@ -83,12 +83,12 @@ class BackendResultList(LazyList): @abstractmethod def generate_result( - self, - index: int, - iri: str, - class_name: str, - sort_key: str, - private: Any, + self, + index: int, + iri: str, + class_name: str, + sort_key: str, + private: Any, ) -> RecordInfo: """ Generate a record info object from the provided parameters. 
@@ -105,23 +105,23 @@ class BackendResultList(LazyList): class StorageBackend(metaclass=ABCMeta): def __init__( - self, - order_by: Iterable[str] | None = None, + self, + order_by: Iterable[str] | None = None, ): self.order_by = order_by or ['pid'] @abstractmethod def get_uri( - self + self ) -> str: raise NotImplementedError @abstractmethod def add_record( - self, - iri: str, - class_name: str, - json_object: dict, + self, + iri: str, + class_name: str, + json_object: dict, ): raise NotImplementedError @@ -139,37 +139,37 @@ class StorageBackend(metaclass=ABCMeta): @abstractmethod def remove_record( - self, - iri: str, + self, + iri: str, ) -> bool: raise NotImplementedError @abstractmethod def get_record_by_iri( - self, - iri: str, + self, + iri: str, ) -> RecordInfo | None: raise NotImplementedError @abstractmethod def get_records_of_classes( - self, - class_names: Iterable[str], - pattern: str | None = None, + self, + class_names: Iterable[str], + pattern: str | None = None, ) -> BackendResultList: raise NotImplementedError @abstractmethod def get_all_records( - self, - pattern: str | None = None, + self, + pattern: str | None = None, ) -> BackendResultList: raise NotImplementedError def create_sort_key( - json_object: dict[str, Any], - order_by: Iterable[str], + json_object: dict[str, Any], + order_by: Iterable[str], ) -> str: return '-'.join( str(json_object.get(key)) if json_object.get(key) is not None else chr(0x10FFFF) -- 2.52.0 From 0493499bf2bc9dc409a43f430c80dab4cc756129 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 20 May 2026 16:01:33 +0200 Subject: [PATCH 19/64] cleanup code Update format, remove unused code and debug code --- dump_things_service/lazy_list.py | 2 +- dump_things_service/store/model_store.py | 89 +++++++----- .../tests/test_web_interface.py | 30 ++-- dump_things_service/utils.py | 137 +++--------------- 4 files changed, 83 insertions(+), 175 deletions(-) diff --git a/dump_things_service/lazy_list.py 
b/dump_things_service/lazy_list.py index 91ca11a..83eb0de 100644 --- a/dump_things_service/lazy_list.py +++ b/dump_things_service/lazy_list.py @@ -177,7 +177,7 @@ class PriorityList(LazyList): """ def __init__( - self, + self, ): super().__init__() self.seen = set() diff --git a/dump_things_service/store/model_store.py b/dump_things_service/store/model_store.py index 2242abc..cb72c03 100644 --- a/dump_things_service/store/model_store.py +++ b/dump_things_service/store/model_store.py @@ -17,7 +17,7 @@ if TYPE_CHECKING: from pydantic import BaseModel from dump_things_service.backends import ( - RecordInfo, + _RecordInfo, StorageBackend, ) from dump_things_service.lazy_list import LazyList @@ -29,10 +29,10 @@ submitter_namespace = 'http://purl.obolibrary.org/obo/' class _ModelStore: def __init__( - self, - schema: str, - backend: StorageBackend, - tags: dict[str, str] + self, + schema: str, + backend: StorageBackend, + tags: dict[str, str] ): self.schema = schema self.model = get_model_for_schema(self.schema)[0] @@ -43,9 +43,9 @@ class _ModelStore: return self.backend.get_uri() def store_object( - self, - obj: BaseModel, - submitter: str, + self, + obj: BaseModel, + submitter: str, ) -> Iterable[tuple[str, dict]]: if obj.__class__.__name__ == 'Thing': msg = f'Cannot store `Thing` instance: {obj}.' 
@@ -65,15 +65,15 @@ class _ModelStore: ] def pid_to_iri( - self, - pid: str, + self, + pid: str, ): return resolve_curie(self.model, pid) def _store_flat_object( - self, - obj: BaseModel, - submitter: str, + self, + obj: BaseModel, + submitter: str, ) -> dict: iri = self.pid_to_iri(obj.pid) class_name = obj.__class__.__name__ @@ -93,9 +93,9 @@ class _ModelStore: return json_object def annotate( - self, - json_object: dict, - submitter: str, + self, + json_object: dict, + submitter: str, ) -> None: """Add submitter IRI to the record annotations, use CURIE if possible""" json_object['annotations'] = self.homogenize_annotations(json_object) @@ -112,8 +112,8 @@ class _ModelStore: } def get_curie( - self, - curie_or_iri: str, + self, + curie_or_iri: str, ) -> str: if is_curie(curie_or_iri): return curie_or_iri @@ -130,8 +130,8 @@ class _ModelStore: return curie_or_iri def extract_inlined( - self, - record: BaseModel, + self, + record: BaseModel, ) -> list[BaseModel]: # The trivial case: no relations if not hasattr(record, 'relations') or record.relations is None: @@ -158,14 +158,14 @@ class _ModelStore: return [new_record, *extracted_sub_records] def get_object_by_pid( - self, - pid: str, + self, + pid: str, ) -> tuple[str, dict] | tuple[None, None]: return self.get_object_by_iri(self.pid_to_iri(pid)) def get_object_by_iri( - self, - iri: str, + self, + iri: str, ) -> tuple[str, dict] | tuple[None, None]: record_info = self.backend.get_record_by_iri(iri) if record_info: @@ -173,12 +173,12 @@ class _ModelStore: return None, None def get_objects_of_class( - self, - class_name: str, - matching: str | None, - *, - include_subclasses: bool = True, - ) -> LazyList[RecordInfo]: + self, + class_name: str, + matching: str | None, + *, + include_subclasses: bool = True, + ) -> LazyList[_RecordInfo]: """ Get all objects of a specific class. 
@@ -196,9 +196,9 @@ class _ModelStore: return self.backend.get_records_of_classes(class_names, matching) def get_all_objects( - self, - matching: str | None = None, - ) -> LazyList[RecordInfo]: + self, + matching: str | None = None, + ) -> LazyList[_RecordInfo]: """ Get all objects of a specific class. @@ -208,8 +208,8 @@ class _ModelStore: return self.backend.get_all_records(matching) def delete_object( - self, - pid: str, + self, + pid: str, ) -> bool: return self.backend.remove_record(self.pid_to_iri(pid)) @@ -218,15 +218,18 @@ _existing_model_stores = {} def ModelStore( # noqa: N802 - schema: str, - backend: StorageBackend, - tags: dict[str, str], + schema: str, + backend: StorageBackend, + tags: dict[str, str], ) -> _ModelStore: - """ - Create a unique model store for the given schema and backend. + """Create a unique model store for the given schema and backend. + + Raise `ValueError` if a store with a different schema already exists for + the given backend. :param schema: The schema to use for the model store. :param backend: The storage backend to use. + :param tags: Tags that will be used for annotations :return: An instance of _ModelStore. """ existing_model_store, _ = _existing_model_stores.get(id(backend), (None, None)) @@ -235,4 +238,10 @@ def ModelStore( # noqa: N802 # We store a pointer to the backend in the value to ensure that the # backend object exists while we use its `id` as a key. _existing_model_stores[id(backend)] = existing_model_store, backend + else: + # Check that the schemas are compatible, if the backend is reused. 
+ if existing_model_store.schema != schema: + msg = 'Backend is already used in a ModelStore with a different schema' + raise ValueError(msg) + return existing_model_store diff --git a/dump_things_service/tests/test_web_interface.py b/dump_things_service/tests/test_web_interface.py index ab63b3c..039cacb 100644 --- a/dump_things_service/tests/test_web_interface.py +++ b/dump_things_service/tests/test_web_interface.py @@ -18,11 +18,11 @@ pids = ('', '--------', '&&&&&', 'abc', 'abc&', 'abc&format=ttl') tuple(product(*(collection_names, class_names, queries, format_names))), ) def test_web_interface_post_errors( - fastapi_client_simple, - collection_name, - class_name, - query, - format_name, + fastapi_client_simple, + collection_name, + class_name, + query, + format_name, ): """Check that no internal server error occurs with weird input""" test_client, _ = fastapi_client_simple @@ -39,11 +39,11 @@ def test_web_interface_post_errors( tuple(product(*(collection_names, class_names, queries, format_names))), ) def test_web_interface_get_class_errors( - fastapi_client_simple, - collection_name, - class_name, - query, - format_name, + fastapi_client_simple, + collection_name, + class_name, + query, + format_name, ): """Check that no internal server error occurs with weird input""" test_client, _ = fastapi_client_simple @@ -64,11 +64,11 @@ def test_web_interface_get_class_errors( tuple(product(*(collection_names, pids, queries, format_names))), ) def test_web_interface_get_pid_errors( - fastapi_client_simple, - collection_name, - pid, - query, - format_name, + fastapi_client_simple, + collection_name, + pid, + query, + format_name, ): """Check that no internal server error occurs with weird input""" test_client, _ = fastapi_client_simple diff --git a/dump_things_service/utils.py b/dump_things_service/utils.py index 9b93f1e..31c98c4 100644 --- a/dump_things_service/utils.py +++ b/dump_things_service/utils.py @@ -26,7 +26,6 @@ from dump_things_service import ( 
HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, - HTTP_404_NOT_FOUND, HTTP_413_CONTENT_TOO_LARGE, HTTP_503_SERVICE_UNAVAILABLE, ) @@ -49,7 +48,7 @@ if TYPE_CHECKING: from dump_things_service import JSON from dump_things_service.backends.record_dir import _RecordDirStore - from dump_things_service.backends.sqlite import SQLiteBackend + from dump_things_service.backends.sqlite import _SQLiteBackend from dump_things_service.instance_state import InstanceState from dump_things_service.store.model_store import _ModelStore @@ -306,124 +305,24 @@ def create_token_store( # Check if the store was already created and if it was created for the # same schema. - if store_dir in instance_state.all_stores: - existing_collection_name, existing_model_store = instance_state.all_stores[store_dir] - if ( - existing_collection_name != collection_name - and instance_state.schemas[existing_collection_name] != instance_state.schemas[collection_name] - ): - msg = ( - f"collections '{existing_collection_name}' and " - f"'{collection_name}' with different schemas map onto the same" - f" storage directory: '/{store_dir.name}'" - ) - raise HTTPException( - status_code=HTTP_500_INTERNAL_SERVER_ERROR, - detail=msg, - ) - return existing_model_store + #if store_dir in instance_state.all_stores: + # existing_collection_name, existing_model_store = instance_state.all_stores[store_dir] + # if ( + # existing_collection_name != collection_name + # and instance_state.schemas[existing_collection_name] != instance_state.schemas[collection_name] + # ): + # msg = ( + # f"collections '{existing_collection_name}' and " + # f"'{collection_name}' with different schemas map onto the same" + # f" storage directory: '/{store_dir.name}'" + # ) + # raise HTTPException( + # status_code=HTTP_500_INTERNAL_SERVER_ERROR, + # detail=msg, + # ) + # return existing_model_store store_dir.mkdir(parents=True, exist_ok=True) - - schema_uri = abstract_configuration.collections[collection_name].schema - - # We 
get the backend information from the abstract configuration - backend_type = abstract_configuration.collections[collection_name].backend.type - backend_name, extension = get_backend_and_extension(backend_type) - - backend = instance_state.curated_stores[collection_name].backend - if backend_name == 'record_dir': - # The configuration routines have read the backend configuration of the - # curated store from disk and stored it in `instance_state`. We fetch - # it from there. - if extension == 'stl': - backend = backend.backend - - token_store = create_record_dir_token_store( - store_dir=store_dir, - order_by=backend.order_by, - schema_uri=schema_uri, - mapping_function=backend.pid_mapping_function, - suffix=backend.suffix, - ) - elif backend_name == 'sqlite': - token_store = create_sqlite_token_store( - store_dir=store_dir, - order_by=backend.order_by, - ) - else: - # This should not happen because we base our decision on already - # existing backends. - msg = f'Unsupported backend type: `{backend_type}`.' 
- raise ConfigError(msg) - - if extension == 'stl': - token_store = SchemaTypeLayer(backend=token_store, schema=schema_uri) - - submission_tags = abstract_configuration.collections[collection_name].submission_tags - tags = { - 'id': submission_tags.submitter_id_tag, - 'time': submission_tags.submission_time_tag, - } - model_store = ModelStore(backend=token_store, schema=schema_uri, tags=tags) - instance_state.all_stores[store_dir] = (collection_name, model_store) - - return model_store - - -def get_existing_store( - abstract_configuration: Configuration, - instance_state: InstanceState, - collection_name: str, - store_dir: Path, -) -> _ModelStore: - from dump_things_service.backends.schema_type_layer import SchemaTypeLayer - from dump_things_service.abstract_config import get_backend_and_extension - from dump_things_service.exceptions import ConfigError - from dump_things_service.store.model_store import ModelStore - - # One early requirement for the service was to be able to specify - # arbitrary directories for curated stores and incoming stores. This - # explicitly included the use case where an incoming store and a - # curated store are identical. This has the following consequences: - # - # 1. Any collection might have multiple incoming stores that use the same - # directory as the curated store. - # - # 2. Multiple collections might share curated or incoming directories with - # other stores. - # - # From 1. follows that, for efficiency and consistency reasons, existing - # backends for a directory should be reused. With 2. one has to check that - # the collections that specify the backend have matching schemas. Schemas - # must match if the same backend, i.e., the same directory and basic backend - # type (basic backend types are `record_dir` or `sqlite`) are used. - # If different backend types are used (which is possible in the same - # directory), the schemas could in principle be different. 
For consistency - # reasons, we issue a warning on different schemas, but allow the - # configuration. - - # Check if the store was already created and if it was created for the - # same schema. - if store_dir in instance_state.all_stores: - existing_collection_name, existing_model_store = instance_state.all_stores[store_dir] - if ( - existing_collection_name != collection_name - and instance_state.schemas[existing_collection_name] != instance_state.schemas[collection_name] - ): - msg = ( - f"collections '{existing_collection_name}' and " - f"'{collection_name}' with different schemas map onto the same" - f" storage directory: '/{store_dir.name}'" - ) - raise HTTPException( - status_code=HTTP_500_INTERNAL_SERVER_ERROR, - detail=msg, - ) - return existing_model_store - - store_dir.mkdir(parents=True, exist_ok=True) - schema_uri = abstract_configuration.collections[collection_name].schema # We get the backend information from the abstract configuration @@ -492,7 +391,7 @@ def create_record_dir_token_store( def create_sqlite_token_store( store_dir: Path, order_by: list[str], -) -> SQLiteBackend: +) -> _SQLiteBackend: from dump_things_service.backends.sqlite import SQLiteBackend from dump_things_service.backends.sqlite import ( record_file_name as sqlite_record_file_name, -- 2.52.0 From c0e100aa152ae563b708c27cf4f3736f7a8d78c2 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 20 May 2026 16:02:49 +0200 Subject: [PATCH 20/64] update comments --- dump_things_service/utils.py | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/dump_things_service/utils.py b/dump_things_service/utils.py index 31c98c4..5445543 100644 --- a/dump_things_service/utils.py +++ b/dump_things_service/utils.py @@ -299,28 +299,7 @@ def create_token_store( # must match if the same backend, i.e., the same directory and basic backend # type (basic backend types are `record_dir` or `sqlite`) are used. 
# If different backend types are used (which is possible in the same - # directory), the schemas could in principle be different. For consistency - # reasons, we issue a warning on different schemas, but allow the - # configuration. - - # Check if the store was already created and if it was created for the - # same schema. - #if store_dir in instance_state.all_stores: - # existing_collection_name, existing_model_store = instance_state.all_stores[store_dir] - # if ( - # existing_collection_name != collection_name - # and instance_state.schemas[existing_collection_name] != instance_state.schemas[collection_name] - # ): - # msg = ( - # f"collections '{existing_collection_name}' and " - # f"'{collection_name}' with different schemas map onto the same" - # f" storage directory: '/{store_dir.name}'" - # ) - # raise HTTPException( - # status_code=HTTP_500_INTERNAL_SERVER_ERROR, - # detail=msg, - # ) - # return existing_model_store + # directory), the schemas could in principle be different. store_dir.mkdir(parents=True, exist_ok=True) schema_uri = abstract_configuration.collections[collection_name].schema -- 2.52.0 From 950e2618fe4112fe7d9059ee12d95367f2357275 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 21 May 2026 15:14:24 +0200 Subject: [PATCH 21/64] add the dump-things-load-config command Add a command that interprets configuration files and creates the objects defined in the configuration file in a service. This command can be used to migrate from old dump-things server to new dump-things server, when the option --old-format is used. Note that there are a few differences between the old and the new format: 1. Keys in `tokens` are now token names, that map to token configurations. 2. Token configurations now contain the additional attributes `representation` and `hashed`. 3. `schema` is now an attribute of a collection. It is removed from sqlite backend configurations. 
--- dump_things_service/commands/load_config.py | 228 ++++++++++++++++++++ pyproject.toml | 1 + 2 files changed, 229 insertions(+) create mode 100644 dump_things_service/commands/load_config.py diff --git a/dump_things_service/commands/load_config.py b/dump_things_service/commands/load_config.py new file mode 100644 index 0000000..729fbbb --- /dev/null +++ b/dump_things_service/commands/load_config.py @@ -0,0 +1,228 @@ +from __future__ import annotations + +import os +import sys +from argparse import ArgumentParser +from itertools import count + +import requests +import yaml + +#from dump_things_service.token_endpoints import create_admin_token + +parser = ArgumentParser( + prog='Establish a configuration in a running service', + description='Read a configuration from a dump-things configuration-file ' + 'and instantiate its elements on a running server. Objects that ' + 'already exist on the server are left unchanged. ' + ' ' + 'An admin token has to be provided in the environment variable ' + '`DTS_ADMIN_TOKEN`.', +) +parser.add_argument( + 'config_file', + help='The path to the config file', +) +parser.add_argument( + '--send-to', + help='The base URL of the server API', +) +parser.add_argument( + '--old-format', + action='store_true', + help='If provided, assume that the configuration is in the old format ' + 'and convert it to the new format internally (in old format: tokens ' + 'had no `hashed`-attribute and no `representation`-attribute, the token ' + 'representation was the key of the token configuration, ' + 'collections had no `schema`-attribute, and `sqlite`-backends had ' + 'a `schema`-attribute).', +) +parser.add_argument( + '--schema', + action='append', + default=[], + help='If --old-format is provided, this option can be used to specify a ' + 'schema for a collection. This is necessary, if the collection has ' + 'a `record-dir`-backend.' 
+ ' ' + 'The format is ":".', +) + + +def main(): + arguments = parser.parse_args() + + with open(arguments.config_file) as config_file: + configuration = yaml.safe_load(config_file) + + assert configuration['type'] == 'collections', '`type`-entry missing in old config-file' + if arguments.old_format: + configuration = convert_to_new_format(configuration, arguments.schema) + else: + if arguments.schema: + print( + 'Warning: ignoring `--schema` option because `--old-format` ' + 'is not provided.', + file=sys.stderr, + flush=True, + ) + + assert configuration['version'] == 2, '`version: 2` missing in config-file' + + if arguments.send_to: + admin_token = os.environ.get('DTS_ADMIN_TOKEN') + if not admin_token: + print( + 'An admin token not provided in the environment variable `DTS_ADMIN_TOKEN`', + file=sys.stderr, + flush=True, + ) + return 1 + + try: + establish_configuration( + configuration, + arguments.send_to[:-1] + if arguments.send_to.endswith('/') + else arguments.send_to, + admin_token, + ) + return 0 + except RuntimeError as rte: + print(f'{rte.args[0]}', file=sys.stderr, flush=True) + return 2 + + print( + yaml.dump( + data=configuration, + sort_keys=False, + allow_unicode=True, + default_flow_style=False, + ) + ) + return 0 + + +def convert_to_new_format( + old_configuration: dict, + schema_spec: list[str], +) -> dict: + + assert old_configuration['type'] == 'collections', '`type`-entry missing in old config-file' + assert old_configuration['version'] == 1, '`version: 1` missing in old config-file' + + schema_map = { + spec.split(':', maxsplit=1)[0]: spec.split(':', maxsplit=1)[1] + for spec in schema_spec + } + + for collection_name, collection_config in old_configuration['collections'].items(): + backend = collection_config['backend'] + if backend['type'].startswith('sqlite'): + collection_config['schema'] = backend['schema'] + del backend['schema'] + elif backend['type'].startswith('record_dir'): + collection_config['schema'] = 
schema_map[collection_name] + else: + msg = f'Unknown backend type: "{backend["type"]}" in collection: "{collection_name}"' + raise RuntimeError(msg) + + counter = count(1) + new_configuration = { + 'type': old_configuration['type'], + 'version': 2, + 'tokens': { + f'token_{next(counter)}': { + **old_token_config.copy(), + 'representation': token_representation, + 'hashed': False + } + for token_representation, old_token_config in old_configuration['tokens'].items() + }, + 'collections': old_configuration['collections'], + } + return new_configuration + + +def establish_configuration( + configuration: dict, + api_url: str, + admin_token: str, +): + create_collections(configuration, api_url, admin_token) + create_tokens(configuration, api_url, admin_token) + create_admin_tokens(configuration, api_url, admin_token) + + +def create_tokens( + configuration: dict, + api_url: str, + admin_token: str, +): + for token_name, token_config in configuration['tokens'].items(): + _post_data( + url=api_url + '/tokens', + data={ + **token_config, + 'name': token_name, + }, + token=admin_token, + content_class='token', + content_name=token_name, + ) + + +def create_collections( + configuration: dict, + api_url: str, + admin_token: str, +): + for collection_name, collection_config in configuration['collections'].items(): + _post_data( + url=api_url + '/collections', + data={ + **collection_config, + 'name': collection_name, + }, + token=admin_token, + content_class='collection', + content_name=collection_name, + ) + + +def create_admin_tokens( + configuration: dict, + api_url: str, + admin_token: str, +): + for admin_token_name, admin_token_config in configuration.get( + 'admin_tokens', + {} + ).items(): + _post_data( + url=api_url + '/admin_tokens', + data={ + **admin_token_config, + 'name': admin_token_name, + }, + token=admin_token, + content_class='admin token', + content_name=admin_token_name, + ) + + +def _post_data( + url: str, + data: dict, + token: str, + content_class: 
str, + content_name: str, +): + result = requests.post(url, headers={'x-dumpthings-token': token}, json=data,) + if result.status_code >= 300: + msg = f'Error uploading {content_class}: {content_name}: {result.text}' + raise RuntimeError(msg) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/pyproject.toml b/pyproject.toml index a3dc3fe..41f0165 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ dump-things-pid-check = "dump_things_service.commands.check_pids:main" dump-things-create-merged-schema = "dump_things_service.commands.create_merged_schema:main" dump-things-gitaudit-report = "dump_things_service.commands.gitaudit_report:main" dump-things-gitaudit-rebuild-index = "dump_things_service.commands.gitaudit_rebuild_index:main" +dump-things-load-config = "dump_things_service.commands.load_config:main" [tool.hatch.build.targets.wheel] exclude = [ -- 2.52.0 From b4220a3421cc1c79816329c4583e826ea97b8be9 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 21 May 2026 15:21:07 +0200 Subject: [PATCH 22/64] replace schema-property with schema_location --- dump_things_service/abstract_config.py | 8 ++++---- dump_things_service/collection.py | 14 +++++++------- dump_things_service/export/json.py | 2 +- dump_things_service/export/tree.py | 2 +- dump_things_service/main.py | 10 +++++----- dump_things_service/tests/fixtures.py | 8 ++++++-- .../tests/test_collection_administration.py | 4 ++-- dump_things_service/utils.py | 2 +- dump_things_service/validate.py | 2 +- 9 files changed, 28 insertions(+), 24 deletions(-) diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index ba4f4df..157a2a7 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -15,6 +15,7 @@ from fastapi import HTTPException from pydantic import ( BaseModel, ConfigDict, + Field, ) from dump_things_service import HTTP_404_NOT_FOUND @@ -74,7 +75,6 @@ class 
RecordDirBackendConfig(StrictModel): class SQLiteBackendConfig(StrictModel): type: Literal['sqlite', 'sqlite+stl'] - schema: str class GitAuditBackendConfig(StrictModel): @@ -88,7 +88,7 @@ class CollectionConfig(BaseModel): name: str default_token: str curated: PurePosixPath - schema: str + schema_location: str = Field(alias='schema') incoming: PurePosixPath | None = None backend: RecordDirBackendConfig | SQLiteBackendConfig = RecordDirBackendConfig(type='record_dir+stl') auth_sources: list[ForgejoAuthSpec | ConfigAuthSpec] = [ConfigAuthSpec()] @@ -147,7 +147,7 @@ class AdminTokenConfig(StrictModel): class Configuration(BaseModel): collections: dict[str, CollectionConfig] = {} tokens: dict[str, TokenConfig] = {} - admin_tokens: dict[str, AdminTokenConfig] = {} + admin_tokens: dict[str, AdminTokenConfig] | None = None pid: str = dump_things_config_iri @@ -242,7 +242,7 @@ def store_config( global g_abstract_configuration config_backend, audit_backend = get_config_backends(store_path) - json_object = config.model_dump(mode='json', exclude_none=True) + json_object = config.model_dump(mode='json', exclude_none=True, by_alias=True) json_object['pid'] = dump_things_config_iri config_backend.add_record( iri=dump_things_config_iri, diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index 632ebc7..4de8446 100644 --- a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -178,7 +178,7 @@ def create_collection( check_store_compatibility( curated_path, collection_configuration.backend, - collection_configuration.schema, + collection_configuration.schema_location, ) for audit_backend in collection_configuration.audit_backends: @@ -216,7 +216,7 @@ def create_collection( instance_state, curated_path, collection_configuration.backend, - collection_configuration.schema, + collection_configuration.schema_location, collection_configuration.submission_tags ) instance_state.curated_stores[collection_name] = curated_store @@ -225,7 
+225,7 @@ def create_collection( instance_state.incoming_stores[collection_name] = {} # Create the schema modules, schema view, and conversion objects - schema_location = collection_configuration.schema + schema_location = collection_configuration.schema_location instance_state.schema_info[schema_location] = get_schema_info(schema_location) # Determine the active classes based on the classes defined in the schema @@ -242,7 +242,7 @@ def create_collection( # Create a validator for the collection instance_state.validators[collection_name] = FormatConverter( - schema=collection_configuration.schema, + schema=collection_configuration.schema_location, input_format=Format.json, output_format=Format.ttl, ) @@ -472,7 +472,7 @@ def create_endpoint( instance_state.collections[collection_name].tag_info[tag_group] = tag_name # TODO: get schema_info from instance_state!? - model, classes, model_var_name = get_model_for_schema(collection_config.schema) + model, classes, model_var_name = get_model_for_schema(collection_config.schema_location) globals()[model_var_name] = model active_classes = instance_state.collections[collection_name].active_classes @@ -637,7 +637,7 @@ def store_record( if input_format == Format.ttl: with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Conversion error'): json_object = FormatConverter( - abstract_config.collections[collection].schema, + abstract_config.collections[collection].schema_location, input_format=Format.ttl, output_format=Format.json, ).convert(data, class_name) @@ -654,7 +654,7 @@ def store_record( if input_format == Format.ttl: format_converter = FormatConverter( - abstract_config.collections[collection].schema, + abstract_config.collections[collection].schema_location, input_format=Format.json, output_format=Format.ttl, ) diff --git a/dump_things_service/export/json.py b/dump_things_service/export/json.py index 3181482..c33292a 100644 --- a/dump_things_service/export/json.py +++ 
b/dump_things_service/export/json.py @@ -60,7 +60,7 @@ def export_collection( indent: int, output: TextIO, ): - output.write(f'{indent * " "}"schema": "{abstract_config.collections[collection].schema}",\n') + output.write(f'{indent * " "}"schema": "{abstract_config.collections[collection].schema_location}",\n') output.write(f'{indent * " "}"curated": {{\n') append_classes( instance_config.curated_stores[collection], indent + level_width, output diff --git a/dump_things_service/export/tree.py b/dump_things_service/export/tree.py index df11d27..457e6ed 100644 --- a/dump_things_service/export/tree.py +++ b/dump_things_service/export/tree.py @@ -46,7 +46,7 @@ def export_collection( config_content = ( 'type: records\n' 'version: 1\n' - f'schema: {abstract_config.collections[collection].schema}\n' + f'schema: {abstract_config.collections[collection].schema_location}\n' 'format: yaml\n' 'idfx: digest-md5-p3-p3\n' ) diff --git a/dump_things_service/main.py b/dump_things_service/main.py index 308bcac..7e85f7a 100644 --- a/dump_things_service/main.py +++ b/dump_things_service/main.py @@ -227,8 +227,8 @@ async def server() -> ServerResponse: collections = [ ServerCollectionResponse( name=collection_name, - schema=g_configuration.collections[collection_name].schema, - classes=g_instance_state.schema_info[g_configuration.collections[collection_name].schema].classes, + schema=g_configuration.collections[collection_name].schema_location, + classes=g_instance_state.schema_info[g_configuration.collections[collection_name].schema_location].classes, ) for collection_name in g_configuration.collections ] @@ -309,7 +309,7 @@ async def read_record_with_pid( if format == Format.ttl: converter = FormatConverter( - schema=g_configuration.collections[collection].schema, + schema=g_configuration.collections[collection].schema_location, input_format=Format.json, output_format=format, ) @@ -453,7 +453,7 @@ async def _read_all_records( if format == Format.ttl: result_list = ConvertingList( 
result_list, - g_configuration.collections[collection].schema, + g_configuration.collections[collection].schema_location, input_format=Format.json, output_format=format, exception_handler=convert_to_http_exception, @@ -481,7 +481,7 @@ async def _read_records_of_type( ) from e check_collection(g_configuration, collection) - schema_location = g_configuration.collections[collection].schema + schema_location = g_configuration.collections[collection].schema_location model = g_instance_state.schema_info[schema_location].pydantic_module_info.module if class_name not in g_instance_state.collections[collection].active_classes: raise HTTPException( diff --git a/dump_things_service/tests/fixtures.py b/dump_things_service/tests/fixtures.py index 31f051e..67f5728 100644 --- a/dump_things_service/tests/fixtures.py +++ b/dump_things_service/tests/fixtures.py @@ -423,7 +423,11 @@ def fastapi_client_simple(fastapi_app_simple): for collection_config in g_default_collections: response = test_client.post( '/collections', - json=collection_config.model_dump(exclude_unset=True, mode='json'), + json=collection_config.model_dump( + exclude_unset=True, + mode='json', + by_alias=True, + ), headers={'x-dumpthings-token': 'admin-1'}, ) assert response.status_code == 201 @@ -451,7 +455,7 @@ def fastapi_client_simple(fastapi_app_simple): ), suffix='yaml', ) - pydantic_module = get_model_for_schema(collection_config.schema)[0] + pydantic_module = get_model_for_schema(collection_config.schema_location)[0] add_records_to_backend( backend, pydantic_module, diff --git a/dump_things_service/tests/test_collection_administration.py b/dump_things_service/tests/test_collection_administration.py index b178c5b..8d8901f 100644 --- a/dump_things_service/tests/test_collection_administration.py +++ b/dump_things_service/tests/test_collection_administration.py @@ -88,7 +88,7 @@ def test_collection_adding(fastapi_client_simple): response = test_client.post( '/collections', headers={'x-dumpthings-token': 
'admin-1'}, - json=new_collection_config.model_dump(mode='json'), + json=new_collection_config.model_dump(mode='json', by_alias=True), ) assert response.status_code == HTTP_201_CREATED assert _name_in_openapi_paths(test_client, new_collection_name) @@ -98,7 +98,7 @@ def test_collection_adding(fastapi_client_simple): headers={'x-dumpthings-token': 'admin-1'}, ) assert response.status_code == HTTP_200_OK - assert response.json() == new_collection_config.model_dump(mode='json') + assert response.json() == new_collection_config.model_dump(mode='json', by_alias=True) # Add a token to the collection response = test_client.post( diff --git a/dump_things_service/utils.py b/dump_things_service/utils.py index 5445543..457136d 100644 --- a/dump_things_service/utils.py +++ b/dump_things_service/utils.py @@ -302,7 +302,7 @@ def create_token_store( # directory), the schemas could in principle be different. store_dir.mkdir(parents=True, exist_ok=True) - schema_uri = abstract_configuration.collections[collection_name].schema + schema_uri = abstract_configuration.collections[collection_name].schema_location # We get the backend information from the abstract configuration backend_type = abstract_configuration.collections[collection_name].backend.type diff --git a/dump_things_service/validate.py b/dump_things_service/validate.py index 601d778..4dd6f6f 100644 --- a/dump_things_service/validate.py +++ b/dump_things_service/validate.py @@ -83,7 +83,7 @@ def validate_record( if input_format == Format.ttl: with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Conversion error'): json_object = FormatConverter( - abstract_config.collections[collection].schema, + abstract_config.collections[collection].schema_location, input_format=Format.ttl, output_format=Format.json, ).convert(data, class_name) -- 2.52.0 From 1855d62b9a844b8cbb35101366e9385dd881ec1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20M=C3=B6nch?= Date: Thu, 21 May 2026 16:32:10 +0200 
Subject: [PATCH 23/64] fix tests Remove schema-definitions from sqlite-backend configuration. Use empty dicitionary as default for Configuration.admin_tokens. --- dump_things_service/abstract_config.py | 2 +- dump_things_service/tests/fixtures.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index 157a2a7..d5b1593 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -147,7 +147,7 @@ class AdminTokenConfig(StrictModel): class Configuration(BaseModel): collections: dict[str, CollectionConfig] = {} tokens: dict[str, TokenConfig] = {} - admin_tokens: dict[str, AdminTokenConfig] | None = None + admin_tokens: dict[str, AdminTokenConfig] = {} pid: str = dump_things_config_iri diff --git a/dump_things_service/tests/fixtures.py b/dump_things_service/tests/fixtures.py index 67f5728..a207428 100644 --- a/dump_things_service/tests/fixtures.py +++ b/dump_things_service/tests/fixtures.py @@ -67,7 +67,6 @@ g_default_collections.append( incoming=PurePosixPath(f'{incoming}/collection_8'), backend=SQLiteBackendConfig( type='sqlite', - schema=test_schema_location, ) ) ) @@ -88,7 +87,6 @@ g_default_collections.extend([ incoming=PurePosixPath(f'{incoming}/collection_dlflatsocial-2'), backend=SQLiteBackendConfig( type='sqlite', - schema='https://concepts.datalad.org/s/flat-social/unreleased.yaml', ), use_classes=[ 'Organization', -- 2.52.0 From e91162bdd8ccbbd9bd49e7a907a00d2c7e906082 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20M=C3=B6nch?= Date: Thu, 21 May 2026 16:41:34 +0200 Subject: [PATCH 24/64] improve admin_token handling in config-loader --- dump_things_service/commands/load_config.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dump_things_service/commands/load_config.py b/dump_things_service/commands/load_config.py index 729fbbb..adf057b 100644 --- a/dump_things_service/commands/load_config.py 
+++ b/dump_things_service/commands/load_config.py @@ -140,6 +140,7 @@ def convert_to_new_format( for token_representation, old_token_config in old_configuration['tokens'].items() }, 'collections': old_configuration['collections'], + 'admin_tokens': {}, } return new_configuration @@ -195,10 +196,7 @@ def create_admin_tokens( api_url: str, admin_token: str, ): - for admin_token_name, admin_token_config in configuration.get( - 'admin_tokens', - {} - ).items(): + for admin_token_name, admin_token_config in configuration['admin_tokens'].items(): _post_data( url=api_url + '/admin_tokens', data={ -- 2.52.0 From cee9e11b67bc1bdd3e0a53bbb8fa84a1bd6d095b Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 22 May 2026 00:26:57 +0200 Subject: [PATCH 25/64] remove unused attributes from instance_state --- dump_things_service/instance_state.py | 1 - dump_things_service/utils.py | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/dump_things_service/instance_state.py b/dump_things_service/instance_state.py index 436d276..d5e1c94 100644 --- a/dump_things_service/instance_state.py +++ b/dump_things_service/instance_state.py @@ -97,7 +97,6 @@ class InstanceState: schema_info: dict[str, SchemaInfo] = dataclasses.field(default_factory=dict) validators: dict = dataclasses.field(default_factory=dict) order_by: list[str] = dataclasses.field(default_factory=list) - all_stores: dict = dataclasses.field(default_factory=dict) g_instance_state:InstanceState | None = None diff --git a/dump_things_service/utils.py b/dump_things_service/utils.py index 457136d..c023a85 100644 --- a/dump_things_service/utils.py +++ b/dump_things_service/utils.py @@ -342,10 +342,7 @@ def create_token_store( 'id': submission_tags.submitter_id_tag, 'time': submission_tags.submission_time_tag, } - model_store = ModelStore(backend=token_store, schema=schema_uri, tags=tags) - instance_state.all_stores[store_dir] = (collection_name, model_store) - - return model_store + return 
ModelStore(backend=token_store, schema=schema_uri, tags=tags) def create_record_dir_token_store( -- 2.52.0 From 0cd1faf45b5aae38500290bd4cfd30557ae38f63 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 22 May 2026 10:57:27 +0200 Subject: [PATCH 26/64] add collection configuration checks Check the following two conditions when new collections are created: 1. the name of the collection is not a reserved name. 2. collections do not share incoming or curated directories with other collections. This is done to prevent data leaks, which could happen when different collections share directories. --- dump_things_service/__init__.py | 11 ++++ dump_things_service/abstract_config.py | 7 ++- dump_things_service/collection.py | 19 +++---- dump_things_service/collection_endpoints.py | 40 +++++++++++++- dump_things_service/tests/__init__.py | 4 ++ dump_things_service/tests/test_basic.py | 3 +- dump_things_service/tests/test_config.py | 58 +++++++++++++++++++++ 7 files changed, 125 insertions(+), 17 deletions(-) diff --git a/dump_things_service/__init__.py b/dump_things_service/__init__.py index dbd6b51..4bb4a39 100644 --- a/dump_things_service/__init__.py +++ b/dump_things_service/__init__.py @@ -40,6 +40,7 @@ __all__ = [ 'JSON', 'YAML', 'config_file_name', + 'reserved_collection_names', ] @@ -52,3 +53,13 @@ JSON = Union[dict[str, Any], list[Any], str, int, float, None] YAML = JSON config_file_name = '.dumpthings.yaml' + +dump_things_private_collection_name = '__dump_things__' + + +reserved_collection_names = ( + 'collections', + 'tokens', + 'admin_tokens', + dump_things_private_collection_name, +) diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index d5b1593..b35d6bd 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -18,7 +18,10 @@ from pydantic import ( Field, ) -from dump_things_service import HTTP_404_NOT_FOUND +from dump_things_service import ( + HTTP_404_NOT_FOUND, + 
dump_things_private_collection_name, +) from dump_things_service.audit.gitaudit import GitAuditBackend from dump_things_service.backends.record_dir import ( _RecordDirStore, @@ -35,7 +38,7 @@ logger = logging.getLogger('dump_things_service') g_abstract_configuration = None dump_things_config_iri = 'dump_things:config' -dump_things_private_path = Path('__dump_things__') +dump_things_private_path = Path(dump_things_private_collection_name) config_backend_path = dump_things_private_path / 'config_store' config_audit_path = dump_things_private_path / 'config_audit' config_backend = None diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index 4de8446..d2c552a 100644 --- a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import Any from fastapi import ( + Depends, FastAPI, HTTPException, ) @@ -20,6 +21,7 @@ from starlette.responses import ( from starlette.status import HTTP_401_UNAUTHORIZED from dump_things_service import ( + Format, HTTP_400_BAD_REQUEST, HTTP_403_FORBIDDEN, HTTP_422_UNPROCESSABLE_CONTENT, @@ -38,7 +40,6 @@ from dump_things_service.abstract_config import ( get_default_token_representation, get_mapping_function, ) -from dump_things_service.audit import AuditBackend from dump_things_service.audit.gitaudit import GitAuditBackend from dump_things_service.auth.config import ConfigAuthenticationSource from dump_things_service.auth.forgejo import ForgejoAuthenticationSource @@ -81,16 +82,12 @@ from dump_things_service.utils import ( # This following lines are required for dynamic endpoint generation -from typing import Annotated #noqa 401 -from fastapi import ( - Body, - Depends, -) -from dump_things_service import Format -from dump_things_service.api_key import api_key_header_scheme -from dump_things_service.curated import store_curated_record -from dump_things_service.incoming import store_incoming_record -from dump_things_service.validate import 
validate_record +from typing import Annotated # noqa 401 -- used by autogenerated code +from fastapi import Body # noqa 401 -- used by autogenerated code +from dump_things_service.api_key import api_key_header_scheme # noqa 401 -- used by autogenerated code +from dump_things_service.curated import store_curated_record # noqa 401 -- used by autogenerated code +from dump_things_service.incoming import store_incoming_record # noqa 401 -- used by autogenerated code +from dump_things_service.validate import validate_record # noqa 401 -- used by autogenerated code logger = logging.getLogger('dump_things_service') diff --git a/dump_things_service/collection_endpoints.py b/dump_things_service/collection_endpoints.py index a727234..e8404f8 100644 --- a/dump_things_service/collection_endpoints.py +++ b/dump_things_service/collection_endpoints.py @@ -1,4 +1,8 @@ import logging +from pathlib import ( + Path, + PurePosixPath, +) from typing import Literal from urllib.parse import quote @@ -14,15 +18,16 @@ from dump_things_service import ( HTTP_201_CREATED, HTTP_404_NOT_FOUND, HTTP_409_CONFLICT, + reserved_collection_names, ) from dump_things_service.abstract_config import ( store_config, CollectionConfig, - get_config, + get_config, Configuration, ) from dump_things_service.admin import authenticate_admin from dump_things_service.api_key import api_key_header_scheme -from dump_things_service.instance_state import get_instance_state +from dump_things_service.instance_state import get_instance_state, InstanceState from dump_things_service.manifest import manifest_configuration from dump_things_service.exceptions import ConfigError from dump_things_service.utils import wrap_http_exception @@ -81,6 +86,21 @@ async def create_collection( detail=f"Collection with name '{body.name}' already exists.", ) + # Check for reserved collection names + if body.name in reserved_collection_names: + raise HTTPException( + status_code=HTTP_409_CONFLICT, + detail=f"Collection name '{body.name}' is 
reserved and cannot be created.", + ) + + # Check for distinct directories + for directory in (body.incoming, body.curated): + ensure_unique_directory( + abstract_config, + instance_state, + directory, + ) + # Update the abstract configuration abstract_config.collections[body.name] = body @@ -172,3 +192,19 @@ async def delete_collections( store_path=instance_state.store_path, config=abstract_config, ) + + +def ensure_unique_directory( + abstract_config: Configuration, + instance_state: InstanceState, + existing_dir: PurePosixPath, +): + abs_existing_dir = (instance_state.store_path / Path(existing_dir)).absolute() + for collection_name, collection_config in abstract_config.collections.items(): + for collection_dir in collection_config.curated, collection_config.incoming: + abs_collection_dir = (instance_state.store_path / Path(collection_dir)).absolute() + if abs_collection_dir == abs_existing_dir: + raise HTTPException( + status_code=HTTP_409_CONFLICT, + detail=f"Directory '{collection_dir}' already used by collection '{collection_name}'.", + ) diff --git a/dump_things_service/tests/__init__.py b/dump_things_service/tests/__init__.py index e69de29..d5185ae 100644 --- a/dump_things_service/tests/__init__.py +++ b/dump_things_service/tests/__init__.py @@ -0,0 +1,4 @@ +from pathlib import Path + +# Path to a local simple test schema +schema_file = Path(__file__).parent / 'testschema.yaml' diff --git a/dump_things_service/tests/test_basic.py b/dump_things_service/tests/test_basic.py index 02348bd..186582c 100644 --- a/dump_things_service/tests/test_basic.py +++ b/dump_things_service/tests/test_basic.py @@ -2,6 +2,7 @@ from pathlib import Path import pytest # F401 +from . import schema_file from .. 
import ( HTTP_200_OK, HTTP_400_BAD_REQUEST, @@ -18,8 +19,6 @@ from .create_store import ( ) from .test_utils import basic_write_locations -# Path to a local simple test schema -schema_file = Path(__file__).parent / 'testschema.yaml' extra_record = { 'schema_type': 'abc:Person', diff --git a/dump_things_service/tests/test_config.py b/dump_things_service/tests/test_config.py index 0f08aac..295d0ee 100644 --- a/dump_things_service/tests/test_config.py +++ b/dump_things_service/tests/test_config.py @@ -1,11 +1,69 @@ +from pathlib import PurePosixPath import pytest import yaml +from json_flattener import GlobalConfig from pydantic import ValidationError from yaml.scanner import ScannerError +from dump_things_service import HTTP_409_CONFLICT +from dump_things_service.abstract_config import ( + CollectionConfig, + dump_things_private_collection_name, +) from dump_things_service.exceptions import ConfigError +from . import schema_file + + +collection_config_pattern = CollectionConfig( + name='', + schema=str(schema_file), + default_token='test_default_token', + curated=PurePosixPath('curate_dir'), + incoming=PurePosixPath(f'incoming_dir'), +) + + +def test_illegal_collection_name_detection(fastapi_client_simple): + test_client, _ = fastapi_client_simple + + for name in ( + 'collections', + 'tokens', + 'admin_tokens', + dump_things_private_collection_name, + ): + response = test_client.post( + f'/collections', + json={ + **collection_config_pattern.model_dump(mode='json', by_alias=True), + 'name': name, + }, + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_409_CONFLICT + + +def test_collection_dir_reuse_detection(fastapi_client_simple): + test_client, _ = fastapi_client_simple + + for curated_path, incoming_path in ( + ('curated/collection_1', 'incoming/XXXX'), + ('curated/XXXX', 'incoming/collection_1'), + ('curated/collection_1', 'incoming/collection_2'), + ): + response = test_client.post( + f'/collections', + json={ + 
**collection_config_pattern.model_dump(mode='json', by_alias=True), + 'curated': curated_path, + 'incoming': incoming_path, + }, + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_409_CONFLICT + def xxx_test_scanner_error_detection(tmp_path): config_file_path = tmp_path / 'config.yaml' -- 2.52.0 From 297a4c268ef9c29f9f741aa0f31d617893674d27 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 22 May 2026 12:05:30 +0200 Subject: [PATCH 27/64] add `dump-things-hash-token`-command --- dump_things_service/commands/hash_token.py | 34 ++++++++++++++++++++++ pyproject.toml | 1 + 2 files changed, 35 insertions(+) create mode 100644 dump_things_service/commands/hash_token.py diff --git a/dump_things_service/commands/hash_token.py b/dump_things_service/commands/hash_token.py new file mode 100644 index 0000000..5f5d478 --- /dev/null +++ b/dump_things_service/commands/hash_token.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +import sys +from argparse import ArgumentParser + +from dump_things_service.abstract_config import hash_token_representation + + +parser = ArgumentParser( + prog='Hash a plain text token to create a hashed token in a dump-things server', + description='Hash a token and print the calculated hash value. 
The hash value ' + 'can be used to create a hashed token via the `/tokens`-endpoint ' + 'of a dump-things-server.', +) +parser.add_argument( + 'token', + type=str, + help='The plain text token', +) + + +def main(): + arguments = parser.parse_args() + + token = arguments.token.strip() + if any(map(lambda s: s.isspace(), token)): + print('Whitespace are not allowed in token', file=sys.stderr, flush=True) + return 1 + + print(hash_token_representation(token)) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/pyproject.toml b/pyproject.toml index 41f0165..7ead1fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ dump-things-create-merged-schema = "dump_things_service.commands.create_merged_s dump-things-gitaudit-report = "dump_things_service.commands.gitaudit_report:main" dump-things-gitaudit-rebuild-index = "dump_things_service.commands.gitaudit_rebuild_index:main" dump-things-load-config = "dump_things_service.commands.load_config:main" +dump-things-hash-token = "dump_things_service.commands.hash_token:main" [tool.hatch.build.targets.wheel] exclude = [ -- 2.52.0 From 5bc17d48173619f062f4fbf4d1ebe8112187e24d Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 22 May 2026 12:48:38 +0200 Subject: [PATCH 28/64] use sha256 for token checksums --- dump_things_service/abstract_config.py | 2 +- dump_things_service/token_endpoints.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index b35d6bd..8b29784 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -339,7 +339,7 @@ def get_token_info_by_representation( def hash_token_representation( token_representation: str, ) -> str: - return hashlib.sha1(token_representation.encode()).hexdigest() + return hashlib.sha256(token_representation.encode()).hexdigest() def get_token_config_by_name( diff --git 
a/dump_things_service/token_endpoints.py b/dump_things_service/token_endpoints.py index ad067ce..4fca7d9 100644 --- a/dump_things_service/token_endpoints.py +++ b/dump_things_service/token_endpoints.py @@ -38,7 +38,7 @@ from dump_things_service.utils import wrap_http_exception logger = logging.getLogger('dump_things_service') router = APIRouter() -hash_matcher = re.compile(r'^[a-f0-9A-F]{40}$') +hash_matcher = re.compile(r'^[a-f0-9A-F]{64}$') class TokenRequest(TokenConfig): @@ -249,7 +249,7 @@ async def create_admin_token( raise HTTPException(status_code=HTTP_406_NOT_ACCEPTABLE, detail=detail) if not hash_matcher.match(body.representation.strip()): - detail='Hashed token is not a 40-digits hex-number' + detail='Hashed token is not a 64-digits hex-number' raise HTTPException(status_code=HTTP_406_NOT_ACCEPTABLE, detail=detail) # Check for existing token-name -- 2.52.0 From 7b00685da54102dbba3a9434e52331586c414c67 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 22 May 2026 13:28:55 +0200 Subject: [PATCH 29/64] fix method names of endpoints --- dump_things_service/collection_endpoints.py | 4 ++-- dump_things_service/token_endpoints.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dump_things_service/collection_endpoints.py b/dump_things_service/collection_endpoints.py index e8404f8..60eef9f 100644 --- a/dump_things_service/collection_endpoints.py +++ b/dump_things_service/collection_endpoints.py @@ -139,7 +139,7 @@ async def get_collections( tags=['Administration interface'], name='Get existing collection by name', ) -async def get_collections( +async def get_collection_with_name( collection_name: str, api_key: str = Depends(api_key_header_scheme), ) -> CollectionConfig: @@ -163,7 +163,7 @@ async def get_collections( tags=['Administration interface'], name='Delete collection with name', ) -async def delete_collections( +async def delete_collection( collection_name: str, api_key: str = Depends(api_key_header_scheme), ): diff --git 
a/dump_things_service/token_endpoints.py b/dump_things_service/token_endpoints.py index 4fca7d9..b192e6a 100644 --- a/dump_things_service/token_endpoints.py +++ b/dump_things_service/token_endpoints.py @@ -204,7 +204,7 @@ async def get_token_with_name( tags=['Administration interface'], name='Delete token with name', ) -async def get_token_with_name( +async def delete_token_with_name( token_name: str, api_key: str = Depends(api_key_header_scheme), ): @@ -277,7 +277,7 @@ async def create_admin_token( tags=['Administration interface'], name='Get admin token names', ) -async def create_admin_token( +async def get_admin_token( api_key: str = Depends(api_key_header_scheme), ) -> list[str]: instance_state = get_instance_state() @@ -297,7 +297,7 @@ async def create_admin_token( tags=['Administration interface'], name='Delete admin token with name', ) -async def create_admin_token( +async def delete_admin_token( token_name: str, api_key: str = Depends(api_key_header_scheme), ): -- 2.52.0 From 911820a26d1a4bc8d5556c7f457031025017eb19 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 22 May 2026 13:30:27 +0200 Subject: [PATCH 30/64] update README.md and CHANGELOG.md --- CHANGELOG.md | 41 +++++++++++++ README.md | 164 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 183 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea675fc..2819285 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,44 @@ +# 6.0.0b1 + +## New features + +- Allow dynamic creation and deletion of collections, tokens and + administration-tokens via the endpoints: `/collections`, `/tokens`, and + `/admin_tokens`. + +- The configuration is persisted in the store of the service. An audit log + log of all configuration changes is kept. + +- A new token type is introduced: admin-tokens. Admin-tokens are required to + perform the creation- and deletion-operations described above. 
+ +- The tool `dump-things-load-config` was added to manifest a configuration that + is defined in a configuration file on a running server. + +- The tool `dump-things-hash-token` was added to calculate the hashed + representation of a token. This representation can be used to create a + "hashed"-token, i.e., a token that is stored in hashed format. So the plain + token is never stored in the dump-things server configuration + +## Breaking changes + +- Configuration structure has changed: + 1. `schema` is now an attribute of a collection. It is no longer an + attribute of a sqlite-backend configuration. + 2. The keys in the top-level mapping `tokens` are now token names and no + longer token representations. Token representations are now defined in + the values of the `tokens`-mapping + 3. The top-level mapping `admin_tokens` was added. + +- Configuration files are no longer read when the service is started. Instead + the service reads its configuration from the store, if it is present. The tool + (`dump-things-load-config`) can read an existing configuration + file and manifest the described configuration on a running dump-things server. + It supports pre version 6 config files and converts them to the new + configuration structure (except for hashed tokens, which cannot be + automatically converted). + + # 5.6.1 (2026-03-20) ## Bugfixes diff --git a/README.md b/README.md index 024aee7..111c3d8 100644 --- a/README.md +++ b/README.md @@ -47,14 +47,15 @@ The following command line parameters are supported: - `--port `: The port on which the service should accept connections (default: `8000`). -- `-c/--config `: provide a path to the configuration file. The configuration file in `/.dumpthings.yaml` will be ignored, if it exists at all. - - `--origins `: add a CORS origin hosts (repeat to add multiple CORS origin URLs).` - `--root-path `: Set the ASGI 'root_path' for applications submounted below a given URL path.
- `--log-level`: set the log level for the service, allowed values are `ERROR`, `WARNING`, `INFO`, `DEBUG`. The default-level is `WARNING`. +- `--admin_token `: set an administrator token. This token can be used to create and delete collections, tokens, and admin tokens. This is useful to configure the service if no admin token was yet created. + + ```bash dump-things-service /data-storage/store --host 127.0.0.1 --port 8000 ``` @@ -63,16 +64,18 @@ The above command runs the service on the network location `127.0.0.1:8000` and ### Configuration file -The service is configured via a configuration file that defines collections, paths for incoming and curated data for each collection, as well as token properties. +The service provides the tool `dump-things-load-config` which can load configurations from a file and manifest those configurations on a running service via the administration endpoints. + +A configuration defines collections, paths for incoming and curated data for each collection, as well as token properties. Token properties include a submitter identification and for each collection an incoming zone specifier, permissions for reading and writing of the incoming zone and permission for reading the curated data of the collection. -A "formal" definition of the configuration file is provided by the class `GlobalConfig` in the file `dumpthings-server/config.py`. +A "formal" definition of the configuration file is provided by the class `Configuration` in the file `dumpthings-server/abstract_config.py`. -Configurations are read in YAML format. The following is an example configuration file that illustrates all options: +Configurations are read in YAML format. 
The following is an example configuration file (version 6 and higher) that illustrates all options: ```yaml type: collections # has to be "collections" -version: 1 # has to be 1 +version: 2 # has to be 2 # All collections are listed in "collections" collections: @@ -86,6 +89,9 @@ collections: # client provided token. default_token: no_access + # The schema that is used by the collection + schema: https://concepts.inm7.de/s/flat-data/unreleased.yaml + # The path to the curated data of the collection. This path should contain the # ".dumpthings.yaml"-configuration for collections that is described # here: . @@ -134,13 +140,24 @@ collections: # All tokens are listed in "tokens" tokens: - # The following entry defines the token "basic_access". This token allows read-only - # access to the two collections: "rooms_and_buildings" and "fixed_data". + # The following entry defines the token "basic_access". basic_access: + # The representation of the token, this is the value that the user has to + # provide in the `x-dumpthings-token`-header to authenticate with this token. + representation: anonymous + + # If hashed is `True`, the representation must be a 64-hexdigit number, + # representing the hash of the plain token. Setting `hashed` to `True` ensures + # that the plain-text token is not stored in the configuration store of the + # running server. + # + # The tool `dump-things-hash-token` can be used to calculate the correct hash. + hashed: False + # The value of "user_id" will be added as an annotation to each record that is # uploaded with this token. - user_id: anonymous + user_id: anonymous_user # The collections for which the token holds rights are defined in "collections" collections: @@ -155,8 +172,9 @@ tokens: # A token and collection-specific label, that defines "zones" in which incoming # records are stored. Multiple tokens can share the same zone, for example if # many clients with individual tokens work together to build a collection.
- # (Since this token does not allow write access, "incoming_label" is ignored and - # left empty here (TODO: it should not be required in this case)). + # (Since this token does not allow write access, "incoming_label" is ignored. It + # is set to an empty string here in order to document it, but it could as well + # be omitted) incoming_label: '' # The rights that "basic_access" carries for the collection "fixed_data" @@ -168,6 +186,9 @@ tokens: # The following entry defines the token "no_access". This token does not allow # any access and is used as a default token for the collection "personal_records". no_access: + + representation: no_access + user_id: nobody collections: @@ -175,9 +196,10 @@ tokens: mode: NOTHING incoming_label: '' - # The following entry defines the token "admin". It gives full access rights to - # the collection "personal_records". - admin: + # The following entry defines a token with the name "admin_token" and the plain + # representation: "admin". It gives full access rights to the collection "personal_records". + admin_token: + representation: admin user_id: Admin collections: personal_records: @@ -187,6 +209,7 @@ tokens: # The following entry defines the token "contributor_bob". It gives full access # to "rooms_and_buildings" for a user with the id "Bob". contributor_bob: + representation: bob user_id: Bob collections: rooms_and_buildings: @@ -198,6 +221,7 @@ tokens: # same incoming-zone, i.e. "new_rooms_and_buildings". That means they can read # incoming records that the other one posted. contributor_alice: + representation: alice user_id: Alice collections: rooms_and_buildings: @@ -205,14 +229,17 @@ tokens: incoming_label: new_rooms_and_buildings # The following entry defines a hashed token because the key `hashed` is set - # to `True`. A hashed token has the structure - # `-`. It will match an incoming token if the incoming token has - # the structure `-` and if sha256(``) equals ``. 
- # In this example, if the client presents the token `bob-hello`, he will be - # granted access because `sha256('hello')` equals - # `2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824` - bob-2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824: + # to `True`. A hashed token representation is the hex-digit representation of + # the sha-256 checksum of the plain token. + # In this example, if the client presents the token `hello`, he will be + # granted access because `dump-things-hash-token 'hello'` yields + # `90b1b286043f1b7612e423c74608f5ea2f676340507f0b67219b20d09fc4777b`, i.e. + # sha256('hello') == 90b1b286043f1b7612e423c74608f5ea2f676340507f0b67219b20d09fc4777b + # is true. + hashed_token_1: + representation: 90b1b286043f1b7612e423c74608f5ea2f676340507f0b67219b20d09fc4777b hashed: True + user_id: Walter collections: rooms_and_buildings: mode: WRITE_COLLECTION @@ -250,6 +277,7 @@ collections: collection_with_default_record_dir+stl_backend: # This is a collection with the default backend, i.e. `record_dir+stl` and # the default authentication, i.e. config-based authentication. + schema: https://concepts.inm7.de/s/flat-data/unreleased.yaml default_token: anon_read curated: collection_1/curated @@ -268,6 +296,7 @@ collections: # example by the forgejo-instance at `https://forgejo.example.com`. # If there is more than one authentication source, they will be tried # in the order they are defined in the config file. + schema: https://concepts.inm7.de/s/flat-data/unreleased.yaml default_token: anon_read # We still need a default token curated: collection_2/curated @@ -313,6 +342,7 @@ collections: # permissions, user-id, and incoming from the config file. 
collection_with_explicit_record_dir+stl_backend: + schema: https://concepts.inm7.de/s/flat-data/unreleased.yaml default_token: anon_read curated: collection_3/curated backend: @@ -322,6 +352,7 @@ collections: type: record_dir+stl collection_with_sqlite_backend: + schema: https://concepts.inm7.de/s/flat-data/unreleased.yaml default_token: anon_read curated: collection_4/curated backend: @@ -330,9 +361,18 @@ collections: # that holds the URL of the schema that should # be used in this backend. type: sqlite - schema: https://concepts.inm7.de/s/flat-data/unreleased.yaml ``` +#### Reserved names + +The following collection names are reserved and must not be used: + +- collections +- tokens +- admin_tokens +- __dump_things__ + + #### Authentication and authorization To authenticate and authorize a user based on tokens, dumpthing-service uses @@ -683,6 +723,59 @@ A `CURATOR`-token required to access these endpoints. Details about the curation endpoints can be found in [this issue](https://codeberg.org/datalink/dump-things-server/issues/118). +#### Administration endpoints + +Operations on the endpoints described in this section require an administrator token. +If desired, use `dump-things-load-config` to read the configuration from a file and +generate respective POST-requests. `dump-things-load-config` can also be used to +generate a configuration from an old, i.e. dump-things version < 6, configuration file. + +##### Collections + +- `POST /collections`: create a new collection from the posted configuration object. + For a specification of the configuration object see the object `CollectionConfig` in the file `dump_things_service/abstract_config.py` + (Use `dump-things-load-config` to read the configuration from a file and generate respective POST-requests) + +- `GET /collections`: get information about the currently existing collections. + +- `GET /collections/`: get information about the collection with name ``. 
+ +- `DELETE /collections/`: delete the collection with the given name. + Note: deleting a collection does not delete any records or any storage dir, it just removes the + collection from the internal state of the service. Recreating it (via `POST /collections`) will make + all data reachable again through the Web-API. + + +##### Tokens + +- `POST /tokens`: create a new token from the posted configuration object. + For a specification of the configuration object see the object `TokenRequest` in the file `dump_things_service/token_endpoints.py` + NOTE: Before a token for configuration can be generated, the collection must exist. + +- `GET /tokens`: get information about the currently existing tokens. + +- `GET /tokens/`: get information about the token with name ``. + +- `DELETE /tokens/`: delete the token with the given name. + +- this endpoint (ending on `.../p/`) provides the same functionality as the endpoint `GET //records/` (without `.../p/`) but supports result pagination. In addition to the query parameters `format` and `matching`, it supports the query parameters `page` and `size`. + + +##### Admin Tokens + +- `POST /admin_tokens`: create a new admin token from the posted configuration object. + For a specification of the configuration object see the object `AdminTokenRequest` in the file `dump_things_service/token_endpoints.py` + Note that admin token are always stored as hashed values. + Therefore the representation in the request should be `sha256()` + +- `GET /admin_tokens`: get information about the currently existing admin tokens. + +- `GET /admin_tokens/`: get information about the admin token with name ``. + +- `DELETE /admin_tokens/`: delete the admin token with the given name. + + + ### Tips & Tricks @@ -764,6 +857,33 @@ If any backend is a `record_dir+stl` backend, a schema has to be supplied via th - `dump-things-create-merged-schema`: this command creates a new schema that statically contains all schemas that the original schema imports. 
The new schema is fully self-contained and does not reference any other schemas. + +### Migrate to version 6 + +Migration to version 6 is simple. It involves the following steps: + + +1. Start the version 6 service on the store that you used and provide an +administrator token +``` +> dump-things-service --admin-token admin-1 +``` + +2. Use `dump-things-load-config` to load the old configuration from the +configuration file that you used in the old version. + +``` +> export DTS_ADMIN_TOKEN=admin-1 +> dump-things-load-config --send-to "https://" --old-config /.dumpthings.yaml +``` + +At this point the service should be running and be configured exactly as +before. The configuration is persisted and will be etablished next time the +service starts. + + + + ### If things go wrong #### Delete a record manually -- 2.52.0 From 907f5ffbb6f22389377f74181e5bdd4d76a0cbef Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 22 May 2026 13:31:35 +0200 Subject: [PATCH 31/64] bump version to 6.0.0b1 --- dump_things_service/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dump_things_service/__about__.py b/dump_things_service/__about__.py index 2c06c79..f11c59f 100644 --- a/dump_things_service/__about__.py +++ b/dump_things_service/__about__.py @@ -1 +1 @@ -__version__ = '5.6.1' +__version__ = '6.0.0b1' -- 2.52.0 From a853578482d0d7743b2c54a6421ae2f3ade0b6cd Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 22 May 2026 13:34:50 +0200 Subject: [PATCH 32/64] add MIT license --- LICENSE | 24 ++++++++++++++++++++++++ pyproject.toml | 1 + 2 files changed, 25 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..fc7de03 --- /dev/null +++ b/LICENSE @@ -0,0 +1,24 @@ +things-graph-renderer, including all examples, code snippets and attached +documentation is covered by the MIT license. 
+ + The MIT License + + Copyright (c) 2026- Michael Hanke + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. diff --git a/pyproject.toml b/pyproject.toml index 7ead1fb..99abeed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ authors = [ { name = "Christian Mönch", email = "christian.moench@web.de" }, ] classifiers = [ + "License :: OSI Approved :: MIT License", "Development Status :: 4 - Beta", "Programming Language :: Python", "Programming Language :: Python :: 3.8", -- 2.52.0 From 295e31e8bb2eac12e3983bfaf8c235628d978fd1 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 26 May 2026 09:14:30 +0200 Subject: [PATCH 33/64] add `--admin-token-hash`, remove `--admin-token` This commit removes the option `--admin-token` and adds the option `--admin-token-hash`, which accepts a hashed token (sha256 in 64-digits hex representation). 
--- dump_things_service/admin.py | 4 +-- dump_things_service/main.py | 37 +++++++++++++++++++++------ dump_things_service/tests/fixtures.py | 4 +-- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/dump_things_service/admin.py b/dump_things_service/admin.py index 2f95b59..20a19c0 100644 --- a/dump_things_service/admin.py +++ b/dump_things_service/admin.py @@ -19,11 +19,11 @@ def authenticate_admin( api_key: str, ): if api_key: - if api_key == instance_state.bootstrap_token: + hashed_token_representation = hash_token_representation(api_key) + if hashed_token_representation == instance_state.bootstrap_token: logger.info('authenticate_admin: using bootstrap token') return - hashed_token_representation = hash_token_representation(api_key) for token_name, token_config in abstract_config.admin_tokens.items(): if token_config.representation == hashed_token_representation: logger.info( diff --git a/dump_things_service/main.py b/dump_things_service/main.py index 7e85f7a..b297a22 100644 --- a/dump_things_service/main.py +++ b/dump_things_service/main.py @@ -3,6 +3,7 @@ from __future__ import annotations # noqa: I001 -- the patches have to be impor import argparse import logging import os +import sys from pathlib import Path from typing import TYPE_CHECKING @@ -40,6 +41,7 @@ from dump_things_service import ( from dump_things_service.__about__ import __version__ from dump_things_service.abstract_config import ( check_collection, + hash_token_representation, read_config, ) from dump_things_service.api_key import api_key_header_scheme @@ -57,7 +59,10 @@ from dump_things_service.lazy_list import ( ) from dump_things_service.model import get_subclasses from dump_things_service.collection_endpoints import router as collection_router -from dump_things_service.token_endpoints import router as token_router +from dump_things_service.token_endpoints import ( + hash_matcher, + router as token_router, +) from dump_things_service.utils import ( authenticate_token, 
check_bounds, @@ -99,10 +104,12 @@ parser.add_argument('--host', default='0.0.0.0') # noqa S104 parser.add_argument('--port', default=8000, type=int) parser.add_argument('--origins', action='append', default=[]) parser.add_argument( - '--admin-token', + '--admin-token-hash', type=str, default='', - help='An initial admin token that will allow to add or remove tokens and collections', + help='The sha256 hash of an initial admin token that will allow to add or ' + 'remove tokens, collections, and additional admin tokens (64 ' + 'characters hex-digit)', ) parser.add_argument( '--root-path', @@ -136,16 +143,30 @@ Curators store data in an incoming area or in the curated area and read data from any incoming area or the curated area. -For more information refer to the [README-file](https://github.com/christian-monch/dump-things-server?tab=readme-ov-file#dump-things-service) +For more information refer to the [README-file](https://hub.psychoinformatics.de/orinoco/dump-things-server) of the project. 
""" arguments = parser.parse_args() -# Get bootstrap token from environment -if not arguments.admin_token: - arguments.admin_token = os.environ.get('DTS_ADMIN_TOKEN', '') +# Try to get bootstrap token from environment if an admin token hash is +# not provided via option +if not arguments.admin_token_hash: + if 'DTS_ADMIN_TOKEN' in os.environ: + arguments.admin_token_hash = hash_token_representation( + os.environ.get('DTS_ADMIN_TOKEN', ''), + ) +else: + # Validate the hash token format + if not hash_matcher.match(arguments.admin_token_hash): + print( + 'Hashed admin token is not a 64-digits hex-number', + file=sys.stderr, + flush=True, + ) + sys.exit(1) + # Set the log level numeric_level = getattr(logging, arguments.log_level.upper(), None) @@ -192,7 +213,7 @@ add_pagination(app) g_instance_state = create_instance_state( store_path=store_path, - bootstrap_token=arguments.admin_token, + bootstrap_token=arguments.admin_token_hash, fastapi_app=app, ) diff --git a/dump_things_service/tests/fixtures.py b/dump_things_service/tests/fixtures.py index a207428..f270b0b 100644 --- a/dump_things_service/tests/fixtures.py +++ b/dump_things_service/tests/fixtures.py @@ -13,7 +13,7 @@ from dump_things_service.abstract_config import ( GitAuditBackendConfig, SQLiteBackendConfig, TokenCollectionConfig, - TokenModes, + TokenModes, hash_token_representation, ) from dump_things_service.backends import StorageBackend from dump_things_service.backends.record_dir import RecordDirStore @@ -390,7 +390,7 @@ def fastapi_app_simple(dump_stores_simple): old_sys_argv = sys.argv sys.argv = [ 'test-runner', - '--admin-token', 'admin-1', + '--admin-token-hash', hash_token_representation('admin-1'), str(tmp_path), ] from dump_things_service.main import app -- 2.52.0 From b4ca7f555b5167392f1cb1626ac55ea2a7d8cb45 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 26 May 2026 11:41:52 +0200 Subject: [PATCH 34/64] refactor imports --- dump_things_service/instance_state.py | 4 +--- 1 file changed, 
1 insertion(+), 3 deletions(-) diff --git a/dump_things_service/instance_state.py b/dump_things_service/instance_state.py index d5e1c94..359a185 100644 --- a/dump_things_service/instance_state.py +++ b/dump_things_service/instance_state.py @@ -23,9 +23,7 @@ from dump_things_service.abstract_config import ( ) from dump_things_service.converter import get_conversion_objects -from dump_things_service.exceptions import ( - ConfigError, -) +from dump_things_service.exceptions import ConfigError from dump_things_service.model import ( get_model_for_schema, get_schema_model_for_schema, -- 2.52.0 From 3675cac64e67d74724360928d9addf6f99c6d79f Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 26 May 2026 14:00:54 +0200 Subject: [PATCH 35/64] fix forgejo instance id setting --- dump_things_service/collection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index d2c552a..11560cd 100644 --- a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -351,7 +351,7 @@ def create_authentication_source( organization=authentication_spec.organization, team=authentication_spec.team, label_type=authentication_spec.label_type, - instance_id=authentication_spec.repository, + instance_id=authentication_spec.instance_id, repository=authentication_spec.repository, ) else: -- 2.52.0 From 171f38bc735172d00e2e72e2c0f030ea8f231d7b Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 26 May 2026 14:14:22 +0200 Subject: [PATCH 36/64] initialize empty store from configuration file --- README.md | 2 +- dump_things_service/abstract_config.py | 37 +++++- dump_things_service/collection.py | 15 ++- dump_things_service/collection_endpoints.py | 24 ++-- dump_things_service/commands/load_config.py | 6 + dump_things_service/main.py | 123 +++++++++++++++++- dump_things_service/tests/fixtures.py | 56 +------- .../tests/test_collection_administration.py | 26 +++- 
dump_things_service/tests/test_config.py | 12 +- 9 files changed, 218 insertions(+), 83 deletions(-) diff --git a/README.md b/README.md index 111c3d8..18bdaae 100644 --- a/README.md +++ b/README.md @@ -733,7 +733,7 @@ generate a configuration from an old, i.e. dump-things version < 6, configuratio ##### Collections - `POST /collections`: create a new collection from the posted configuration object. - For a specification of the configuration object see the object `CollectionConfig` in the file `dump_things_service/abstract_config.py` + For a specification of the configuration object see the object `CollectionRequest` in the file `dump_things_service/collection_endpoints.py` (Use `dump-things-load-config` to read the configuration from a file and generate respective POST-requests) - `GET /collections`: get information about the currently existing collections. diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index 8b29784..dff0477 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -62,6 +62,7 @@ class ForgejoAuthSpec(BaseModel): organization: str team: str label_type: Literal['team', 'user'] + instance_id: str | None = None repository: str | None = None @@ -88,7 +89,6 @@ class GitAuditBackendConfig(StrictModel): class CollectionConfig(BaseModel): model_config = ConfigDict(extra='forbid', use_enum_values=True) - name: str default_token: str curated: PurePosixPath schema_location: str = Field(alias='schema') @@ -154,6 +154,41 @@ class Configuration(BaseModel): pid: str = dump_things_config_iri +class TokenConfig_v1(StrictModel): + user_id: str + collections: dict[str, TokenCollectionConfig] + hashed: bool = False + + +class RecordDirBackendConfig_v1(StrictModel): + model_config = ConfigDict(use_enum_values=True) + type: Literal['record_dir', 'record_dir+stl'] + + +class SQLiteBackendConfig_v1(StrictModel): + type: Literal['sqlite', 'sqlite+stl'] + schema: str + + +class 
CollectionConfig_v1(StrictModel): + default_token: str + curated: Path + incoming: Path | None = None + backend: RecordDirBackendConfig_v1 | SQLiteBackendConfig_v1 | None = None + auth_sources: list[ForgejoAuthSpec | ConfigAuthSpec] = [ConfigAuthSpec()] + submission_tags: TagSpec = TagSpec() + use_classes: list[str] = [] + ignore_classes: list[str] = [] + audit_backends: list[GitAuditBackendConfig] = [] + + +class ConfigurationFile_v1(StrictModel): + type: Literal['collections'] + version: Literal[1] + collections: dict[str, CollectionConfig_v1] + tokens: dict[str, TokenConfig_v1] + + mode_mapping = { TokenModes.READ_CURATED: TokenPermission(curated_read=True), TokenModes.READ_COLLECTION: TokenPermission( diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index 11560cd..54b73d6 100644 --- a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -267,6 +267,7 @@ def create_collection( # inbox-storing, and for curated area storing. create_endpoints_for_collection( instance_state, + collection_name, collection_configuration, instance_state.fastapi_app, ) @@ -452,6 +453,7 @@ def create_endpoint( operation_name: str, operation_path: str, instance_state: InstanceState, + collection_name: str, collection_config: CollectionConfig, template: str, handler: str, @@ -462,10 +464,9 @@ def create_endpoint( logger.info( f'Creating %s-endpoints for collection: "%s"', operation_name, - collection_config.name, + collection_name, ) - collection_name = collection_config.name instance_state.collections[collection_name].tag_info[tag_group] = tag_name # TODO: get schema_info from instance_state!? 
@@ -504,6 +505,7 @@ def create_endpoint( def create_endpoints_for_collection( instance_state: InstanceState, + collection_name: str, collection_config: CollectionConfig, app: FastAPI, ): @@ -515,15 +517,16 @@ def create_endpoints_for_collection( tag_group, tag_name, ) in ( - ('store', 'record', _endpoint_template, 'store_record', 'write', f'Write records to collection "{collection_config.name}"'), - ('validate', 'validate/record', _endpoint_template, 'validate_record', 'validate', f'Validate records for collection "{collection_config.name}"'), - ('curated', 'curated/record', _endpoint_curated_template, 'store_curated_record', 'curated_write', f'Curated area: store records in curated area of collection "{collection_config.name}"'), - ('incoming', 'incoming/{label}/record', _endpoint_incoming_template, 'store_incoming_record', 'incoming_write', f'Incoming area: store records in incoming area "{{label}}" of collection "{collection_config.name}"'), + ('store', 'record', _endpoint_template, 'store_record', 'write', f'Write records to collection "{collection_name}"'), + ('validate', 'validate/record', _endpoint_template, 'validate_record', 'validate', f'Validate records for collection "{collection_name}"'), + ('curated', 'curated/record', _endpoint_curated_template, 'store_curated_record', 'curated_write', f'Curated area: store records in curated area of collection "{collection_name}"'), + ('incoming', 'incoming/{label}/record', _endpoint_incoming_template, 'store_incoming_record', 'incoming_write', f'Incoming area: store records in incoming area "{{label}}" of collection "{collection_name}"'), ): create_endpoint( operation_name=operation_name, operation_path=operation_path, instance_state=instance_state, + collection_name=collection_name, collection_config=collection_config, template=template, handler=handler, diff --git a/dump_things_service/collection_endpoints.py b/dump_things_service/collection_endpoints.py index 60eef9f..15954fd 100644 --- 
a/dump_things_service/collection_endpoints.py +++ b/dump_things_service/collection_endpoints.py @@ -21,9 +21,11 @@ from dump_things_service import ( reserved_collection_names, ) from dump_things_service.abstract_config import ( - store_config, + Configuration, CollectionConfig, - get_config, Configuration, + StrictModel, + store_config, + get_config, ) from dump_things_service.admin import authenticate_admin from dump_things_service.api_key import api_key_header_scheme @@ -56,6 +58,13 @@ class TagSpec(BaseModel): submission_time_tag: str = 'http://semanticscience.org/resource/SIO_001083' +from pydantic import ConfigDict, Field +from dump_things_service.abstract_config import RecordDirBackendConfig, SQLiteBackendConfig, GitAuditBackendConfig + +class CollectionRequest(CollectionConfig): + name: str + + @router.post( '/collections', tags=['Administration interface'], @@ -64,7 +73,7 @@ class TagSpec(BaseModel): ) async def create_collection( response: Response, - body: CollectionConfig, + body: CollectionRequest, api_key: str = Depends(api_key_header_scheme), ): @@ -74,11 +83,6 @@ async def create_collection( # Check admin rights authenticate_admin(instance_state, abstract_config, api_key) - # TODO: read the current abstract configuration, check for a collection - # of the given name. If it does not exist yet, add a collection - # configuration that reflects the `body`. Then try to manifest the - # new configuration. If there are no errors, persist the new - # configuration. 
# Check for existing collection name if body.name in abstract_config.collections: raise HTTPException( @@ -124,14 +128,14 @@ async def create_collection( ) async def get_collections( api_key: str = Depends(api_key_header_scheme), -) -> list[CollectionConfig]: +) -> dict[str, CollectionConfig]: instance_state = get_instance_state() abstract_config = get_config() # Check admin rights authenticate_admin(instance_state, abstract_config, api_key) - return list(abstract_config.collections.values()) + return abstract_config.collections @router.get( diff --git a/dump_things_service/commands/load_config.py b/dump_things_service/commands/load_config.py index adf057b..7af0a3b 100644 --- a/dump_things_service/commands/load_config.py +++ b/dump_things_service/commands/load_config.py @@ -122,6 +122,12 @@ def convert_to_new_format( collection_config['schema'] = backend['schema'] del backend['schema'] elif backend['type'].startswith('record_dir'): + if collection_name not in schema_map: + msg = ( + f'Schema specification for collection {collection_name} ' + 'missing' + ) + raise RuntimeError(msg) collection_config['schema'] = schema_map[collection_name] else: msg = f'Unknown backend type: "{backend["type"]}" in collection: "{collection_name}"' diff --git a/dump_things_service/main.py b/dump_things_service/main.py index b297a22..214c0b9 100644 --- a/dump_things_service/main.py +++ b/dump_things_service/main.py @@ -7,6 +7,11 @@ import sys from pathlib import Path from typing import TYPE_CHECKING +import yaml + +from dump_things_service.abstract_config import store_config +from dump_things_service.commands.load_config import convert_to_new_format +from dump_things_service.exceptions import ConfigError from dump_things_service.manifest import manifest_configuration # Perform the patching before importing any third-party libraries from dump_things_service.patches import enabled # noqa F401 -- used by generated code @@ -40,6 +45,7 @@ from dump_things_service import ( ) from 
dump_things_service.__about__ import __version__ from dump_things_service.abstract_config import ( + Configuration, check_collection, hash_token_representation, read_config, @@ -52,7 +58,8 @@ from dump_things_service.converter import ( from dump_things_service.curated import router as curated_router from dump_things_service.exceptions import CurieResolutionError from dump_things_service.incoming import router as incoming_router -from dump_things_service.instance_state import create_instance_state +from dump_things_service.instance_state import create_instance_state, \ + InstanceState from dump_things_service.lazy_list import ( PriorityList, ModifierList, @@ -111,6 +118,27 @@ parser.add_argument( 'remove tokens, collections, and additional admin tokens (64 ' 'characters hex-digit)', ) +parser.add_argument( + '-c', + '--config', + metavar='CONFIG_FILE', + help="Read the configuration from 'CONFIG_FILE' if no persisted " + "configuration is found in the data store root directory, and " + "initialize the persistent configuration and the service state with " + "the values in 'CONFIG_FILE'.", +) +parser.add_argument( + '--schema-location', + action='append', + default=[], + help='If a configuration file is read from disk and is converted from ' + 'version 1 to version 2, and a record_dir-backend configuration ' + 'cannot be read from disk, this option can be used to specify schemas ' + 'for collections. The value will only be used, if the schema location ' + 'cannot be automatically determined, i.e., cannot be read from disk. ' + 'The format is ":". 
The option can ' + 'be repeated.', +) parser.add_argument( '--root-path', default='', @@ -123,7 +151,7 @@ parser.add_argument( ) parser.add_argument( 'store', - help='The root of the data stores, it should contain a global_store and token_stores.', + help='The root of the data store, it should contain a global_store and token_stores.', ) @@ -221,6 +249,97 @@ g_instance_state = create_instance_state( g_configuration = read_config(store_path) +def initialize_from_config_file( + instance_state: InstanceState, + config_file: str, + schema_locations: list[str], +) -> Configuration: + with open(config_file) as f: + config_dict = yaml.safe_load(f) + + config_version = config_dict['version'] + if config_version == 1: + logger.info( + 'Converting version 1 configuration at %s', + arguments.config, + ) + config_dict = convert_version_1_2( + instance_state, + config_dict, + schema_locations, + ) + elif config_version != 2: + msg = f'Invalid version in config file: {config_version}' + raise ValueError(msg) + + return Configuration(**config_dict) + + +def convert_version_1_2( + instance_state: InstanceState, + config_dict: dict, + schema_locations: list[str], +) -> dict: + + on_disk_schema_locations = {} + for collection_name, collection_info in config_dict['collections'].items(): + if collection_info['backend']['type'].startswith('record_dir'): + schema_location = get_schema_location( + instance_state.store_path / collection_info['curated'] + ) + if schema_location: + on_disk_schema_locations[collection_name] = schema_location + + provided_schema_locations = { + spec.split(':', maxsplit=1)[0]: spec.split(':', maxsplit=1)[1] + for spec in schema_locations + } + + return convert_to_new_format( + config_dict, + [ + f'{collection_name}:{schema_location}' + for collection_name, schema_location in { + **provided_schema_locations, + **on_disk_schema_locations, + } + ], + ) + + +def get_schema_location(backend_path: Path): + from instance_state import get_record_dir_config + + try: + 
record_dir_config = get_record_dir_config(backend_path) + except ConfigError: + return None + return record_dir_config.schema + + +# If the configuration is empty, check for configuration option +if not ( + g_configuration.admin_tokens + or g_configuration.collections + or g_configuration.tokens +): + if arguments.config: + logger.info( + 'Initializing empty persisted configuration from %s', + arguments.config, + ) + g_configuration = initialize_from_config_file( + g_instance_state, + arguments.config, + arguments.schema_location, + ) + # Persist the configuration + store_config( + store_path=g_instance_state.store_path, + config=g_configuration, + ) + + manifest_configuration( configuration=g_configuration, instance_state=g_instance_state, diff --git a/dump_things_service/tests/fixtures.py b/dump_things_service/tests/fixtures.py index f270b0b..1604562 100644 --- a/dump_things_service/tests/fixtures.py +++ b/dump_things_service/tests/fixtures.py @@ -9,7 +9,6 @@ import pytest import yaml from dump_things_service.abstract_config import ( - CollectionConfig, GitAuditBackendConfig, SQLiteBackendConfig, TokenCollectionConfig, @@ -21,12 +20,12 @@ from dump_things_service.backends.sqlite import ( SQLiteBackend, record_file_name as sqlite_db_filename, ) +from dump_things_service.collection_endpoints import CollectionRequest from dump_things_service.instance_state import get_mapping_function_by_name from dump_things_service.model import get_model_for_schema from dump_things_service.resolve_curie import resolve_curie from dump_things_service.token_endpoints import TokenRequest from dump_things_service.tests.create_store import ( - create_store, pid, pid_curated, pid_trr, @@ -48,7 +47,7 @@ flat_social_schema_location = 'https://concepts.datalad.org/s/flat-social/unrele # The test store is created empty and collections are added via the admin # web interface. 
g_default_collections = [ - CollectionConfig( + CollectionRequest( name=f'collection_{i}', default_token='test_default_token', curated=PurePosixPath(f'{curated}/collection_{i}'), @@ -59,7 +58,7 @@ g_default_collections = [ ] g_default_collections.append( - CollectionConfig( + CollectionRequest( name=f'collection_8', default_token='test_default_token', curated=PurePosixPath(f'{curated}/collection_8'), @@ -72,14 +71,14 @@ g_default_collections.append( ) g_default_collections.extend([ - CollectionConfig( + CollectionRequest( name='collection_dlflatsocial-1', schema=flat_social_schema_location, default_token='test_default_token', curated=PurePosixPath(f'{curated}/collection_dlflatsocial-1'), incoming=PurePosixPath(f'{incoming}/collection_dlflatsocial-1'), ), - CollectionConfig( + CollectionRequest( name='collection_dlflatsocial-2', schema=flat_social_schema_location, default_token='test_default_token', @@ -335,52 +334,7 @@ g_default_entries['collection_dlflatsocial-2'] = [('Person', pid_trr, test_recor def dump_stores_simple(tmp_path_factory): tmp_path = tmp_path_factory.mktemp('dump_store') audit_store_path = tmp_path_factory.mktemp('audit_store') - return tmp_path, audit_store_path - #final_config_text = global_config_text.format(audit_store_path=str(audit_store_path)) - #(tmp_path / config_file_name).write_text(final_config_text) - - default_entries = { - f'collection_{i}': [('Person', pid, test_record)] for i in range(1, 9) - } - for collection_id in (1, 8): - default_entries[f'collection_{collection_id}'].extend( - [ - ('Person', pid_curated, test_record_curated), - ( - 'Person', - 'abc:mode_test', - 'pid: abc:mode_test\ngiven_name: mode_curated\nschema_type: abc:Person\n', - ), - ] - ) - default_entries['collection_dlflatsocial-1'] = [('Person', pid_trr, test_record_trr)] - default_entries['collection_dlflatsocial-2'] = [('Person', pid_trr, test_record_trr)] - - create_store( - root_dir=tmp_path, - 
abstract_config=GlobalConfig(**yaml.safe_load(final_config_text)), - per_collection_info={ - 'collection_1': (str(test_schema_location), 'digest-md5'), - 'collection_2': (str(test_schema_location), 'digest-md5-p3'), - 'collection_3': (str(test_schema_location), 'digest-sha1'), - 'collection_4': (str(test_schema_location), 'digest-sha1-p3'), - 'collection_5': (str(test_schema_location), 'after-last-colon'), - 'collection_6': (str(test_schema_location), 'digest-md5-p3-p3'), - 'collection_7': (str(test_schema_location), 'digest-sha1-p3-p3'), - 'collection_8': (str(test_schema_location), 'digest-md5'), - 'collection_dlflatsocial-1': ( - 'https://concepts.datalad.org/s/flat-social/unreleased.yaml', - 'digest-md5', - ), - 'collection_dlflatsocial-2': ( - 'https://concepts.datalad.org/s/flat-social/unreleased.yaml', - 'digest-md5', - ), - }, - default_entries=default_entries, - ) - return tmp_path @pytest.fixture(scope='session') diff --git a/dump_things_service/tests/test_collection_administration.py b/dump_things_service/tests/test_collection_administration.py index 8d8901f..53682db 100644 --- a/dump_things_service/tests/test_collection_administration.py +++ b/dump_things_service/tests/test_collection_administration.py @@ -12,11 +12,11 @@ from dump_things_service import ( HTTP_401_UNAUTHORIZED, ) from dump_things_service.abstract_config import ( - CollectionConfig, TokenCollectionConfig, TokenModes, hash_token_representation, ) +from dump_things_service.collection_endpoints import CollectionRequest from dump_things_service.token_endpoints import ( TokenRequest, AdminTokenRequest, @@ -33,7 +33,7 @@ test_schema_location = str((Path(__file__).parent / 'testschema.yaml').absolute( new_collection_name = 'admin_test_collection' new_token_name = 'admin_test_token' new_token_representation = 'admin_test_token' -new_collection_config = CollectionConfig( +new_collection_request = CollectionRequest( name=new_collection_name, default_token='test_default_token', 
curated=PurePosixPath(f'{curated}/admin_test_collection'), @@ -76,7 +76,7 @@ def _name_in_openapi_paths( def test_collection_adding(fastapi_client_simple): test_client, _ = fastapi_client_simple - # Check the the collection does not yet exist + # Check that the collection does not yet exist response = test_client.get( f'/collections/{new_collection_name}', headers={'x-dumpthings-token': 'admin-1'}, @@ -88,7 +88,7 @@ def test_collection_adding(fastapi_client_simple): response = test_client.post( '/collections', headers={'x-dumpthings-token': 'admin-1'}, - json=new_collection_config.model_dump(mode='json', by_alias=True), + json=new_collection_request.model_dump(mode='json', by_alias=True), ) assert response.status_code == HTTP_201_CREATED assert _name_in_openapi_paths(test_client, new_collection_name) @@ -98,7 +98,9 @@ def test_collection_adding(fastapi_client_simple): headers={'x-dumpthings-token': 'admin-1'}, ) assert response.status_code == HTTP_200_OK - assert response.json() == new_collection_config.model_dump(mode='json', by_alias=True) + new_collection_config = new_collection_request.model_dump(mode='json', by_alias=True) + del new_collection_config['name'] + assert response.json() == new_collection_config # Add a token to the collection response = test_client.post( @@ -175,6 +177,20 @@ def test_collection_adding(fastapi_client_simple): assert not _name_in_openapi_paths(test_client, new_collection_name) +def test_collection_reading(fastapi_client_simple): + test_client, _ = fastapi_client_simple + + # Check that the new admin token is not yet working + response = test_client.get( + f'/collections', + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_200_OK + response_object = response.json() + assert isinstance(response_object, dict) + assert len(response_object) == 10 + + def test_admin_token_management(fastapi_client_simple): test_client, _ = fastapi_client_simple diff --git a/dump_things_service/tests/test_config.py 
b/dump_things_service/tests/test_config.py index 295d0ee..fadbb33 100644 --- a/dump_things_service/tests/test_config.py +++ b/dump_things_service/tests/test_config.py @@ -7,16 +7,14 @@ from pydantic import ValidationError from yaml.scanner import ScannerError from dump_things_service import HTTP_409_CONFLICT -from dump_things_service.abstract_config import ( - CollectionConfig, - dump_things_private_collection_name, -) +from dump_things_service.abstract_config import dump_things_private_collection_name +from dump_things_service.collection_endpoints import CollectionRequest from dump_things_service.exceptions import ConfigError from . import schema_file -collection_config_pattern = CollectionConfig( +collection_request_pattern = CollectionRequest( name='', schema=str(schema_file), default_token='test_default_token', @@ -37,7 +35,7 @@ def test_illegal_collection_name_detection(fastapi_client_simple): response = test_client.post( f'/collections', json={ - **collection_config_pattern.model_dump(mode='json', by_alias=True), + **collection_request_pattern.model_dump(mode='json', by_alias=True), 'name': name, }, headers={'x-dumpthings-token': 'admin-1'}, @@ -56,7 +54,7 @@ def test_collection_dir_reuse_detection(fastapi_client_simple): response = test_client.post( f'/collections', json={ - **collection_config_pattern.model_dump(mode='json', by_alias=True), + **collection_request_pattern.model_dump(mode='json', by_alias=True), 'curated': curated_path, 'incoming': incoming_path, }, -- 2.52.0 From 1b286d4742662f551bc54a325f7f985d5b7a7cae Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 26 May 2026 15:33:08 +0200 Subject: [PATCH 37/64] fix default token name setting in config conversion Use the correct token names, when converting version 1 configuration files to version 2 configuration files. 
--- dump_things_service/commands/load_config.py | 26 ++++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/dump_things_service/commands/load_config.py b/dump_things_service/commands/load_config.py index 7af0a3b..18453d1 100644 --- a/dump_things_service/commands/load_config.py +++ b/dump_things_service/commands/load_config.py @@ -116,6 +116,21 @@ def convert_to_new_format( for spec in schema_spec } + counter = count(1) + new_tokens_dict = { + f'token_{next(counter)}': { + **old_token_config.copy(), + 'representation': token_representation, + 'hashed': False + } + for token_representation, old_token_config in old_configuration['tokens'].items() + } + + old_to_new_token_mapping = { + token_config['representation']: token_name + for token_name, token_config in new_tokens_dict.items() + } + for collection_name, collection_config in old_configuration['collections'].items(): backend = collection_config['backend'] if backend['type'].startswith('sqlite'): @@ -132,19 +147,12 @@ def convert_to_new_format( else: msg = f'Unknown backend type: "{backend["type"]}" in collection: "{collection_name}"' raise RuntimeError(msg) + collection_config['default_token'] = old_to_new_token_mapping[collection_config['default_token']] - counter = count(1) new_configuration = { 'type': old_configuration['type'], 'version': 2, - 'tokens': { - f'token_{next(counter)}': { - **old_token_config.copy(), - 'representation': token_representation, - 'hashed': False - } - for token_representation, old_token_config in old_configuration['tokens'].items() - }, + 'tokens': new_tokens_dict, 'collections': old_configuration['collections'], 'admin_tokens': {}, } -- 2.52.0 From f6a525c8c5b20fba5f05a3f530509d03938c4c5e Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 26 May 2026 17:09:36 +0200 Subject: [PATCH 38/64] fix backend handling in config format conversion --- dump_things_service/commands/load_config.py | 58 +++++++++----------- dump_things_service/main.py | 61 
+-------------------- 2 files changed, 29 insertions(+), 90 deletions(-) diff --git a/dump_things_service/commands/load_config.py b/dump_things_service/commands/load_config.py index 18453d1..11e73ad 100644 --- a/dump_things_service/commands/load_config.py +++ b/dump_things_service/commands/load_config.py @@ -4,11 +4,13 @@ import os import sys from argparse import ArgumentParser from itertools import count +from pathlib import Path import requests import yaml -#from dump_things_service.token_endpoints import create_admin_token +from dump_things_service.instance_state import get_record_dir_config + parser = ArgumentParser( prog='Establish a configuration in a running service', @@ -38,14 +40,13 @@ parser.add_argument( 'a `schema`-attribute).', ) parser.add_argument( - '--schema', - action='append', - default=[], + '--store', + default=None, help='If --old-format is provided, this option can be used to specify a ' - 'schema for a collection. This is necessary, if the collection has ' - 'a `record-dir`-backend.' - ' ' - 'The format is ":".', + 'store directory. The store directory will be used to load `RecordDir` ' + 'configurations, if a collection defines are `RecordDir`-backend. 
' + '(This option has no effect if no collection in the old configuration ' + 'uses a `RecordDir`-backend.)', ) @@ -57,11 +58,11 @@ def main(): assert configuration['type'] == 'collections', '`type`-entry missing in old config-file' if arguments.old_format: - configuration = convert_to_new_format(configuration, arguments.schema) + configuration = convert_to_new_format(configuration, arguments.store) else: - if arguments.schema: + if arguments.store: print( - 'Warning: ignoring `--schema` option because `--old-format` ' + 'Warning: ignoring `--store` option because `--old-format` ' 'is not provided.', file=sys.stderr, flush=True, @@ -105,17 +106,11 @@ def main(): def convert_to_new_format( old_configuration: dict, - schema_spec: list[str], + store_path: str | Path, ) -> dict: - assert old_configuration['type'] == 'collections', '`type`-entry missing in old config-file' assert old_configuration['version'] == 1, '`version: 1` missing in old config-file' - schema_map = { - spec.split(':', maxsplit=1)[0]: spec.split(':', maxsplit=1)[1] - for spec in schema_spec - } - counter = count(1) new_tokens_dict = { f'token_{next(counter)}': { @@ -131,22 +126,23 @@ def convert_to_new_format( for token_name, token_config in new_tokens_dict.items() } + store_path = Path(store_path) for collection_name, collection_config in old_configuration['collections'].items(): - backend = collection_config['backend'] - if backend['type'].startswith('sqlite'): + backend = collection_config.get('backend') + if backend and backend['type'].startswith('sqlite'): collection_config['schema'] = backend['schema'] del backend['schema'] - elif backend['type'].startswith('record_dir'): - if collection_name not in schema_map: - msg = ( - f'Schema specification for collection {collection_name} ' - 'missing' - ) - raise RuntimeError(msg) - collection_config['schema'] = schema_map[collection_name] - else: - msg = f'Unknown backend type: "{backend["type"]}" in collection: "{collection_name}"' - raise 
RuntimeError(msg) + elif not backend or backend['type'].startswith('record_dir'): + if store_path is None: + msg = '--store has to be provided to convert collection with record_dir-backends' + raise ValueError(msg) + record_dir_config = get_record_dir_config(store_path / collection_config['curated']) + collection_config['schema'] = record_dir_config.schema + backend = { + 'type': 'record_dir+stl' if not backend else backend['type'], + 'mapping_method': record_dir_config.idfx.value + } + collection_config['backend'] = backend collection_config['default_token'] = old_to_new_token_mapping[collection_config['default_token']] new_configuration = { diff --git a/dump_things_service/main.py b/dump_things_service/main.py index 214c0b9..c2456b3 100644 --- a/dump_things_service/main.py +++ b/dump_things_service/main.py @@ -127,18 +127,6 @@ parser.add_argument( "initialize the persistent configuration and the service state with " "the values in 'CONFIG_FILE'.", ) -parser.add_argument( - '--schema-location', - action='append', - default=[], - help='If a configuration file is read from disk and is converted from ' - 'version 1 to version 2, and a record_dir-backend configuration ' - 'cannot be read from disk, this option can be used to specify schemas ' - 'for collections. The value will only be used, if the schema location ' - 'cannot be automatically determined, i.e., cannot be read from disk. ' - 'The format is ":". 
The option can ' - 'be repeated.', -) parser.add_argument( '--root-path', default='', @@ -252,7 +240,6 @@ g_configuration = read_config(store_path) def initialize_from_config_file( instance_state: InstanceState, config_file: str, - schema_locations: list[str], ) -> Configuration: with open(config_file) as f: config_dict = yaml.safe_load(f) @@ -263,10 +250,9 @@ def initialize_from_config_file( 'Converting version 1 configuration at %s', arguments.config, ) - config_dict = convert_version_1_2( - instance_state, + config_dict = convert_to_new_format( config_dict, - schema_locations, + instance_state.store_path, ) elif config_version != 2: msg = f'Invalid version in config file: {config_version}' @@ -275,48 +261,6 @@ def initialize_from_config_file( return Configuration(**config_dict) -def convert_version_1_2( - instance_state: InstanceState, - config_dict: dict, - schema_locations: list[str], -) -> dict: - - on_disk_schema_locations = {} - for collection_name, collection_info in config_dict['collections'].items(): - if collection_info['backend']['type'].startswith('record_dir'): - schema_location = get_schema_location( - instance_state.store_path / collection_info['curated'] - ) - if schema_location: - on_disk_schema_locations[collection_name] = schema_location - - provided_schema_locations = { - spec.split(':', maxsplit=1)[0]: spec.split(':', maxsplit=1)[1] - for spec in schema_locations - } - - return convert_to_new_format( - config_dict, - [ - f'{collection_name}:{schema_location}' - for collection_name, schema_location in { - **provided_schema_locations, - **on_disk_schema_locations, - } - ], - ) - - -def get_schema_location(backend_path: Path): - from instance_state import get_record_dir_config - - try: - record_dir_config = get_record_dir_config(backend_path) - except ConfigError: - return None - return record_dir_config.schema - - # If the configuration is empty, check for configuration option if not ( g_configuration.admin_tokens @@ -331,7 +275,6 @@ if not ( 
g_configuration = initialize_from_config_file( g_instance_state, arguments.config, - arguments.schema_location, ) # Persist the configuration store_config( -- 2.52.0 From 072283ec2701ee460d3d1443adf1512f0805760e Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 26 May 2026 17:30:43 +0200 Subject: [PATCH 39/64] update changelog, bump version --- CHANGELOG.md | 13 ++++++++++++- dump_things_service/__about__.py | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2819285..6cf4205 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,15 @@ -# 6.0.0b1 +# 6.0.0b2 (26.4.2026) + +# New feature + +- Support `-c/--config`-option. If this option is provided and if the service + has an empty persistent configuration store, the configuration store and the + service configuration will be initialized from the configuration file content. + If the configuration file is `version: 1`, it will be converted to the updated + configuration format. + + +# 6.0.0b1 (22.5.2026) ## New features diff --git a/dump_things_service/__about__.py b/dump_things_service/__about__.py index f11c59f..4ab72f5 100644 --- a/dump_things_service/__about__.py +++ b/dump_things_service/__about__.py @@ -1 +1 @@ -__version__ = '6.0.0b1' +__version__ = '6.0.0b2' -- 2.52.0 From 850071076f302ce873c41fc0a368b9ca859d6e32 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 26 May 2026 17:55:44 +0200 Subject: [PATCH 40/64] add -c/--config description to README.md --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 18bdaae..d4a05b0 100644 --- a/README.md +++ b/README.md @@ -47,13 +47,17 @@ The following command line parameters are supported: - `--port `: The port on which the service should accept connections (default: `8000`). +- `-c/--config `: provide a path to a configuration file (configuration file version 1). 
+ If no dynamically managed configuration is found in the data store, the dynamically managed configuration and the service state will be initialized with the content of the configuration file. + This allows an easy transition from dump-things-servers of version 5 and lower to version 6. + - `--origins `: add a CORS origin hosts (repeat to add multiple CORS origin URLs).` - `--root-path `: Set the ASGI 'root_path' for applications submounted below a given URL path. - `--log-level`: set the log level for the service, allowed values are `ERROR`, `WARNING`, `INFO`, `DEBUG`. The default-level is `WARNING`. -- `--admin_token `: set an administrator token. This token can be used to create and delete collections, tokens, and admin tokens. This is useful to configure the service if no admin token was yet created. +- `--admin_token-hash `: set an administrator token hash. This token can be used to create and delete collections, tokens, and admin tokens. This is useful to configure the service if no admin token was yet created. ```bash -- 2.52.0 From 639091474b4d0f1a2d01fd7e34851ebb0ba7cb1c Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 26 May 2026 18:06:35 +0200 Subject: [PATCH 41/64] add --admin-token-hash description to README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d4a05b0..f8a95f2 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,10 @@ The following command line parameters are supported: - `--log-level`: set the log level for the service, allowed values are `ERROR`, `WARNING`, `INFO`, `DEBUG`. The default-level is `WARNING`. -- `--admin_token-hash `: set an administrator token hash. This token can be used to create and delete collections, tokens, and admin tokens. This is useful to configure the service if no admin token was yet created. +- `--admin_token-hash `: set an administrator token hash. 
+ This plaintext token can be used to create and delete collections, tokens, and admin tokens. + This is useful to configure the service if no admin token was yet created. + NOTE: an admin token in plaintext is read from the environment variable `DTS_ADMIN_TOKEN` if the variable is set and this option is not provided. ```bash -- 2.52.0 From df8ec7f3ac5497df0c88922d17ab59ffe1da3c92 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 26 May 2026 18:06:55 +0200 Subject: [PATCH 42/64] remove unused code and fix wording --- dump_things_service/abstract_config.py | 35 -------------------------- dump_things_service/main.py | 5 ++-- 2 files changed, 3 insertions(+), 37 deletions(-) diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index dff0477..cc1f874 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -154,41 +154,6 @@ class Configuration(BaseModel): pid: str = dump_things_config_iri -class TokenConfig_v1(StrictModel): - user_id: str - collections: dict[str, TokenCollectionConfig] - hashed: bool = False - - -class RecordDirBackendConfig_v1(StrictModel): - model_config = ConfigDict(use_enum_values=True) - type: Literal['record_dir', 'record_dir+stl'] - - -class SQLiteBackendConfig_v1(StrictModel): - type: Literal['sqlite', 'sqlite+stl'] - schema: str - - -class CollectionConfig_v1(StrictModel): - default_token: str - curated: Path - incoming: Path | None = None - backend: RecordDirBackendConfig_v1 | SQLiteBackendConfig_v1 | None = None - auth_sources: list[ForgejoAuthSpec | ConfigAuthSpec] = [ConfigAuthSpec()] - submission_tags: TagSpec = TagSpec() - use_classes: list[str] = [] - ignore_classes: list[str] = [] - audit_backends: list[GitAuditBackendConfig] = [] - - -class ConfigurationFile_v1(StrictModel): - type: Literal['collections'] - version: Literal[1] - collections: dict[str, CollectionConfig_v1] - tokens: dict[str, TokenConfig_v1] - - mode_mapping = { 
TokenModes.READ_CURATED: TokenPermission(curated_read=True), TokenModes.READ_COLLECTION: TokenPermission( diff --git a/dump_things_service/main.py b/dump_things_service/main.py index c2456b3..7b3e62c 100644 --- a/dump_things_service/main.py +++ b/dump_things_service/main.py @@ -11,7 +11,6 @@ import yaml from dump_things_service.abstract_config import store_config from dump_things_service.commands.load_config import convert_to_new_format -from dump_things_service.exceptions import ConfigError from dump_things_service.manifest import manifest_configuration # Perform the patching before importing any third-party libraries from dump_things_service.patches import enabled # noqa F401 -- used by generated code @@ -116,7 +115,9 @@ parser.add_argument( default='', help='The sha256 hash of an initial admin token that will allow to add or ' 'remove tokens, collections, and additional admin tokens (64 ' - 'characters hex-digit)', + 'characters hex-digit). NOTE: an admin token in plaintext is read ' + 'from the environment variable `DTS_ADMIN_TOKEN` if it is set, and ' + 'if this option is not provided.', ) parser.add_argument( '-c', -- 2.52.0 From f9b65ea0c9282fdbaa342ae6aef2aa4eadbda2d1 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 26 May 2026 18:08:19 +0200 Subject: [PATCH 43/64] fix wording in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f8a95f2..b3e529f 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ The following command line parameters are supported: - `--admin_token-hash `: set an administrator token hash. This plaintext token can be used to create and delete collections, tokens, and admin tokens. This is useful to configure the service if no admin token was yet created. - NOTE: an admin token in plaintext is read from the environment variable `DTS_ADMIN_TOKEN` if the variable is set and this option is not provided. 
+ **NOTE**: an admin token in plaintext is read from the environment variable `DTS_ADMIN_TOKEN` if it is set and this option is not provided. ```bash -- 2.52.0 From 1a43b55081eb43fb6f1d6fef371de3bccddcf765 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 26 May 2026 18:17:08 +0200 Subject: [PATCH 44/64] add `dump-things-hash-token` description to README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index b3e529f..bc975c5 100644 --- a/README.md +++ b/README.md @@ -864,6 +864,9 @@ If any backend is a `record_dir+stl` backend, a schema has to be supplied via th - `dump-things-create-merged-schema`: this command creates a new schema that statically contains all schemas that the original schema imports. The new schema is fully self-contained and does not reference any other schemas. +- `dump-things-hash-token`: this command will generate a hash from a plain-text token that can be used with the `--admin-token-hash` option. + (one could also use the shell command `sha256sum` to generate the hash, but using `dump-things-hash-token` will ensure that the right hash algorithm is used) + ### Migrate to version 6 -- 2.52.0 From 9c92271aecbc0655d21a8929e9b442efa6e2a808 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 27 May 2026 09:55:42 +0200 Subject: [PATCH 45/64] improve configuration processing Add scanner error detection to configuration loading. Raise an error if the configuration YAML-file cannot be loaded. Detect and reject token definitions with write permissions and without incoming-labels or write access for collections that do not define an incoming directory. 
Rename `get_permissions` to `get_token_permissions` --- dump_things_service/abstract_config.py | 10 ++++++-- dump_things_service/auth/config.py | 4 ++-- dump_things_service/tests/test_config.py | 3 +-- dump_things_service/token_endpoints.py | 30 ++++++++++++++++++++---- 4 files changed, 37 insertions(+), 10 deletions(-) diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index cc1f874..8365320 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -17,6 +17,7 @@ from pydantic import ( ConfigDict, Field, ) +from yaml.scanner import ScannerError from dump_things_service import ( HTTP_404_NOT_FOUND, @@ -27,6 +28,7 @@ from dump_things_service.backends.record_dir import ( _RecordDirStore, RecordDirStore, ) +from dump_things_service.exceptions import ConfigError from dump_things_service.mapping_functions import ( MappingMethod, mapping_functions, @@ -183,7 +185,7 @@ mode_mapping = { } -def get_permissions(mode: str) -> TokenPermission: +def get_token_permissions(mode: str) -> TokenPermission: return mode_mapping[TokenModes(mode)] @@ -220,7 +222,11 @@ def read_config( if not g_abstract_configuration: config_backend, _ = get_config_backends(store_path) - record_info = config_backend.get_record_by_iri(dump_things_config_iri) + try: + record_info = config_backend.get_record_by_iri(dump_things_config_iri) + except ScannerError as sce: + msg = f'Configuration at {config_backend.root} not readable: {sce}' + raise ConfigError(msg) from sce g_abstract_configuration = ( Configuration(**(record_info.json_object)) if record_info diff --git a/dump_things_service/auth/config.py b/dump_things_service/auth/config.py index 916b2b4..8f8976e 100644 --- a/dump_things_service/auth/config.py +++ b/dump_things_service/auth/config.py @@ -7,7 +7,7 @@ from dump_things_service.auth import ( InvalidTokenError, ) from dump_things_service.abstract_config import ( - get_permissions, + get_token_permissions, 
get_token_config_for_representation_and_collection, ) @@ -38,7 +38,7 @@ class ConfigAuthenticationSource(AuthenticationSource): _, token_config, token_collection_config = result return AuthenticationInfo( - token_permission=get_permissions(token_collection_config.mode), + token_permission=get_token_permissions(token_collection_config.mode), user_id=token_config.user_id, incoming_label=token_collection_config.incoming_label, ) diff --git a/dump_things_service/tests/test_config.py b/dump_things_service/tests/test_config.py index fadbb33..5464941 100644 --- a/dump_things_service/tests/test_config.py +++ b/dump_things_service/tests/test_config.py @@ -98,8 +98,7 @@ tokens: collections: collection_1: mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - """, + incoming_label: incoming_anonymous""", Loader=yaml.SafeLoader, ) ) diff --git a/dump_things_service/token_endpoints.py b/dump_things_service/token_endpoints.py index b192e6a..313d62b 100644 --- a/dump_things_service/token_endpoints.py +++ b/dump_things_service/token_endpoints.py @@ -1,7 +1,6 @@ import logging import random import re -import sys from urllib.parse import quote from fastapi import ( @@ -20,12 +19,14 @@ from dump_things_service import ( from dump_things_service.abstract_config import ( AdminTokenConfig, StrictModel, + TokenCollectionConfig, TokenConfig, get_config, get_token_info_by_representation, + get_token_permissions, hash_token_representation, read_config, - store_config, TokenCollectionConfig, + store_config, ) from dump_things_service.admin import authenticate_admin from dump_things_service.api_key import api_key_header_scheme @@ -93,8 +94,29 @@ async def create_token( detail = f"No such collection: '{collection_name}'." 
raise HTTPException(status_code=HTTP_404_NOT_FOUND, detail=detail) - print(f'IMPLEMENT: check incoming label ({token_collection_info.incoming_label}), check mode ({token_collection_info.mode})', file=sys.stderr, flush=True) - # TODO: check mode(!), check incoming_label(?) + # Check that incoming areas are defined if the token allows writing. + token_permissions = get_token_permissions(token_collection_info.mode) + if token_permissions.incoming_write or token_permissions.zones_access: + + # Check for incoming definition in collection config + collection_info = abstract_config.collections[collection_name] + if not collection_info.incoming: + detail = ( + f"Cannot add token with write access to collection " + f"without `incoming` directory: '{collection_name}'" + ) + raise HTTPException( + status_code=HTTP_406_NOT_ACCEPTABLE, + detail=detail, + ) + + # Check for incoming label in token definition for the collection + if not token_collection_info.incoming_label: + detail = f"Incoming label missing for collection '{collection_name}'" + raise HTTPException( + status_code=HTTP_406_NOT_ACCEPTABLE, + detail=detail, + ) if body.representation: # We have a specific representation, check that it is not already used -- 2.52.0 From f231f190fdbf531e631f0f9a1fb47037fe6c4795 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 27 May 2026 10:54:20 +0200 Subject: [PATCH 46/64] add test for config file error detection Test that errors in the format of the configuration file are caught and converted to `ConfigError`. 
--- dump_things_service/tests/test_config.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/dump_things_service/tests/test_config.py b/dump_things_service/tests/test_config.py index 5464941..76bbe31 100644 --- a/dump_things_service/tests/test_config.py +++ b/dump_things_service/tests/test_config.py @@ -1,3 +1,4 @@ +import hashlib from pathlib import PurePosixPath import pytest @@ -7,7 +8,11 @@ from pydantic import ValidationError from yaml.scanner import ScannerError from dump_things_service import HTTP_409_CONFLICT -from dump_things_service.abstract_config import dump_things_private_collection_name +from dump_things_service.abstract_config import ( + config_backend_path, + dump_things_private_collection_name, dump_things_config_iri, read_config, + get_config_backends, +) from dump_things_service.collection_endpoints import CollectionRequest from dump_things_service.exceptions import ConfigError @@ -63,6 +68,23 @@ def test_collection_dir_reuse_detection(fastapi_client_simple): assert response.status_code == HTTP_409_CONFLICT +def test_scanner_error_detection(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp('config_scanner_test') + + config_backend, audit_backend = get_config_backends(tmp_path) + config_backend.add_record( + iri=dump_things_config_iri, + class_name='DumpThingsConfig', + json_object={'pid': dump_things_config_iri} + ) + + md5_hexdigest = hashlib.md5(dump_things_config_iri.encode()).hexdigest() + config_file_path = config_backend.root / 'DumpThingsConfig' / f'{md5_hexdigest}.yaml' + config_file_path.write_text('collections: ::: -\n sdsdfsdf: xxx') + with pytest.raises(ConfigError): + x = read_config(tmp_path) + + def xxx_test_scanner_error_detection(tmp_path): config_file_path = tmp_path / 'config.yaml' config_file_path.write_text('type: col: le\n:xxx:') -- 2.52.0 From 753fe6a9faa34ffc6ca540ea44f378a7ae9686f7 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 27 May 2026 12:15:56 +0200 Subject: 
[PATCH 47/64] handle configuration file structure failures Do not allow unknown keys in configuration files. Add a test for structure error detection. --- dump_things_service/abstract_config.py | 18 +++++++++++------- dump_things_service/tests/test_config.py | 24 ++++++++++++++++-------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index 8365320..f7e0f25 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -15,7 +15,7 @@ from fastapi import HTTPException from pydantic import ( BaseModel, ConfigDict, - Field, + Field, ValidationError, ) from yaml.scanner import ScannerError @@ -149,7 +149,7 @@ class AdminTokenConfig(StrictModel): representation: str -class Configuration(BaseModel): +class Configuration(StrictModel): collections: dict[str, CollectionConfig] = {} tokens: dict[str, TokenConfig] = {} admin_tokens: dict[str, AdminTokenConfig] = {} @@ -227,11 +227,15 @@ def read_config( except ScannerError as sce: msg = f'Configuration at {config_backend.root} not readable: {sce}' raise ConfigError(msg) from sce - g_abstract_configuration = ( - Configuration(**(record_info.json_object)) - if record_info - else Configuration() - ) + try: + g_abstract_configuration = ( + Configuration(**(record_info.json_object)) + if record_info + else Configuration() + ) + except ValidationError as ve: + msg = f'Faulty configuration at {config_backend.root}: {ve}' + raise ConfigError(msg) from ve return g_abstract_configuration diff --git a/dump_things_service/tests/test_config.py b/dump_things_service/tests/test_config.py index 76bbe31..ca59fba 100644 --- a/dump_things_service/tests/test_config.py +++ b/dump_things_service/tests/test_config.py @@ -82,16 +82,24 @@ def test_scanner_error_detection(tmp_path_factory): config_file_path = config_backend.root / 'DumpThingsConfig' / f'{md5_hexdigest}.yaml' config_file_path.write_text('collections: ::: 
-\n sdsdfsdf: xxx') with pytest.raises(ConfigError): - x = read_config(tmp_path) + read_config(tmp_path) -def xxx_test_scanner_error_detection(tmp_path): - config_file_path = tmp_path / 'config.yaml' - config_file_path.write_text('type: col: le\n:xxx:') - global_dict = {} - with pytest.raises(ConfigError) as e: - process_config(tmp_path, config_file_path, [], global_dict) - assert isinstance(e.value.__cause__, ScannerError) +def test_structure_error_detection(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp('config_scanner_test') + + config_backend, audit_backend = get_config_backends(tmp_path) + config_backend.add_record( + iri=dump_things_config_iri, + class_name='DumpThingsConfig', + json_object={'pid': dump_things_config_iri} + ) + + md5_hexdigest = hashlib.md5(dump_things_config_iri.encode()).hexdigest() + config_file_path = config_backend.root / 'DumpThingsConfig' / f'{md5_hexdigest}.yaml' + config_file_path.write_text('type: 1\n') + with pytest.raises(ConfigError): + read_config(tmp_path) def xxx_test_structure_error_detection(tmp_path): -- 2.52.0 From a753538b4cad7e48e0fdd34a9bbd25ae220f833d Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 28 May 2026 09:22:55 +0200 Subject: [PATCH 48/64] improve incoming-path verification Verify that incoming paths exist when a collection is created and a token has write-access to the collection. Check that incoming-label is set for the token. Add tests for incoming path validation.
--- dump_things_service/__init__.py | 2 + dump_things_service/collection_endpoints.py | 38 +++++- dump_things_service/tests/test_config.py | 140 ++++++++++++++------ dump_things_service/token_endpoints.py | 2 +- 4 files changed, 137 insertions(+), 45 deletions(-) diff --git a/dump_things_service/__init__.py b/dump_things_service/__init__.py index 4bb4a39..9bb65a0 100644 --- a/dump_things_service/__init__.py +++ b/dump_things_service/__init__.py @@ -12,6 +12,7 @@ from starlette.status import ( HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND, + HTTP_406_NOT_ACCEPTABLE, HTTP_409_CONFLICT, HTTP_500_INTERNAL_SERVER_ERROR, HTTP_503_SERVICE_UNAVAILABLE, @@ -32,6 +33,7 @@ __all__ = [ 'HTTP_401_UNAUTHORIZED', 'HTTP_403_FORBIDDEN', 'HTTP_404_NOT_FOUND', + 'HTTP_406_NOT_ACCEPTABLE', 'HTTP_409_CONFLICT', 'HTTP_413_CONTENT_TOO_LARGE', 'HTTP_422_UNPROCESSABLE_CONTENT', diff --git a/dump_things_service/collection_endpoints.py b/dump_things_service/collection_endpoints.py index 15954fd..6ae493d 100644 --- a/dump_things_service/collection_endpoints.py +++ b/dump_things_service/collection_endpoints.py @@ -17,6 +17,7 @@ from pydantic import BaseModel from dump_things_service import ( HTTP_201_CREATED, HTTP_404_NOT_FOUND, + HTTP_406_NOT_ACCEPTABLE, HTTP_409_CONFLICT, reserved_collection_names, ) @@ -25,7 +26,7 @@ from dump_things_service.abstract_config import ( CollectionConfig, StrictModel, store_config, - get_config, + get_config, get_token_permissions, ) from dump_things_service.admin import authenticate_admin from dump_things_service.api_key import api_key_header_scheme @@ -99,11 +100,15 @@ async def create_collection( # Check for distinct directories for directory in (body.incoming, body.curated): - ensure_unique_directory( - abstract_config, - instance_state, - directory, - ) + if directory: + ensure_unique_directory( + abstract_config, + instance_state, + directory, + ) + + # Check for incoming directory if any of the tokens allows writing + 
validate_incoming_paths(abstract_config, body) # Update the abstract configuration abstract_config.collections[body.name] = body @@ -212,3 +217,24 @@ def ensure_unique_directory( status_code=HTTP_409_CONFLICT, detail=f"Directory '{collection_dir}' already used by collection '{collection_name}'.", ) + + +def validate_incoming_paths( + abstract_config: Configuration, + collection_request: CollectionRequest, +): + for token_name, token_info in abstract_config.tokens.items(): + token_collection_info = token_info.collections.get(collection_request.name) + if token_collection_info: + token_permissions = get_token_permissions(token_collection_info.mode) + if token_permissions.incoming_write or token_permissions.zones_access: + if not collection_request.incoming: + detail = ( + f"Cannot add collection '{collection_request.name}' without " + f"`incoming` path, because at least token '{token_name}' " + f" has write access to the collection" + ) + raise HTTPException( + status_code=HTTP_406_NOT_ACCEPTABLE, + detail=detail, + ) diff --git a/dump_things_service/tests/test_config.py b/dump_things_service/tests/test_config.py index ca59fba..b972070 100644 --- a/dump_things_service/tests/test_config.py +++ b/dump_things_service/tests/test_config.py @@ -4,19 +4,25 @@ from pathlib import PurePosixPath import pytest import yaml from json_flattener import GlobalConfig -from pydantic import ValidationError -from yaml.scanner import ScannerError +from starlette.status import HTTP_406_NOT_ACCEPTABLE -from dump_things_service import HTTP_409_CONFLICT +from dump_things_service import ( + HTTP_200_OK, + HTTP_201_CREATED, + HTTP_409_CONFLICT, +) from dump_things_service.abstract_config import ( - config_backend_path, - dump_things_private_collection_name, dump_things_config_iri, read_config, + TokenCollectionConfig, + TokenModes, + dump_things_config_iri, + dump_things_private_collection_name, get_config_backends, + read_config, ) from dump_things_service.collection_endpoints import 
CollectionRequest from dump_things_service.exceptions import ConfigError - -from . import schema_file +from dump_things_service.tests import schema_file +from dump_things_service.token_endpoints import TokenRequest collection_request_pattern = CollectionRequest( @@ -102,40 +108,98 @@ def test_structure_error_detection(tmp_path_factory): read_config(tmp_path) -def xxx_test_structure_error_detection(tmp_path): - config_file_path = tmp_path / 'config.yaml' - config_file_path.write_text('type: colle\n') - global_dict = {} - with pytest.raises(ConfigError) as e: - process_config(tmp_path, config_file_path, [], global_dict) - assert isinstance(e.value.__cause__, ValidationError) +def test_missing_incoming_detection(fastapi_client_simple): + test_client, _ = fastapi_client_simple - -def xxx_test_missing_incoming_detection(tmp_path): - config_object = GlobalConfig( - **yaml.load( - """ -type: collections -version: 1 -collections: - collection_1: - default_token: basic_access - curated: curated/collection_1 - -tokens: - basic_access: - user_id: anonymous - collections: - collection_1: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous""", - Loader=yaml.SafeLoader, - ) + # Add a collection without incoming + collection_request = CollectionRequest( + name='missing_incoming_detection_test', + default_token='Test XXXXX (CURATOR)', + curated=PurePosixPath('missing_incoming_detection'), + schema=str(schema_file), ) - global_dict = {} - with pytest.raises(ConfigError): - process_config_object(tmp_path, config_object, [], global_dict) + response = test_client.post( + '/collections', + json=collection_request.model_dump(mode='json', by_alias=True), + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_201_CREATED + + # Add a write token that references the collection, expect this to + # fail because the collection does not contain an incoming path + token_request = TokenRequest( + name='missing-incoming-token', + 
user_id='missing_incoming_user', + collections={ + 'missing_incoming_detection_test': TokenCollectionConfig( + mode=TokenModes.CURATOR, + incoming_label='', + ) + } + ) + + # Check that a write token for a collection without incoming path cannot + # be created. + response = test_client.post( + '/tokens', + json=token_request.model_dump(mode='json', by_alias=True), + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_406_NOT_ACCEPTABLE + + # Remove the collection without incoming path + response = test_client.delete( + '/collections/missing_incoming_detection_test', + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_200_OK + + # Add a collection with incoming path + collection_request.incoming = PurePosixPath('missing_incoming_detection_test_incoming') + response = test_client.post( + '/collections', + json=collection_request.model_dump(mode='json', by_alias=True), + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_201_CREATED + + # Check that a write token for a collection with an incoming path but a + # missing label cannot be created. 
+ response = test_client.post( + '/tokens', + json=token_request.model_dump(mode='json', by_alias=True), + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_406_NOT_ACCEPTABLE + + # Check that a write token for a collection with an incoming path can be created + token_request.collections['missing_incoming_detection_test'] = TokenCollectionConfig( + mode=TokenModes.CURATOR, + incoming_label='test_incoming_label', + ) + response = test_client.post( + '/tokens', + json=token_request.model_dump(mode='json', by_alias=True), + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_201_CREATED + + # Remove the collection with the incoming path + response = test_client.delete( + '/collections/missing_incoming_detection_test', + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_200_OK + + # Check that a creation attempt for the collection without incoming path fails + collection_request.incoming = None + response = test_client.post( + '/collections', + json=collection_request.model_dump(mode='json', by_alias=True), + headers={'x-dumpthings-token': 'admin-1'}, + ) + assert response.status_code == HTTP_406_NOT_ACCEPTABLE def xxx_test_submission_tags_handling(dump_stores_simple): diff --git a/dump_things_service/token_endpoints.py b/dump_things_service/token_endpoints.py index 313d62b..85f308b 100644 --- a/dump_things_service/token_endpoints.py +++ b/dump_things_service/token_endpoints.py @@ -103,7 +103,7 @@ async def create_token( if not collection_info.incoming: detail = ( f"Cannot add token with write access to collection " - f"without `incoming` directory: '{collection_name}'" + f"'{collection_name}' without `incoming`." 
) raise HTTPException( status_code=HTTP_406_NOT_ACCEPTABLE, -- 2.52.0 From 1747bf987ce9c703ba4978ac12d1611949482ee6 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 28 May 2026 09:38:47 +0200 Subject: [PATCH 49/64] add enforce_reload-option to abstract config loading --- dump_things_service/abstract_config.py | 3 ++- dump_things_service/tests/test_collection_administration.py | 1 + dump_things_service/tests/test_config.py | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index f7e0f25..bcd5058 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -217,10 +217,11 @@ def get_config_backends( def read_config( store_path: Path, + force_reload: bool = False, ) -> Configuration: global g_abstract_configuration - if not g_abstract_configuration: + if not g_abstract_configuration or force_reload: config_backend, _ = get_config_backends(store_path) try: record_info = config_backend.get_record_by_iri(dump_things_config_iri) diff --git a/dump_things_service/tests/test_collection_administration.py b/dump_things_service/tests/test_collection_administration.py index 53682db..e49509f 100644 --- a/dump_things_service/tests/test_collection_administration.py +++ b/dump_things_service/tests/test_collection_administration.py @@ -49,6 +49,7 @@ new_token_request = TokenRequest( collections={ new_collection_name: TokenCollectionConfig( mode=TokenModes.WRITE_COLLECTION, + incoming_label=f'{new_collection_name}_label', ) }, ) diff --git a/dump_things_service/tests/test_config.py b/dump_things_service/tests/test_config.py index b972070..278e9d2 100644 --- a/dump_things_service/tests/test_config.py +++ b/dump_things_service/tests/test_config.py @@ -88,7 +88,7 @@ def test_scanner_error_detection(tmp_path_factory): config_file_path = config_backend.root / 'DumpThingsConfig' / f'{md5_hexdigest}.yaml' config_file_path.write_text('collections: 
::: -\n sdsdfsdf: xxx') with pytest.raises(ConfigError): - read_config(tmp_path) + read_config(tmp_path, force_reload=True) def test_structure_error_detection(tmp_path_factory): @@ -105,7 +105,7 @@ def test_structure_error_detection(tmp_path_factory): config_file_path = config_backend.root / 'DumpThingsConfig' / f'{md5_hexdigest}.yaml' config_file_path.write_text('type: 1\n') with pytest.raises(ConfigError): - read_config(tmp_path) + read_config(tmp_path, force_reload=True) def test_missing_incoming_detection(fastapi_client_simple): -- 2.52.0 From 2053997b53edf50e2a0e81e9134becda9b929f4c Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 10 Jun 2026 16:02:14 +0200 Subject: [PATCH 50/64] return admin_token in test fixture The test-fixture `fastapi_client_simple` now returns a tuple containing: - test_client instance - store path - admin token --- dump_things_service/__about__.py | 2 +- dump_things_service/tests/fixtures.py | 12 ++++--- dump_things_service/tests/test_basic.py | 32 +++++++++---------- .../tests/test_collection_administration.py | 26 +++++++-------- dump_things_service/tests/test_config.py | 26 +++++++-------- dump_things_service/tests/test_curated.py | 14 ++++---- .../tests/test_extract_inline.py | 4 +-- dump_things_service/tests/test_incoming.py | 14 ++++---- dump_things_service/tests/test_mapping.py | 2 +- dump_things_service/tests/test_modes.py | 2 +- .../tests/test_pid_resolution.py | 2 +- dump_things_service/tests/test_roundtrip.py | 4 +-- .../tests/test_roundtrip_flatsocial.py | 4 +-- .../tests/test_token_endpoints.py | 8 ++--- dump_things_service/tests/test_validate.py | 2 +- .../tests/test_web_interface.py | 6 ++-- 16 files changed, 81 insertions(+), 79 deletions(-) diff --git a/dump_things_service/__about__.py b/dump_things_service/__about__.py index 4ab72f5..72def07 100644 --- a/dump_things_service/__about__.py +++ b/dump_things_service/__about__.py @@ -1 +1 @@ -__version__ = '6.0.0b2' +__version__ = '6.0.0b3' diff --git
a/dump_things_service/tests/fixtures.py b/dump_things_service/tests/fixtures.py index 1604562..e4305f2 100644 --- a/dump_things_service/tests/fixtures.py +++ b/dump_things_service/tests/fixtures.py @@ -341,16 +341,17 @@ def dump_stores_simple(tmp_path_factory): def fastapi_app_simple(dump_stores_simple): tmp_path, audit_tmp_path = dump_stores_simple + admin_token = 'admin-1' old_sys_argv = sys.argv sys.argv = [ 'test-runner', - '--admin-token-hash', hash_token_representation('admin-1'), + '--admin-token-hash', hash_token_representation(admin_token), str(tmp_path), ] from dump_things_service.main import app sys.argv = old_sys_argv - return app, tmp_path, audit_tmp_path + return app, tmp_path, audit_tmp_path, admin_token @pytest.fixture(scope='session') @@ -360,6 +361,7 @@ def fastapi_client_simple(fastapi_app_simple): test_client = TestClient(fastapi_app_simple[0]) store_path = fastapi_app_simple[1] audit_path = fastapi_app_simple[2] + admin_token = fastapi_app_simple[3] # Add an audit backend to the first collection in g_default_collections assert g_default_collections[0].name == 'collection_1' @@ -380,7 +382,7 @@ def fastapi_client_simple(fastapi_app_simple): mode='json', by_alias=True, ), - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == 201 @@ -389,7 +391,7 @@ def fastapi_client_simple(fastapi_app_simple): response = test_client.post( '/tokens', json=token_config.model_dump(exclude_unset=True, mode='json'), - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == 201 @@ -413,7 +415,7 @@ def fastapi_client_simple(fastapi_app_simple): pydantic_module, g_default_entries[collection_config.name], ) - return test_client, store_path + return test_client, store_path, admin_token def add_records_to_backend( diff --git a/dump_things_service/tests/test_basic.py b/dump_things_service/tests/test_basic.py index 186582c..b884334 100644 
--- a/dump_things_service/tests/test_basic.py +++ b/dump_things_service/tests/test_basic.py @@ -40,7 +40,7 @@ unicode_record = { def test_search_by_pid(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 9): response = test_client.get( f'/collection_{i}/record?pid={pid}', @@ -55,7 +55,7 @@ def test_search_by_pid(fastapi_client_simple): def test_get_all(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 9): response = test_client.get( f'/collection_{i}/records/', @@ -73,7 +73,7 @@ def test_get_all(fastapi_client_simple): def test_delete(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.post( '/collection_1/record/Person', @@ -112,7 +112,7 @@ def test_delete(fastapi_client_simple): def test_hashed_token(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( f'/collection_1/record?pid={pid}', headers={'x-dumpthings-token': 'token-hashed'}, @@ -132,7 +132,7 @@ def test_hashed_token(fastapi_client_simple): def test_search_by_class(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 9): response = test_client.get( f'/collection_{i}/records/Thing', @@ -170,7 +170,7 @@ def test_search_by_class(fastapi_client_simple): def test_search_by_pid_no_token(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 9): response = test_client.get( f'/collection_{i}/record?pid={pid}', @@ -184,7 +184,7 @@ def test_search_by_pid_no_token(fastapi_client_simple): def test_store_record(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple # Store a record in two collections for i, 
token in basic_write_locations: @@ -252,7 +252,7 @@ def test_store_record(fastapi_client_simple): def test_encoding(fastapi_client_simple): - test_client, store_path = fastapi_client_simple + test_client, store_path, _ = fastapi_client_simple # Store a record with non-ASCII characters in collections via the API. that # will trigger the YAML-dumping, which should be checked @@ -272,7 +272,7 @@ def test_encoding(fastapi_client_simple): def test_global_store_write_fails(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 9): # Since we provide no token, the default token will be used. This will # only allow reading from curated, not posting. @@ -304,7 +304,7 @@ def test_token_store_adding(fastapi_client_simple): def test_funky_pid(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple record_pid = 'dlflatsocial:contributors/someone' for i, token in basic_write_locations: response = test_client.post( @@ -324,7 +324,7 @@ def test_funky_pid(fastapi_client_simple): def test_token_store_priority(fastapi_client_simple): - test_client, store_dir = fastapi_client_simple + test_client, store_dir, _ = fastapi_client_simple # Post a record with the same pid as the global store's test record, but # with different content. 
@@ -353,7 +353,7 @@ def test_token_store_priority(fastapi_client_simple): def test_unknown_token(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple # Check that fetching with an unknown token is handled gracefully response = test_client.get( @@ -372,7 +372,7 @@ def test_unknown_token(fastapi_client_simple): def test_curie_expansion(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple # Check that the pid is expanded correctly response = test_client.get( @@ -387,7 +387,7 @@ def test_curie_expansion(fastapi_client_simple): def test_server(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( '/server', @@ -431,7 +431,7 @@ def test_server(fastapi_client_simple): def test_ignore_classes(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for class_name in ('Organization', 'Project'): response = test_client.post( @@ -449,7 +449,7 @@ def test_ignore_classes(fastapi_client_simple): def test_maintenance(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple # Ensure that only curators can put a collection in maintenance mode response = test_client.post( diff --git a/dump_things_service/tests/test_collection_administration.py b/dump_things_service/tests/test_collection_administration.py index e49509f..6e5ada4 100644 --- a/dump_things_service/tests/test_collection_administration.py +++ b/dump_things_service/tests/test_collection_administration.py @@ -75,12 +75,12 @@ def _name_in_openapi_paths( def test_collection_adding(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, admin_token = fastapi_client_simple # Check that the collection does not yet exist response = test_client.get( f'/collections/{new_collection_name}', - 
headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_404_NOT_FOUND assert not _name_in_openapi_paths(test_client, new_collection_name) @@ -88,7 +88,7 @@ def test_collection_adding(fastapi_client_simple): # Add a new collection response = test_client.post( '/collections', - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, json=new_collection_request.model_dump(mode='json', by_alias=True), ) assert response.status_code == HTTP_201_CREATED @@ -96,7 +96,7 @@ def test_collection_adding(fastapi_client_simple): response = test_client.get( f'/collections/{new_collection_name}', - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_200_OK new_collection_config = new_collection_request.model_dump(mode='json', by_alias=True) @@ -106,7 +106,7 @@ def test_collection_adding(fastapi_client_simple): # Add a token to the collection response = test_client.post( '/tokens', - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, json=new_token_request.model_dump(mode='json'), ) assert response.status_code == HTTP_201_CREATED @@ -114,7 +114,7 @@ def test_collection_adding(fastapi_client_simple): # Read the token back response = test_client.get( f'/tokens/{new_token_name}', - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_200_OK assert response.json() == { @@ -148,7 +148,7 @@ def test_collection_adding(fastapi_client_simple): # Remove the token response = test_client.delete( f'/tokens/{new_token_name}', - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_200_OK @@ -163,7 +163,7 @@ def test_collection_adding(fastapi_client_simple): # Remove the collection response = test_client.delete( 
f'/collections/{new_collection_name}', - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_200_OK @@ -179,12 +179,12 @@ def test_collection_adding(fastapi_client_simple): def test_collection_reading(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, admin_token = fastapi_client_simple # Check that the new admin token is not yet working response = test_client.get( f'/collections', - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_200_OK response_object = response.json() @@ -193,7 +193,7 @@ def test_collection_reading(fastapi_client_simple): def test_admin_token_management(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, admin_token = fastapi_client_simple # Check that the new admin token is not yet working response = test_client.get( @@ -205,7 +205,7 @@ def test_admin_token_management(fastapi_client_simple): # Add a new admin token response = test_client.post( '/admin_tokens', - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, json=new_admin_token_request.model_dump(mode='json'), ) assert response.status_code == HTTP_201_CREATED @@ -234,7 +234,7 @@ def test_admin_token_management(fastapi_client_simple): response = test_client.get( f'/admin_tokens', - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_200_OK assert new_admin_token_name not in response.json() diff --git a/dump_things_service/tests/test_config.py b/dump_things_service/tests/test_config.py index 278e9d2..ef55e2d 100644 --- a/dump_things_service/tests/test_config.py +++ b/dump_things_service/tests/test_config.py @@ -35,7 +35,7 @@ collection_request_pattern = CollectionRequest( def test_illegal_collection_name_detection(fastapi_client_simple): - test_client, _ = 
fastapi_client_simple + test_client, _, admin_token = fastapi_client_simple for name in ( 'collections', @@ -49,13 +49,13 @@ def test_illegal_collection_name_detection(fastapi_client_simple): **collection_request_pattern.model_dump(mode='json', by_alias=True), 'name': name, }, - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_409_CONFLICT def test_collection_dir_reuse_detection(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, admin_token = fastapi_client_simple for curated_path, incoming_path in ( ('curated/collection_1', 'incoming/XXXX'), @@ -69,7 +69,7 @@ def test_collection_dir_reuse_detection(fastapi_client_simple): 'curated': curated_path, 'incoming': incoming_path, }, - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_409_CONFLICT @@ -109,7 +109,7 @@ def test_structure_error_detection(tmp_path_factory): def test_missing_incoming_detection(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, admin_token = fastapi_client_simple # Add a collection without incoming collection_request = CollectionRequest( @@ -122,7 +122,7 @@ def test_missing_incoming_detection(fastapi_client_simple): response = test_client.post( '/collections', json=collection_request.model_dump(mode='json', by_alias=True), - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_201_CREATED @@ -144,14 +144,14 @@ def test_missing_incoming_detection(fastapi_client_simple): response = test_client.post( '/tokens', json=token_request.model_dump(mode='json', by_alias=True), - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_406_NOT_ACCEPTABLE # Remove the collection without incoming path response = test_client.delete( 
'/collections/missing_incoming_detection_test', - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_200_OK @@ -160,7 +160,7 @@ def test_missing_incoming_detection(fastapi_client_simple): response = test_client.post( '/collections', json=collection_request.model_dump(mode='json', by_alias=True), - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_201_CREATED @@ -169,7 +169,7 @@ def test_missing_incoming_detection(fastapi_client_simple): response = test_client.post( '/tokens', json=token_request.model_dump(mode='json', by_alias=True), - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_406_NOT_ACCEPTABLE @@ -181,14 +181,14 @@ def test_missing_incoming_detection(fastapi_client_simple): response = test_client.post( '/tokens', json=token_request.model_dump(mode='json', by_alias=True), - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_201_CREATED # Remove the collection with the incoming path response = test_client.delete( '/collections/missing_incoming_detection_test', - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_200_OK @@ -197,7 +197,7 @@ def test_missing_incoming_detection(fastapi_client_simple): response = test_client.post( '/collections', json=collection_request.model_dump(mode='json', by_alias=True), - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, ) assert response.status_code == HTTP_406_NOT_ACCEPTABLE diff --git a/dump_things_service/tests/test_curated.py b/dump_things_service/tests/test_curated.py index 5944a3b..47737a8 100644 --- a/dump_things_service/tests/test_curated.py +++ b/dump_things_service/tests/test_curated.py @@ 
-26,7 +26,7 @@ def test_read_curated_records( paginate, class_name, ): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( f'/collection_1/curated/records/{paginate}{class_name}', @@ -41,7 +41,7 @@ def test_read_curated_records( assert len(json_object) == 3 for pattern, count in (('%25wolf%25', 1), ('%25cura%25', 2)): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( f'/collection_8/curated/records/{paginate}{class_name}?matching={pattern}', headers={'x-dumpthings-token': 'token_1_xxxxx'}, @@ -59,7 +59,7 @@ pytest.mark.parametrize( ('abc:mode_test', 'abc:some_timee@x.com', 'abc:curated'), ) def test_read_curated_records_by_pid(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( '/no_such_collection/curated/records/', @@ -69,7 +69,7 @@ def test_read_curated_records_by_pid(fastapi_client_simple): def test_unknown_collection(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( '/no_such_collection/curated/records/', @@ -79,7 +79,7 @@ def test_unknown_collection(fastapi_client_simple): def test_curated_delete(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.post( '/collection_8/curated/record/Person', @@ -117,7 +117,7 @@ def test_curated_delete(fastapi_client_simple): def test_audit_backend(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple record_id = 'abc:audit-trailed' names = 'Frederick', 'Johny' @@ -153,7 +153,7 @@ def test_audit_backend(fastapi_client_simple): def test_audit_backend_auto_flush(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple record_id = 'abc:audit-trailed' names 
= 'Robert', 'Anton' diff --git a/dump_things_service/tests/test_extract_inline.py b/dump_things_service/tests/test_extract_inline.py index 2f9a128..6cb09b4 100644 --- a/dump_things_service/tests/test_extract_inline.py +++ b/dump_things_service/tests/test_extract_inline.py @@ -212,7 +212,7 @@ def test_dont_extract_empty_things_locally(): # relations @pytest.mark.xfail def test_inline_extraction_on_service(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 3): # Deposit JSON record @@ -257,7 +257,7 @@ def test_inline_extraction_on_service(fastapi_client_simple): # relations @pytest.mark.xfail def test_inline_ttl_processing(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 3): # Deposit TTL records diff --git a/dump_things_service/tests/test_incoming.py b/dump_things_service/tests/test_incoming.py index 68b3fb0..15b27a4 100644 --- a/dump_things_service/tests/test_incoming.py +++ b/dump_things_service/tests/test_incoming.py @@ -17,7 +17,7 @@ delete_record = { def test_incoming_labels(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 9): response = test_client.get( @@ -67,7 +67,7 @@ def test_read_incoming_records( paginate: str, class_name: str, ): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple fill_zones(test_client) @@ -120,7 +120,7 @@ pytest.mark.parametrize( ('abc:mode_test', 'abc:some_timee@x.com', 'abc:curated'), ) def test_read_incoming_records_by_pid(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( '/no_such_collection/curated/records/', @@ -130,7 +130,7 @@ def test_read_incoming_records_by_pid(fastapi_client_simple): def test_incoming_unknown_collection(fastapi_client_simple): - test_client, _ = 
fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( '/no_such_collection/incoming/no_such_label/records/', @@ -140,7 +140,7 @@ def test_incoming_unknown_collection(fastapi_client_simple): def test_incoming_unknown_label(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( '/collection_1/incoming/no_such_label/records/', @@ -150,7 +150,7 @@ def test_incoming_unknown_label(fastapi_client_simple): def test_incoming_delete(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.post( '/collection_7/incoming/admin_common/record/Person', @@ -188,7 +188,7 @@ def test_incoming_delete(fastapi_client_simple): def test_incoming_on_disk_only(fastapi_client_simple): - test_client, data_root = fastapi_client_simple + test_client, data_root, _ = fastapi_client_simple # add a random directory to the incoming area of collection_1 random_part = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=6)) diff --git a/dump_things_service/tests/test_mapping.py b/dump_things_service/tests/test_mapping.py index dae22d9..417d3a3 100644 --- a/dump_things_service/tests/test_mapping.py +++ b/dump_things_service/tests/test_mapping.py @@ -19,7 +19,7 @@ record_b = { def test_mapping_functions_ignore_data(fastapi_client_simple): - test_client, store_path = fastapi_client_simple + test_client, store_path, _ = fastapi_client_simple for i, token in basic_write_locations: response = test_client.post( diff --git a/dump_things_service/tests/test_modes.py b/dump_things_service/tests/test_modes.py index 22805df..2b22206 100644 --- a/dump_things_service/tests/test_modes.py +++ b/dump_things_service/tests/test_modes.py @@ -50,7 +50,7 @@ def verify_modes( def test_token_modes(fastapi_client_simple): - test_client, store_dir = fastapi_client_simple + test_client, store_dir, _ = fastapi_client_simple # Post a 
record to incoming of collections `collection_1`. We use it to # validate read/write permissions on class-base diff --git a/dump_things_service/tests/test_pid_resolution.py b/dump_things_service/tests/test_pid_resolution.py index b6cb49a..d9caa0d 100644 --- a/dump_things_service/tests/test_pid_resolution.py +++ b/dump_things_service/tests/test_pid_resolution.py @@ -6,7 +6,7 @@ from .. import HTTP_422_UNPROCESSABLE_CONTENT @pytest.mark.parametrize('pid', ['unknown_prefix:test_pid', 'abc:test_öö_pid']) @pytest.mark.parametrize('url_part', ['', 'curated/', 'incoming/in_token_1/']) def test_store_record_validation(fastapi_client_simple, pid, url_part): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple # Store a record in two collections response = test_client.post( diff --git a/dump_things_service/tests/test_roundtrip.py b/dump_things_service/tests/test_roundtrip.py index bc54a46..175d154 100644 --- a/dump_things_service/tests/test_roundtrip.py +++ b/dump_things_service/tests/test_roundtrip.py @@ -54,7 +54,7 @@ new_json_pid = 'xyz:HenryBaites' def test_json_ttl_json(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple # Deposit JSON records response = test_client.post( @@ -96,7 +96,7 @@ def test_json_ttl_json(fastapi_client_simple): @freezegun.freeze_time('1970-01-01') def test_ttl_json_ttl(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple # Deposit a ttl record response = test_client.post( diff --git a/dump_things_service/tests/test_roundtrip_flatsocial.py b/dump_things_service/tests/test_roundtrip_flatsocial.py index 9fc4df9..8c7f8d9 100644 --- a/dump_things_service/tests/test_roundtrip_flatsocial.py +++ b/dump_things_service/tests/test_roundtrip_flatsocial.py @@ -73,7 +73,7 @@ new_json_pid = 'dlflatsocial:another_john_ttl' def test_json_ttl_json_dlflatsocial(fastapi_client_simple): - test_client, _ = 
fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 3): # Deposit JSON records @@ -116,7 +116,7 @@ def test_json_ttl_json_dlflatsocial(fastapi_client_simple): @freezegun.freeze_time('1970-01-01') def test_ttl_json_ttl_dlflatsocial(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 3): # Deposit a ttl record diff --git a/dump_things_service/tests/test_token_endpoints.py b/dump_things_service/tests/test_token_endpoints.py index 35bebbd..0209eb3 100644 --- a/dump_things_service/tests/test_token_endpoints.py +++ b/dump_things_service/tests/test_token_endpoints.py @@ -4,7 +4,7 @@ from dump_things_service import HTTP_201_CREATED def test_token_creation(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, admin_token = fastapi_client_simple json_record = { 'name': 'a', @@ -21,7 +21,7 @@ def test_token_creation(fastapi_client_simple): # Create a token eith name 'a' response = test_client.post( '/tokens', - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, json=json_record, ) assert response.status_code == HTTP_201_CREATED @@ -29,7 +29,7 @@ def test_token_creation(fastapi_client_simple): # Try to create another token with name 'a', should result in a 4ß9-error response = test_client.post( '/tokens', - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, json=json_record, ) assert response.status_code == HTTP_409_CONFLICT @@ -39,7 +39,7 @@ def test_token_creation(fastapi_client_simple): json_record['name'] = 'b' response = test_client.post( '/tokens', - headers={'x-dumpthings-token': 'admin-1'}, + headers={'x-dumpthings-token': admin_token}, json=json_record, ) assert response.status_code == HTTP_409_CONFLICT diff --git a/dump_things_service/tests/test_validate.py b/dump_things_service/tests/test_validate.py index a5da6aa..800b17c 100644 --- 
a/dump_things_service/tests/test_validate.py +++ b/dump_things_service/tests/test_validate.py @@ -44,7 +44,7 @@ xyz:henry a abc:Person ; def test_validate_record(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for record, expected_status in json_records: response = test_client.post( diff --git a/dump_things_service/tests/test_web_interface.py b/dump_things_service/tests/test_web_interface.py index 039cacb..8096d5a 100644 --- a/dump_things_service/tests/test_web_interface.py +++ b/dump_things_service/tests/test_web_interface.py @@ -25,7 +25,7 @@ def test_web_interface_post_errors( format_name, ): """Check that no internal server error occurs with weird input""" - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple result = test_client.post( f'/{collection_name}/record/{class_name}?{query}={format_name}', headers={'x-dumpthings-token': 'token-all'}, @@ -46,7 +46,7 @@ def test_web_interface_get_class_errors( format_name, ): """Check that no internal server error occurs with weird input""" - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple result = test_client.get( f'/{collection_name}/records/{class_name}?{query}={format_name}', ) @@ -71,7 +71,7 @@ def test_web_interface_get_pid_errors( format_name, ): """Check that no internal server error occurs with weird input""" - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple result = test_client.get( f'/{collection_name}/records?{pid}&{query}={format_name}', ) -- 2.52.0 From 728552fd219dca48a1f6be52df85b047f7095cc6 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 10 Jun 2026 16:33:38 +0200 Subject: [PATCH 51/64] replace `schema`-attribute in pydantic-objects Replaces the `schema`-attribute with the attribute `schema_location`. That prevents shadowing of pydantic's internal `schema`-attribute and gets rid of pydantic-issues warnings. 
--- dump_things_service/abstract_config.py | 2 +- dump_things_service/collection.py | 2 +- dump_things_service/main.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index bcd5058..7722fac 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -107,7 +107,7 @@ class RecordDirConfigFileContent(BaseModel): model_config = ConfigDict(extra='forbid') type: Literal['records'] version: Literal[1] - schema: str + schema_location: str = Field(alias='schema') format: Literal['yaml'] idfx: MappingMethod diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index 54b73d6..f9ed186 100644 --- a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -415,7 +415,7 @@ def check_record_dir_compatibility( schema: str, ): record_dir_config = get_record_dir_config(store_path) - if record_dir_config.schema != schema: + if record_dir_config.schema_location != schema: raise ConfigCollisionError(f"Existing backend uses a different schema: '{record_dir_config.schema}'") stored_mapping_method = record_dir_config.idfx.value diff --git a/dump_things_service/main.py b/dump_things_service/main.py index 7b3e62c..d2cb0ed 100644 --- a/dump_things_service/main.py +++ b/dump_things_service/main.py @@ -7,14 +7,13 @@ import sys from pathlib import Path from typing import TYPE_CHECKING -import yaml - from dump_things_service.abstract_config import store_config from dump_things_service.commands.load_config import convert_to_new_format from dump_things_service.manifest import manifest_configuration # Perform the patching before importing any third-party libraries from dump_things_service.patches import enabled # noqa F401 -- used by generated code +import yaml import uvicorn from fastapi import ( Depends, @@ -30,6 +29,7 @@ from fastapi_pagination import ( from fastapi_pagination.utils import 
disable_installed_extensions_check from pydantic import ( BaseModel, + Field, ) from starlette.responses import ( PlainTextResponse, @@ -87,7 +87,7 @@ class MaintenanceRequest(BaseModel): class ServerCollectionResponse(BaseModel): name: str - schema: str + schema_location: str = Field(alias='schema') classes: list[str] -- 2.52.0 From 9398ac207f6eb0091ebdce925a7144d3cb7bb8bf Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Wed, 10 Jun 2026 16:53:10 +0200 Subject: [PATCH 52/64] remove unnecessary module `mapping_functions.py` --- dump_things_service/abstract_config.py | 68 ++++++++++++++++++-- dump_things_service/mapping_functions.py | 82 ------------------------ 2 files changed, 64 insertions(+), 86 deletions(-) delete mode 100644 dump_things_service/mapping_functions.py diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py index 7722fac..ca4dc36 100644 --- a/dump_things_service/abstract_config.py +++ b/dump_things_service/abstract_config.py @@ -1,11 +1,13 @@ import enum import hashlib import logging +from functools import partial from pathlib import ( Path, PurePosixPath, ) from typing import ( + Callable, Iterable, Literal, cast, @@ -29,10 +31,6 @@ from dump_things_service.backends.record_dir import ( RecordDirStore, ) from dump_things_service.exceptions import ConfigError -from dump_things_service.mapping_functions import ( - MappingMethod, - mapping_functions, -) logger = logging.getLogger('dump_things_service') @@ -73,6 +71,16 @@ class TagSpec(BaseModel): submission_time_tag: str = 'http://semanticscience.org/resource/SIO_001083' +class MappingMethod(enum.Enum): + digest_md5 = 'digest-md5' + digest_md5_p3 = 'digest-md5-p3' + digest_md5_p3_p3 = 'digest-md5-p3-p3' + digest_sha1 = 'digest-sha1' + digest_sha1_p3 = 'digest-sha1-p3' + digest_sha1_p3_p3 = 'digest-sha1-p3-p3' + after_last_colon = 'after-last-colon' + + class RecordDirBackendConfig(StrictModel): model_config = ConfigDict(use_enum_values=True) type: 
Literal['record_dir', 'record_dir+stl'] @@ -435,3 +443,55 @@ def get_mapping_function(record_dir_backend_config: RecordDirBackendConfig): def get_backend_and_extension(backend_type: str) -> tuple[str, str]: elements = backend_type.split('+') return (elements[0], elements[1]) if len(elements) > 1 else (elements[0], '') + + +def get_hex_digest(hasher: Callable, data: str) -> str: + hash_context = hasher(data.encode()) + return hash_context.hexdigest() + + +def mapping_digest_p3( + hasher: Callable, + pid: str, + suffix: str, +) -> Path: + hex_digest = get_hex_digest(hasher, pid) + return Path(hex_digest[:3]) / (hex_digest[3:] + '.' + suffix) + + +def mapping_digest_p3_p3( + hasher: Callable, + pid: str, + suffix: str, +) -> Path: + hex_digest = get_hex_digest(hasher, pid) + return Path(hex_digest[:3]) / hex_digest[3:6] / (hex_digest[6:] + '.' + suffix) + + +def mapping_digest(hasher: Callable, pid: str, suffix: str) -> Path: + hex_digest = get_hex_digest(hasher, pid) + return Path(hex_digest + '.' + suffix) + + +def mapping_after_last_colon(pid: str, suffix: str) -> Path: + plain_result = pid.split(':')[-1] + # Escape any colons and slashes in the pid + escaped_result = ( + plain_result.replace('_', '__').replace('/', '_s').replace('.', '_d') + ) + return Path(escaped_result + '.' 
+ suffix) + + +mapping_functions = { + MappingMethod.digest_md5: partial(mapping_digest, hashlib.md5), + MappingMethod.digest_md5_p3: partial(mapping_digest_p3, hashlib.md5), + MappingMethod.digest_md5_p3_p3: partial(mapping_digest_p3_p3, hashlib.md5), + MappingMethod.digest_sha1: partial(mapping_digest, hashlib.sha1), + MappingMethod.digest_sha1_p3: partial(mapping_digest_p3, hashlib.sha1), + MappingMethod.digest_sha1_p3_p3: partial(mapping_digest_p3_p3, hashlib.sha1), + MappingMethod.after_last_colon: mapping_after_last_colon, +} + + +def get_mapping_function_by_name(mapping_function_name: str) -> Callable: + return mapping_functions[MappingMethod(mapping_function_name)] diff --git a/dump_things_service/mapping_functions.py b/dump_things_service/mapping_functions.py deleted file mode 100644 index 31587c6..0000000 --- a/dump_things_service/mapping_functions.py +++ /dev/null @@ -1,82 +0,0 @@ -import enum -import hashlib -from functools import partial -from pathlib import Path -from typing import Callable, Literal - -from pydantic import BaseModel, ConfigDict - - -class MappingMethod(enum.Enum): - digest_md5 = 'digest-md5' - digest_md5_p3 = 'digest-md5-p3' - digest_md5_p3_p3 = 'digest-md5-p3-p3' - digest_sha1 = 'digest-sha1' - digest_sha1_p3 = 'digest-sha1-p3' - digest_sha1_p3_p3 = 'digest-sha1-p3-p3' - after_last_colon = 'after-last-colon' - - -class CollectionDirConfigContent(BaseModel): - model_config = ConfigDict(extra='forbid', use_enum_values=True) - type: Literal['records'] - version: Literal[1] - schema: str - format: Literal['yaml'] - idfx: MappingMethod - - -def get_hex_digest(hasher: Callable, data: str) -> str: - hash_context = hasher(data.encode()) - return hash_context.hexdigest() - - -def mapping_digest_p3( - hasher: Callable, - pid: str, - suffix: str, -) -> Path: - hex_digest = get_hex_digest(hasher, pid) - return Path(hex_digest[:3]) / (hex_digest[3:] + '.' 
+ suffix) - - -def mapping_digest_p3_p3( - hasher: Callable, - pid: str, - suffix: str, -) -> Path: - hex_digest = get_hex_digest(hasher, pid) - return Path(hex_digest[:3]) / hex_digest[3:6] / (hex_digest[6:] + '.' + suffix) - - -def mapping_digest(hasher: Callable, pid: str, suffix: str) -> Path: - hex_digest = get_hex_digest(hasher, pid) - return Path(hex_digest + '.' + suffix) - - -def mapping_after_last_colon(pid: str, suffix: str) -> Path: - plain_result = pid.split(':')[-1] - # Escape any colons and slashes in the pid - escaped_result = ( - plain_result.replace('_', '__').replace('/', '_s').replace('.', '_d') - ) - return Path(escaped_result + '.' + suffix) - - -mapping_functions = { - MappingMethod.digest_md5: partial(mapping_digest, hashlib.md5), - MappingMethod.digest_md5_p3: partial(mapping_digest_p3, hashlib.md5), - MappingMethod.digest_md5_p3_p3: partial(mapping_digest_p3_p3, hashlib.md5), - MappingMethod.digest_sha1: partial(mapping_digest, hashlib.sha1), - MappingMethod.digest_sha1_p3: partial(mapping_digest_p3, hashlib.sha1), - MappingMethod.digest_sha1_p3_p3: partial(mapping_digest_p3_p3, hashlib.sha1), - MappingMethod.after_last_colon: mapping_after_last_colon, -} - - -def get_mapping_function_by_name(mapping_function_name: str) -> Callable: - return mapping_functions[MappingMethod(mapping_function_name)] - - -def get_mapping_function(collection_config: CollectionDirConfigContent): - return mapping_functions[collection_config.idfx] -- 2.52.0 From edeccc5a038d418138fcbdca5f3608c9c687289c Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 11 Jun 2026 09:39:47 +0200 Subject: [PATCH 53/64] add compatibility check for git audit directories Add code to check whether the directory specified in the git-audit configuration is either non-existing, empty, or compatible with git-audit.
--- dump_things_service/collection.py | 33 +++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index f9ed186..ce9761f 100644 --- a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -1,9 +1,14 @@ import logging +import os import shutil import sys from pathlib import Path from typing import Any +from datalad_core.runners import ( + call_git_oneline, + CommandError, +) from fastapi import ( Depends, FastAPI, @@ -181,7 +186,7 @@ def create_collection( for audit_backend in collection_configuration.audit_backends: audit_path = Path(instance_state.store_path / audit_backend.path) if audit_path.exists(): - check_audit_compatibility(audit_path) + check_git_audit_compatibility(audit_path) # We know now that all existing structures are compatible with the # collection specification. We record what we create in order to delete @@ -381,10 +386,6 @@ idfx: {backend_config.mapping_method} ) -def create_audit_store(*args, **kwargs): - return - - def check_store_compatibility( store_path: Path, backend_config: RecordDirBackendConfig | SQLiteBackendConfig, @@ -436,17 +437,33 @@ def check_sqlite_compatibility( return -def check_audit_compatibility( +def check_git_audit_compatibility( audit_path: Path, ): - """Check if an existing audit is compatible with the specs + """Check if an existing audit path is compatible with a git audit store :param audit_path: :return: """ + + # Non-existing or empty directories are valid gitaudit-locations if not audit_path.exists(): return - print('IMPLEMENT: check_audit_compatibility', file=sys.stderr, flush=True) + if not tuple(os.scandir(audit_path)): + return + + # A non-empty directory should contain bare git repository + try: + result = call_git_oneline( + ['rev-parse', '--is-bare-repository'], + cwd=audit_path, + force_c_locale=True, + ) + except CommandError as ce: + raise ConfigError(f'No git repository in 
gitaudit-path: {audit_path}') from ce + if result.strip().lower() != 'true': + raise ConfigError(f'No bare git repository in gitaudit-path: {audit_path}') + return def create_endpoint( -- 2.52.0 From 29117e1f188b0737ced553bf263f849a6cb819a8 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 11 Jun 2026 09:56:22 +0200 Subject: [PATCH 54/64] fix a typo in a comment --- dump_things_service/manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dump_things_service/manifest.py b/dump_things_service/manifest.py index 70fa2a9..71cb722 100644 --- a/dump_things_service/manifest.py +++ b/dump_things_service/manifest.py @@ -155,7 +155,7 @@ def manifest_configuration( # authentication sources and the configuration-file authentication source # is just one possible authentication source. Other authentication sources # have unknown means to create incoming area labels. - # Incoming areas are therefore create when a write request for a token + # Incoming areas are therefore created when a write request for a token # is authorized. 
-- 2.52.0 From 76b14f70288a4948220387b87f1043d1e6711307 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 11 Jun 2026 13:42:51 +0200 Subject: [PATCH 55/64] remove duplicated code --- dump_things_service/collection.py | 107 ++++++------------------------ dump_things_service/utils.py | 77 ++++++++++++++------- 2 files changed, 74 insertions(+), 110 deletions(-) diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py index ce9761f..ea386f0 100644 --- a/dump_things_service/collection.py +++ b/dump_things_service/collection.py @@ -1,7 +1,6 @@ import logging import os import shutil -import sys from pathlib import Path from typing import Any @@ -38,26 +37,15 @@ from dump_things_service.abstract_config import ( ForgejoAuthSpec, RecordDirBackendConfig, SQLiteBackendConfig, - TagSpec, read_config, check_collection, - get_backend_and_extension, get_default_token_representation, - get_mapping_function, ) from dump_things_service.audit.gitaudit import GitAuditBackend from dump_things_service.auth.config import ConfigAuthenticationSource from dump_things_service.auth.forgejo import ForgejoAuthenticationSource -from dump_things_service.backends.record_dir import ( - _RecordDirStore, - RecordDirStore, -) -from dump_things_service.backends.sqlite import ( - _SQLiteBackend, - SQLiteBackend, - record_file_name as sqlite_db_filename, -) -from dump_things_service.backends.schema_type_layer import SchemaTypeLayer +from dump_things_service.backends.record_dir_index import index_file_name +from dump_things_service.backends.sqlite import record_file_name as sqlite_db_filename from dump_things_service.instance_state import ( InstanceState, InstanceStateCollectionInfo, @@ -66,10 +54,6 @@ from dump_things_service.instance_state import ( get_schema_info, record_dir_config_file_name, ) -from dump_things_service.store.model_store import ( - _ModelStore, - ModelStore, -) from dump_things_service.converter import FormatConverter from 
dump_things_service.exceptions import ( ConfigError, @@ -79,10 +63,11 @@ from dump_things_service.exceptions import ( from dump_things_service.model import get_model_for_schema from dump_things_service.utils import ( combine_ttl, + create_store, get_token_store, join_default_token_permissions, - wrap_http_exception, var_escape, + wrap_http_exception, ) @@ -215,12 +200,11 @@ def create_collection( # Create the curated store curated_store = create_store( - instance_state, - curated_path, - collection_configuration.backend, - collection_configuration.schema_location, - collection_configuration.submission_tags + abstract_configuration=configuration, + instance_state=instance_state, + collection_name=collection_name, ) + instance_state.curated_stores[collection_name] = curated_store # Incoming stores are created on demand when a token is authenticated @@ -278,64 +262,6 @@ def create_collection( ) -def create_store( - instance_state: InstanceState, - relative_path: Path, - backend_config: RecordDirBackendConfig | SQLiteBackendConfig, - schema: str, - submission_tags: TagSpec, -) -> _ModelStore: - - backend_type, extension = get_backend_and_extension(backend_config.type) - if isinstance(backend_config, RecordDirBackendConfig): - backend = create_record_dir_backend(instance_state, relative_path, backend_config, schema) - elif isinstance(backend_config, SQLiteBackendConfig): - backend = create_sqlite_backend(instance_state, relative_path) - else: - msg = f'Unsupported backend configuration type: {backend_type} ({type(backend_config)})' - raise ConfigError(msg) - - if extension == 'stl': - backend = SchemaTypeLayer(backend=backend, schema=schema) - - return ModelStore( - schema=schema, - backend=backend, - tags={ - 'id': submission_tags.submitter_id_tag, - 'time': submission_tags.submission_time_tag, - }, - ) - - -def create_record_dir_backend( - instance_state: InstanceState, - relative_path: Path, - backend_config: RecordDirBackendConfig, - schema: str, -) -> 
_RecordDirStore: - path = instance_state.store_path / relative_path - write_record_dir_config(path, backend_config, schema) - backend = RecordDirStore( - root=path, - pid_mapping_function=get_mapping_function(backend_config), - suffix='yaml', - order_by=instance_state.order_by, - ) - backend.build_index_if_needed(schema=schema) - return backend - - -def create_sqlite_backend( - instance_state: InstanceState, - relative_path: Path, -) -> _SQLiteBackend: - return SQLiteBackend( - db_path=instance_state.store_path / relative_path / sqlite_db_filename, - order_by=instance_state.order_by, - ) - - def create_authentication_source( abstract_configuration: Configuration, collection_name: str, @@ -403,7 +329,7 @@ def check_store_compatibility( if isinstance(backend_config, RecordDirBackendConfig): check_record_dir_compatibility(store_path, backend_config, schema) elif isinstance(backend_config, SQLiteBackendConfig): - check_sqlite_compatibility(store_path, backend_config, schema) + check_sqlite_compatibility(store_path) else: msg = f"Unsupported backend config type: '{type(backend_config)}'" raise ConfigError(msg) @@ -415,9 +341,20 @@ def check_record_dir_compatibility( backend_config: RecordDirBackendConfig, schema: str, ): + + # Non-existing or empty record_dir-directories are compatible + if not store_path.exists(): + return + + # A record_dir-directory is considered to be empty, if it contains no + # files or only an record_dir-index file + files_in_dir = tuple(map(lambda dir_entry: dir_entry.name, os.scandir(store_path))) + if files_in_dir in ((), (index_file_name,)): + return + record_dir_config = get_record_dir_config(store_path) if record_dir_config.schema_location != schema: - raise ConfigCollisionError(f"Existing backend uses a different schema: '{record_dir_config.schema}'") + raise ConfigCollisionError(f"Existing backend uses a different schema: '{record_dir_config.schema_location}'") stored_mapping_method = record_dir_config.idfx.value if stored_mapping_method 
!= backend_config.mapping_method: @@ -428,8 +365,6 @@ def check_record_dir_compatibility( def check_sqlite_compatibility( store_path: Path, - backend_config: SQLiteBackendConfig, - schema: str, ): sqlite_db_path = Path(store_path / sqlite_db_filename) if not sqlite_db_path.exists(): diff --git a/dump_things_service/utils.py b/dump_things_service/utils.py index c023a85..cbdb276 100644 --- a/dump_things_service/utils.py +++ b/dump_things_service/utils.py @@ -37,6 +37,7 @@ from dump_things_service.abstract_config import ( check_collection, get_default_token_config, get_token_config_for_representation_and_collection, + get_mapping_function_by_name, ) from dump_things_service.auth import ( AuthenticationError, @@ -271,6 +272,20 @@ def get_token_store( return instance_state.incoming_stores[collection_name][token_representation] +def create_store( + abstract_configuration: Configuration, + instance_state: InstanceState, + collection_name: str, +) -> _ModelStore: + collection_curated_path = abstract_configuration.collections[collection_name].curated + return create_token_store( + abstract_configuration=abstract_configuration, + instance_state=instance_state, + collection_name=collection_name, + store_dir=instance_state.store_path / collection_curated_path, + ) + + def create_token_store( abstract_configuration: Configuration, instance_state: InstanceState, @@ -305,47 +320,43 @@ def create_token_store( schema_uri = abstract_configuration.collections[collection_name].schema_location # We get the backend information from the abstract configuration - backend_type = abstract_configuration.collections[collection_name].backend.type - backend_name, extension = get_backend_and_extension(backend_type) - - backend = instance_state.curated_stores[collection_name].backend + backend_config = abstract_configuration.collections[collection_name].backend + backend_name, extension = get_backend_and_extension(backend_config.type) if backend_name == 'record_dir': - # The configuration routines 
have read the backend configuration of the - # curated store from disk and stored it in `instance_state`. We fetch - # it from there. - if extension == 'stl': - backend = backend.backend - token_store = create_record_dir_token_store( + backend = create_record_dir_token_store_backend( store_dir=store_dir, - order_by=backend.order_by, + order_by=instance_state.order_by, schema_uri=schema_uri, - mapping_function=backend.pid_mapping_function, - suffix=backend.suffix, + mapping_function=get_mapping_function_by_name(backend_config.mapping_method), + suffix='yaml', ) elif backend_name == 'sqlite': - token_store = create_sqlite_token_store( + backend = create_sqlite_token_store_backend( store_dir=store_dir, - order_by=backend.order_by, + order_by=instance_state.order_by, ) else: # This should not happen because we base our decision on already # existing backends. - msg = f'Unsupported backend type: `{backend_type}`.' + msg = f'Unsupported backend type: `{backend_name}`.' raise ConfigError(msg) if extension == 'stl': - token_store = SchemaTypeLayer(backend=token_store, schema=schema_uri) + backend = SchemaTypeLayer(backend=backend, schema=schema_uri) submission_tags = abstract_configuration.collections[collection_name].submission_tags - tags = { - 'id': submission_tags.submitter_id_tag, - 'time': submission_tags.submission_time_tag, - } - return ModelStore(backend=token_store, schema=schema_uri, tags=tags) + return ModelStore( + schema=schema_uri, + backend=backend, + tags={ + 'id': submission_tags.submitter_id_tag, + 'time': submission_tags.submission_time_tag, + }, + ) -def create_record_dir_token_store( +def create_record_dir_token_store_backend( store_dir: Path, order_by: list[str], schema_uri: str, @@ -364,7 +375,25 @@ def create_record_dir_token_store( return store_backend -def create_sqlite_token_store( +def write_record_dir_config( + path: Path, + backend_config: RecordDirBackendConfig, + schema: str, +): + assert isinstance(backend_config, RecordDirBackendConfig) + 
+ record_dir_config_file_path = path / record_dir_config_file_name + if not record_dir_config_file_path.exists(): + record_dir_config_file_path.write_text(f"""# RecordDir Config +type: records +version: 1 +schema: {schema} +format: yaml +idfx: {backend_config.mapping_method} +""", + ) + +def create_sqlite_token_store_backend( store_dir: Path, order_by: list[str], ) -> _SQLiteBackend: -- 2.52.0 From dc88bce026b8b687e08705a27a0ad7f528f388fc Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 12 Jun 2026 09:37:31 +0200 Subject: [PATCH 56/64] remove unused code, fix typos in comments --- dump_things_service/utils.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/dump_things_service/utils.py b/dump_things_service/utils.py index cbdb276..894102c 100644 --- a/dump_things_service/utils.py +++ b/dump_things_service/utils.py @@ -20,7 +20,6 @@ from typing import ( import fsspec from fastapi import HTTPException from rdflib import Graph -from starlette.status import HTTP_500_INTERNAL_SERVER_ERROR from dump_things_service import ( HTTP_400_BAD_REQUEST, @@ -31,6 +30,7 @@ from dump_things_service import ( ) from dump_things_service.abstract_config import ( Configuration, + RecordDirBackendConfig, TokenModes, TokenPermission, mode_mapping, @@ -39,6 +39,7 @@ from dump_things_service.abstract_config import ( get_token_config_for_representation_and_collection, get_mapping_function_by_name, ) +from dump_things_service.instance_state import record_dir_config_file_name from dump_things_service.auth import ( AuthenticationError, AuthenticationInfo, @@ -70,7 +71,7 @@ def sys_path(paths: list[str | Path]): def read_url(url: str) -> str: """ - Read the content of an URL into memory. + Read the content of a URL into memory. 
""" open_file = fsspec.open(url, 'rt') with open_file as f: @@ -94,17 +95,6 @@ def combine_ttl(documents: list[str]) -> str: return reduce(lambda g1, g2: g1 + g2, graphs).serialize(format='ttl') -def get_schema_type_curie( - instance_state: InstanceState, - collection: str, - class_name: str, -) -> str: - schema_url = instance_state.schemas[collection] - schema_module = instance_state.conversion_objects[schema_url]['schema_module'] - class_object = getattr(schema_module, class_name) - return class_object.class_class_curie - - @contextmanager def wrap_http_exception( exception_class: type[BaseException] = ValueError, @@ -141,7 +131,7 @@ def join_default_token_permissions( # We allow inconsistencies in token/collection configuration space. This # allows an administrator to create tokens and collections in two separate - # steps. Therefore we have to check whether the referred default token + # steps. Therefore, we have to check whether the referred default token # is actually defined for the collection. 
if collection not in abstract_configuration.tokens[default_token_name].collections: return result -- 2.52.0 From c056ce9dfd51a20319261d77a066bfc19560fea3 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 12 Jun 2026 10:47:29 +0200 Subject: [PATCH 57/64] remove unused code --- dump_things_service/export/__init__.py | 7 -- dump_things_service/export/json.py | 138 ------------------------- dump_things_service/export/tree.py | 120 --------------------- 3 files changed, 265 deletions(-) delete mode 100644 dump_things_service/export/__init__.py delete mode 100644 dump_things_service/export/json.py delete mode 100644 dump_things_service/export/tree.py diff --git a/dump_things_service/export/__init__.py b/dump_things_service/export/__init__.py deleted file mode 100644 index 7f1c00b..0000000 --- a/dump_things_service/export/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .json import export_json -from .tree import export_tree - -exporter_info = { - 'json': export_json, - 'tree': export_tree, -} diff --git a/dump_things_service/export/json.py b/dump_things_service/export/json.py deleted file mode 100644 index c33292a..0000000 --- a/dump_things_service/export/json.py +++ /dev/null @@ -1,138 +0,0 @@ -import json -import sys -from pathlib import Path -from typing import TextIO - -from dump_things_service.abstract_config import Configuration -from dump_things_service.lazy_list import LazyList -from dump_things_service.model import get_classes -from dump_things_service.store.model_store import ModelStore - -level_width = 2 - - -# The _lookahead function is taken from: -# https://stackoverflow.com/questions/1630320/what-is-the-pythonic-way-to-detect-the-last-element-in-a-for-loop -# with small changes -def _lookahead(iterable): - """Pass through all values from the given iterable, augmented by the - information if there are more values to come after the current one - (True), or if it is the last value (False). - """ - # Get an iterator and pull the first value. 
- it = iter(iterable) - try: - last = next(it) - except StopIteration: - return - # Run the iterator to exhaustion (starting from the second value). - for val in it: - # Report the *previous* value (more to come). - yield last, False - last = val - # Report the last value. - yield last, True - - -def export_json( - abstract_config: Configuration, - destination: str, -): - if destination == '-': - output = sys.stdout - else: - output = Path(destination).open('wt', encoding='utf-8') # noqa: SIM115 - - output.write('{\n') - for collection, is_last in _lookahead(abstract_config.collections): - output.write(f'{level_width * " "}"{collection}": {{\n') - export_collection(abstract_config, collection, 2 * level_width, output) - if is_last: - output.write(f'\n{level_width * " "}}}\n') - else: - output.write(f'\n{level_width * " "}}},\n') - output.write('}\n') - - -def export_collection( - abstract_config: Configuration, - collection: str, - indent: int, - output: TextIO, -): - output.write(f'{indent * " "}"schema": "{abstract_config.collections[collection].schema_location}",\n') - output.write(f'{indent * " "}"curated": {{\n') - append_classes( - instance_config.curated_stores[collection], indent + level_width, output - ) - output.write(f'\n{indent * " "}}}') - - # Determine stores for incoming zones - zones = { - label: instance_config.token_stores[token]['collections'] - .get(collection, {}) - .get('store') - for token, label in instance_config.zones.get(collection, {}).items() - if instance_config.token_stores[token]['collections'] - .get(collection, {}) - .get('store') - is not None - } - - if zones: - # Put a comma between "curated" and "incoming". 
- output.write(f',\n{indent * " "}"incoming": {{\n') - indent_zone = indent + level_width - indent_classes = indent_zone + level_width - for (zone, store), is_last in _lookahead(zones.items()): - output.write(f'{indent_zone * " "}"{zone}": {{\n') - append_classes(store, indent_classes, output) - if is_last: - output.write(f'\n{(indent + level_width) * " "}}}') - else: - output.write(f'\n{(indent + level_width) * " "}}},\n') - - # End the "incoming" dictionary - output.write(f'\n{indent * " "}}}') - - -def append_classes( - store: ModelStore, - indent: int, - output: TextIO, -): - """Append instances of all classes to the file""" - class_names = get_classes(store.model) - - first = True - for class_name in class_names: - # We know that pure `Thing` instances are not stored in the store. - if class_name == 'Thing': - continue - - class_instances = store.get_objects_of_class( - class_name, include_subclasses=False - ) - if class_instances: - if not first: - output.write(',\n') - first = False - output.write(f'{indent * " "}"{class_name}": [\n') - append_instances( - class_instances, - output, - indent + level_width, - ) - output.write(f'\n{indent * " "}]') - - -def append_instances( - instances: LazyList, - output: TextIO, - indent: int, -): - for instance, is_last in _lookahead(instances): - json_string = json.dumps(instance.json_object, ensure_ascii=False) - output.write(f'{(indent + level_width) * " "}{json_string}') - if not is_last: - output.write(',\n') diff --git a/dump_things_service/export/tree.py b/dump_things_service/export/tree.py deleted file mode 100644 index 457e6ed..0000000 --- a/dump_things_service/export/tree.py +++ /dev/null @@ -1,120 +0,0 @@ -from pathlib import Path - -import yaml - - -from dump_things_service.abstract_config import ( - Configuration, - get_mapping_function_by_name, -) -from dump_things_service.instance_state import InstanceState -from dump_things_service.model import get_classes -from dump_things_service.store.model_store import 
ModelStore - -idfx = get_mapping_function_by_name('digest-md5-p3-p3') - - -def export_tree( - abstract_config: Configuration, - instance_state: InstanceState, - destination: str, -): - destination = Path(destination) - if destination.exists() and not destination.is_dir(): - msg = 'The export_tree destination path must be a directory.' - raise ValueError(msg) - - destination.mkdir(parents=True, exist_ok=True) - for collection_name in abstract_config.collections: - export_collection( - abstract_config, - instance_state, - collection_name, - destination, - ) - - -def export_collection( - abstract_config: Configuration, - instance_state: InstanceState, - collection: str, - destination: Path, -): - collection_destination = destination / collection - collection_destination.mkdir(parents=True, exist_ok=True) - - config_content = ( - 'type: records\n' - 'version: 1\n' - f'schema: {abstract_config.collections[collection].schema_location}\n' - 'format: yaml\n' - 'idfx: digest-md5-p3-p3\n' - ) - - curated_destination = collection_destination / 'curated' - curated_destination.mkdir(parents=True, exist_ok=True) - (curated_destination / '.dumpthings.yaml').write_text(config_content) - exported_stores = { - id(instance_state.curated_stores[collection]): curated_destination - } - export_classes(instance_state.curated_stores[collection], curated_destination) - - # Determine stores for incoming zones - zones = { - label: instance_config.token_stores[token]['collections'] - .get(collection, {}) - .get('store') - for token, label in instance_config.zones.get(collection, {}).items() - if instance_config.token_stores[token]['collections'] - .get(collection, {}) - .get('store') - is not None - } - - if zones: - incoming_destination = collection_destination / 'incoming' - for zone, store in zones.items(): - zone_destination = incoming_destination / zone - if id(store) in exported_stores: - # Already exported this store, make `zone_destination` a link - # to the existing export. 
- zone_destination.parent.mkdir(parents=True, exist_ok=True) - zone_destination.symlink_to(exported_stores[id(store)]) - continue - exported_stores[id(store)] = zone_destination = ( - collection_destination / 'incoming' / zone - ) - zone_destination.mkdir(parents=True, exist_ok=True) - (zone_destination / '.dumpthings.yaml').write_text(config_content) - export_classes(store, zone_destination) - - -def export_classes( - store: ModelStore, - destination: Path, -): - class_names = get_classes(store.model) - for class_name in class_names: - # We know that pure `Thing` instances are not stored in the store. - if class_name == 'Thing': - continue - - record_infos = store.get_objects_of_class(class_name, include_subclasses=False) - if record_infos: - class_destination = destination / class_name - class_destination.mkdir(parents=True, exist_ok=True) - for record_info in record_infos: - json_object = record_info.json_object - instance_destination = class_destination / idfx( - json_object['pid'], - 'yaml', - ) - instance_destination.parent.mkdir(parents=True, exist_ok=True) - instance_destination.write_text( - yaml.dump( - data=json_object, - sort_keys=False, - allow_unicode=True, - default_flow_style=False, - ) - ) -- 2.52.0 From 5bd4dc1dadfd4d8f1e38a318b9efe934865ddf96 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 12 Jun 2026 10:47:37 +0200 Subject: [PATCH 58/64] fix circular import --- dump_things_service/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dump_things_service/utils.py b/dump_things_service/utils.py index 894102c..5c9cd24 100644 --- a/dump_things_service/utils.py +++ b/dump_things_service/utils.py @@ -39,7 +39,6 @@ from dump_things_service.abstract_config import ( get_token_config_for_representation_and_collection, get_mapping_function_by_name, ) -from dump_things_service.instance_state import record_dir_config_file_name from dump_things_service.auth import ( AuthenticationError, AuthenticationInfo, @@ -370,6 +369,8 @@ 
def write_record_dir_config( backend_config: RecordDirBackendConfig, schema: str, ): + from dump_things_service.instance_state import record_dir_config_file_name + assert isinstance(backend_config, RecordDirBackendConfig) record_dir_config_file_path = path / record_dir_config_file_name -- 2.52.0 From 7e51656e501b1bda7efc51841c36c0e6d41677c7 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 12 Jun 2026 11:48:15 +0200 Subject: [PATCH 59/64] ensure that record-dir config files are created If a record-dir backend is created on a directory, check if the config file exists, if not write it. --- dump_things_service/utils.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/dump_things_service/utils.py b/dump_things_service/utils.py index 5c9cd24..328e3e9 100644 --- a/dump_things_service/utils.py +++ b/dump_things_service/utils.py @@ -317,7 +317,7 @@ def create_token_store( store_dir=store_dir, order_by=instance_state.order_by, schema_uri=schema_uri, - mapping_function=get_mapping_function_by_name(backend_config.mapping_method), + mapping_function=backend_config.mapping_method, suffix='yaml', ) elif backend_name == 'sqlite': @@ -349,14 +349,23 @@ def create_record_dir_token_store_backend( store_dir: Path, order_by: list[str], schema_uri: str, - mapping_function: Callable, + mapping_function: str, suffix: str, ) -> _RecordDirStore: + from dump_things_service.instance_state import record_dir_config_file_name from dump_things_service.backends.record_dir import RecordDirStore + # Write the configuration to the store, if it does not yet exist. 
+ if not (store_dir / record_dir_config_file_name).exists(): + write_record_dir_config( + path=store_dir, + mapping_function=mapping_function, + schema=schema_uri, + ) + store_backend = RecordDirStore( root=store_dir, - pid_mapping_function=mapping_function, + pid_mapping_function=get_mapping_function_by_name(mapping_function), suffix=suffix, order_by=order_by, ) @@ -366,13 +375,11 @@ def create_record_dir_token_store_backend( def write_record_dir_config( path: Path, - backend_config: RecordDirBackendConfig, + mapping_function: str, schema: str, ): from dump_things_service.instance_state import record_dir_config_file_name - assert isinstance(backend_config, RecordDirBackendConfig) - record_dir_config_file_path = path / record_dir_config_file_name if not record_dir_config_file_path.exists(): record_dir_config_file_path.write_text(f"""# RecordDir Config @@ -380,9 +387,10 @@ type: records version: 1 schema: {schema} format: yaml -idfx: {backend_config.mapping_method} +idfx: {mapping_function} """, - ) + ) + def create_sqlite_token_store_backend( store_dir: Path, -- 2.52.0 From 4aa902259b32e68212fb85b1e716c7c49e082488 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 12 Jun 2026 11:49:23 +0200 Subject: [PATCH 60/64] adapt `dump-things-pid-check` to the new structure Adapt the code of `dump-things-pid-check` to the dynamic configuration changes. 
--- dump_things_service/commands/check_pids.py | 52 ++++++++++------------ 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/dump_things_service/commands/check_pids.py b/dump_things_service/commands/check_pids.py index 764ca44..620e647 100644 --- a/dump_things_service/commands/check_pids.py +++ b/dump_things_service/commands/check_pids.py @@ -5,16 +5,20 @@ from argparse import ArgumentParser from collections.abc import Iterable from pathlib import Path -from dump_things_service import config_file_name -from dump_things_service.abstract_config import read_config +from fastapi import FastAPI + +from dump_things_service.abstract_config import ( + get_config_labels, + read_config, +) from dump_things_service.backends.schema_type_layer import _SchemaTypeLayer from dump_things_service.backends.sqlite import _SQLiteBackend -from dump_things_service.config import get_config, process_config from dump_things_service.exceptions import CurieResolutionError +from dump_things_service.instance_state import create_instance_state +from dump_things_service.manifest import manifest_configuration from dump_things_service.store.model_store import _ModelStore from dump_things_service.utils import ( create_token_store, - get_config_labels, get_on_disk_labels, ) @@ -27,13 +31,6 @@ parser.add_argument( 'store', help='The root directory of the store.', ) -parser.add_argument( - '-c', - '--config', - metavar='CONFIG_FILE', - help="Read the configuration from 'CONFIG_FILE' instead of looking for " - 'it in the root directory of the store.', -) def show_backend(model_store: _ModelStore): @@ -63,10 +60,16 @@ def check_pids_in_stores( return result -def check_pids(): - - instance_state = get_config() - abstract_config = read_config(instance_state.store_path) +def check_pids( + store_path: Path, +): + abstract_config = read_config(store_path) + instance_state = create_instance_state( + store_path=store_path, + bootstrap_token='', + fastapi_app=FastAPI(), + ) + 
manifest_configuration(abstract_config, instance_state) result = 0 @@ -77,11 +80,10 @@ def check_pids(): # configuration, or can be generated by external authentication sources. # In the latter case, they are manifest as directories in the incoming area # of a collection. - for collection, collection_info in instance_state.collections.items(): - - configured_labels = get_config_labels(instance_state, collection) + for collection, collection_info in abstract_config.collections.items(): + configured_labels = get_config_labels(abstract_config, collection) on_disk_labels = get_on_disk_labels( - store_path=instance_state.store_path, + store_path=store_path, abstract_config=abstract_config, collection=collection, ) @@ -89,6 +91,7 @@ def check_pids(): token_stores = [ create_token_store( + abstract_config, instance_state, collection, instance_state.store_path / collection_info.incoming / label @@ -102,16 +105,7 @@ def check_pids(): def main(): arguments = parser.parse_args() - - store_path = Path(arguments.store).absolute() - process_config( - store_path=store_path, - config_file=Path(arguments.config or (store_path / config_file_name)), - order_by=['pid'], - globals_dict=globals(), - ) - - result = check_pids() + result = check_pids(Path(arguments.store).absolute()) if result > 0: print(f'found {result} unresolvable pids', file=sys.stderr) return 1 -- 2.52.0 From 84561137e19ddb3964dfca948bcf1327cc0271a2 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 12 Jun 2026 12:11:46 +0200 Subject: [PATCH 61/64] add submission-tag tests --- dump_things_service/tests/fixtures.py | 11 +- dump_things_service/tests/test_config.py | 145 ++++------------------- 2 files changed, 30 insertions(+), 126 deletions(-) diff --git a/dump_things_service/tests/fixtures.py b/dump_things_service/tests/fixtures.py index e4305f2..21f3b87 100644 --- a/dump_things_service/tests/fixtures.py +++ b/dump_things_service/tests/fixtures.py @@ -12,7 +12,7 @@ from dump_things_service.abstract_config 
import ( GitAuditBackendConfig, SQLiteBackendConfig, TokenCollectionConfig, - TokenModes, hash_token_representation, + TokenModes, hash_token_representation, TagSpec, ) from dump_things_service.backends import StorageBackend from dump_things_service.backends.record_dir import RecordDirStore @@ -57,6 +57,11 @@ g_default_collections = [ for i in range(1, 8) ] +g_default_collections[6].submission_tags = TagSpec( + submitter_id_tag='abc:id', + submission_time_tag='abc:time', +) + g_default_collections.append( CollectionRequest( name=f'collection_8', @@ -66,6 +71,10 @@ g_default_collections.append( incoming=PurePosixPath(f'{incoming}/collection_8'), backend=SQLiteBackendConfig( type='sqlite', + ), + submission_tags=TagSpec( + submitter_id_tag='no_default_id_tag', + submission_time_tag='no_default_time_tag', ) ) ) diff --git a/dump_things_service/tests/test_config.py b/dump_things_service/tests/test_config.py index ef55e2d..3006aba 100644 --- a/dump_things_service/tests/test_config.py +++ b/dump_things_service/tests/test_config.py @@ -3,6 +3,7 @@ from pathlib import PurePosixPath import pytest import yaml +from fastapi_pagination import response from json_flattener import GlobalConfig from starlette.status import HTTP_406_NOT_ACCEPTABLE @@ -202,131 +203,25 @@ def test_missing_incoming_detection(fastapi_client_simple): assert response.status_code == HTTP_406_NOT_ACCEPTABLE -def xxx_test_submission_tags_handling(dump_stores_simple): - config_object = GlobalConfig( - **yaml.load( - """ -type: collections -version: 1 -collections: - collection_1: - default_token: basic_access - curated: curated/in_token_1 - incoming: contributions - submission_tags: - submitter_id_tag: no_default_id_tag - submission_time_tag: no_default_time_tag - collection_2: - default_token: basic_access - curated: curated/collection_2 - incoming: contributions -tokens: - basic_access: - user_id: anonymous - collections: - collection_1: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - 
collection_2: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - """, - Loader=yaml.SafeLoader, - ) +def test_submission_tags_handling(fastapi_client_simple): + test_client, _, admin_token = fastapi_client_simple + + response = test_client.get( + '/collections/collection_8', + headers={'x-dumpthings-token': admin_token}, ) + json_obj = response.json() + assert json_obj['submission_tags'] == { + 'submitter_id_tag': 'no_default_id_tag', + 'submission_time_tag': 'no_default_time_tag', + } - global_dict = {} - config = process_config_object(dump_stores_simple, config_object, [], global_dict) - # Check for specified tags in collection `collection_1` - assert config.collections['collection_1'].submission_tags.submission_time_tag == 'no_default_time_tag' - assert config.collections['collection_1'].submission_tags.submitter_id_tag == 'no_default_id_tag' - # Check for default tags in collection `collection_2` - assert config.collections['collection_2'].submission_tags.submission_time_tag == 'http://semanticscience.org/resource/SIO_001083' - assert config.collections['collection_2'].submission_tags.submitter_id_tag == 'http://purl.obolibrary.org/obo/NCIT_C54269' - - -def xxx_test_submission_tags_resolving(dump_stores_simple): - config_object = GlobalConfig( - **yaml.load( - """ -type: collections -version: 1 -collections: - collection_1: - default_token: basic_access - curated: curated/in_token_1 - incoming: contributions - submission_tags: - submitter_id_tag: abc:id - submission_time_tag: abc:time -tokens: - basic_access: - user_id: anonymous - collections: - collection_1: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - """, - Loader=yaml.SafeLoader, - ) + response = test_client.get( + '/collections/collection_1', + headers={'x-dumpthings-token': admin_token}, ) - - global_dict = {} - process_config_object(dump_stores_simple, config_object, [], global_dict) - - -def xxx_test_submission_tags_resolving_error(dump_stores_simple): - config_object = 
GlobalConfig( - **yaml.load( - """ -type: collections -version: 1 -collections: - collection_1: - default_token: basic_access - curated: curated/in_token_1 - incoming: contributions - submission_tags: - submitter_id_tag: non-existing:id - collection_2: - default_token: basic_access - curated: curated/in_token_1 - incoming: contributions - submission_tags: - submission_time_tag: non-existing:time - collection_3: - default_token: basic_access - curated: curated/in_token_1 - incoming: contributions - submission_tags: - submitter_id_tag: http://something/non-existing - collection_4: - default_token: basic_access - curated: curated/in_token_1 - incoming: contributions - submission_tags: - submission_time_tag: http://something/non-existing -tokens: - basic_access: - user_id: anonymous - collections: - collection_1: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - collection_2: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - collection_3: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - collection_4: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - """, - Loader=yaml.SafeLoader, - ) - ) - - global_dict = {} - with pytest.raises(ConfigError) as e: - process_config_object(dump_stores_simple, config_object, [], global_dict) + json_obj = response.json() + assert json_obj['submission_tags'] == { + 'submitter_id_tag': 'http://purl.obolibrary.org/obo/NCIT_C54269', + 'submission_time_tag': 'http://semanticscience.org/resource/SIO_001083', + } -- 2.52.0 From 0005ee77e175e3619920873e6cdca0206e30e103 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 12 Jun 2026 12:22:54 +0200 Subject: [PATCH 62/64] bump version to 6.0.0 --- dump_things_service/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dump_things_service/__about__.py b/dump_things_service/__about__.py index 72def07..98d739c 100644 --- a/dump_things_service/__about__.py +++ b/dump_things_service/__about__.py @@ -1 +1 @@ 
-__version__ = '6.0.0b3' +__version__ = '6.0.0' -- 2.52.0 From 0b61928067cc544916cd3daf645eba9cdbb91e37 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 12 Jun 2026 12:28:39 +0200 Subject: [PATCH 63/64] clean up test code --- dump_things_service/tests/test_basic.py | 1 - dump_things_service/tests/test_config.py | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/dump_things_service/tests/test_basic.py b/dump_things_service/tests/test_basic.py index b884334..140db21 100644 --- a/dump_things_service/tests/test_basic.py +++ b/dump_things_service/tests/test_basic.py @@ -1,4 +1,3 @@ -from pathlib import Path import pytest # F401 diff --git a/dump_things_service/tests/test_config.py b/dump_things_service/tests/test_config.py index 3006aba..5b72fad 100644 --- a/dump_things_service/tests/test_config.py +++ b/dump_things_service/tests/test_config.py @@ -2,14 +2,11 @@ import hashlib from pathlib import PurePosixPath import pytest -import yaml -from fastapi_pagination import response -from json_flattener import GlobalConfig -from starlette.status import HTTP_406_NOT_ACCEPTABLE from dump_things_service import ( HTTP_200_OK, HTTP_201_CREATED, + HTTP_406_NOT_ACCEPTABLE, HTTP_409_CONFLICT, ) from dump_things_service.abstract_config import ( -- 2.52.0 From 15e2f67631f5ccac9fcf077a1ffa403423befd99 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 12 Jun 2026 12:30:16 +0200 Subject: [PATCH 64/64] add test for unicode IRI handling --- dump_things_service/tests/test_unicode.py | 39 +++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 dump_things_service/tests/test_unicode.py diff --git a/dump_things_service/tests/test_unicode.py b/dump_things_service/tests/test_unicode.py new file mode 100644 index 0000000..15cf7cc --- /dev/null +++ b/dump_things_service/tests/test_unicode.py @@ -0,0 +1,39 @@ +from pathlib import Path + +from .. 
import HTTP_200_OK + + +# Path to a local simple test schema +schema_file = Path(__file__).parent / 'testschema.yaml' + +extra_record = { + 'schema_type': 'abc:Person', + 'pid': 'abc:aaaa', + 'given_name': 'DavidÖÄÜ', +} +delete_record = { + 'schema_type': 'abc:Person', + 'pid': 'abc:delete-me', + 'given_name': 'Detlef', +} +unicode_name = 'AlienÖÄÜ-ß👽' +unicode_bytes = unicode_name.encode('utf-8') +unicode_record = { + 'schema_type': 'abc:Person', + 'pid': 'abc:unicode-test', + 'given_name': unicode_name, +} + + +def test_unicode_iri(fastapi_client_simple): + test_client, _, _ = fastapi_client_simple + + response = test_client.post( + '/collection_1/record/Person', + headers={'x-dumpthings-token': 'token-1'}, + json = { + 'pid': 'https://en.wikipedia.org/wiki/Universita_degli_Studi_eCampus', + 'given_name': 'Università degli Studi eCampus (Italy)', + } + ) + assert response.status_code == HTTP_200_OK -- 2.52.0