diff --git a/CHANGELOG.md b/CHANGELOG.md index c60cc75..6cf4205 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,55 @@ +# 6.0.0b2 (26.4.2026) + +## New feature + +- Support `-c/--config`-option. If this option is provided and if the service + has an empty persistent configuration store, the configuration store and the + service configuration will be initialized from the configuration file content. + If the configuration file is `version: 1`, it will be converted to the updated + configuration format. + + +# 6.0.0b1 (22.5.2026) + +## New features + +- Allow dynamic creation and deletion of collections, tokens and + administration-tokens via the endpoints: `/collections`, `/tokens`, and + `/admin_tokens`. + +- The configuration is persisted in the store of the service. An audit log + of all configuration changes is kept. + +- A new token type is introduced: admin-tokens. Admin-tokens are required to + perform the creation- and deletion-operations described above. + +- The tool `dump-things-load-config` was added to manifest a configuration that + is defined in a configuration file on a running server. + +- The tool `dump-things-hash-token` was added to calculate the hashed + representation of a token. This representation can be used to create a + "hashed"-token, i.e., a token that is stored in hashed format. So the plain + token is never stored in the dump-things server configuration. + +## Breaking changes + +- Configuration structure has changed: + 1. `schema` is now an attribute of a collection. It is no longer an + attribute of a sqlite-backend configuration. + 2. The keys in the top-level mapping `tokens` are now token names and no + longer token representations. Token representations are now defined in + the values of the `tokens`-mapping. + 3. The top-level mapping `admin_tokens` was added. + +- Configuration files are no longer read when the service is started. Instead + the service reads its configuration from the store, if it is present. 
The tool + (`dump-things-load-config`) can read an existing configuration + file and manifest the described configuration on a running dump-things server. + It supports pre version 6 config files and converts them to the new + configuration structure (except for hashed tokens, which cannot be + automatically converted). + + # 5.6.1 (2026-03-20) ## Bugfixes @@ -47,7 +99,7 @@ supported by the collections, i.e., classes for which storage- and validation-endpoints exist. -- Add `/maintenance`-endpoint to temporarilly lock collections for non-curator +- Add `/maintenance`-endpoint to temporarily lock collections for non-curator access. @@ -569,7 +621,7 @@ ## New features -- Factor out a Schema Type Layer (STL) from the `record_dir` backend." The STL +- Factor out a Schema Type Layer (STL) from the `record_dir` backend. The STL can be used with every backend. It removes top-level `schema_type`-entries from records before they are stored. It also adds the correct top-level `schema_type`-entry to records that are read from a store. This functionality diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..fc7de03 --- /dev/null +++ b/LICENSE @@ -0,0 +1,24 @@ +things-graph-renderer, including all examples, code snippets and attached +documentation is covered by the MIT license. + + The MIT License + + Copyright (c) 2026- Michael Hanke + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. diff --git a/README.md b/README.md index f126c06..bc975c5 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,9 @@ The following command line parameters are supported: - `--port `: The port on which the service should accept connections (default: `8000`). -- `-c/--config `: provide a path to the configuration file. The configuration file in `/.dumpthings.yaml` will be ignored, if it exists at all. +- `-c/--config `: provide a path to a configuration file (configuration file version 1). + If no dynamically managed configuration is found in the data store, the dynamically managed configuration and the service state will be initialized with the content of the configuration file. + This allows an easy transition from dump-things-servers of version 5 and lower to version 6. - `--origins `: add a CORS origin hosts (repeat to add multiple CORS origin URLs).` @@ -55,6 +57,12 @@ The following command line parameters are supported: - `--log-level`: set the log level for the service, allowed values are `ERROR`, `WARNING`, `INFO`, `DEBUG`. The default-level is `WARNING`. +- `--admin-token-hash `: set an administrator token hash. + The plaintext token that corresponds to this hash can be used to create and delete collections, tokens, and admin tokens. + This is useful to configure the service if no admin token has been created yet. + **NOTE**: an admin token in plaintext is read from the environment variable `DTS_ADMIN_TOKEN` if it is set and this option is not provided. 
+ + ```bash dump-things-service /data-storage/store --host 127.0.0.1 --port 8000 ``` @@ -63,16 +71,18 @@ The above command runs the service on the network location `127.0.0.1:8000` and ### Configuration file -The service is configured via a configuration file that defines collections, paths for incoming and curated data for each collection, as well as token properties. +The service provides the tool `dump-things-load-config` which can load configurations from a file and manifest those configurations on a running service via the administration endpoints. + +A configuration defines collections, paths for incoming and curated data for each collection, as well as token properties. Token properties include a submitter identification and for each collection an incoming zone specifier, permissions for reading and writing of the incoming zone and permission for reading the curated data of the collection. -A "formal" definition of the configuration file is provided by the class `GlobalConfig` in the file `dumpthings-server/config.py`. +A "formal" definition of the configuration file is provided by the class `Configuration` in the file `dumpthings-server/abstract_config.py`. -Configurations are read in YAML format. The following is an example configuration file that illustrates all options: +Configurations are read in YAML format. The following is an example configuration file (version 6 and higher) that illustrates all options: ```yaml type: collections # has to be "collections" -version: 1 # has to be 1 +version: 2 # has to be 2 # All collections are listed in "collections" collections: @@ -86,6 +96,9 @@ collections: # client provided token. default_token: no_access + # The schema that is used by the collection + schema: https://concepts.inm7.de/s/flat-data/unreleased.yaml + # The path to the curated data of the collection. This path should contain the # ".dumpthings.yaml"-configuration for collections that is described # here: . 
@@ -134,13 +147,24 @@ collections: # All tokens are listed in "tokens" tokens: - # The following entry defines the token "basic_access". This token allows read-only - # access to the two collections: "rooms_and_buildings" and "fixed_data". + # The following entry defines the token "basic_access". basic_access: + # The representation of the token, this is the value that the user has to + # provide in the `x-dumpthings-token`-header to authenticate with this token. + representation: anonymous + + # If hashed is `True`, the representation must be a 64-hexdigit number, + # representing the hash of the plain token. Setting `hashed` to `True` ensures + # that the plain-text token is not stored in the configuration store of the + # running server. + # + # The tool `dump-things-hash-token` can be used to calculate the correct hash. + hashed: False + # The value of "user_id" will be added as an annotation to each record that is # uploaded with this token. - user_id: anonymous + user_id: anonymous_user # The collections for which the token holds rights are defined in "collections" collections: @@ -155,8 +179,9 @@ tokens: # A token and collection-specific label, that defines "zones" in which incoming # records are stored. Multiple tokens can share the same zone, for example if # many clients with individual tokens work together to build a collection. - # (Since this token does not allow write access, "incoming_label" is ignored and - # left empty here (TODO: it should not be required in this case)). + # (Since this token does not allow write access, "incoming_label" is ignored. It + # is set to an empty string here in order to document it, but it could as well + # be omitted) incoming_label: '' # The rights that "basic_access" carries for the collection "fixed_data" @@ -168,6 +193,9 @@ tokens: # The following entry defines the token "no_access". This token does not allow # any access and is used as a default token for the collection "personal_records". 
no_access: + + representation: no_access + user_id: nobody collections: @@ -175,9 +203,10 @@ tokens: mode: NOTHING incoming_label: '' - # The following entry defines the token "admin". It gives full access rights to - # the collection "personal_records". - admin: + # The following entry defines a token with the name "admin_token" and the plain + # representation: "admin". It gives full access rights to the collection "personal_records". + admin_token: + representation: admin user_id: Admin collections: personal_records: @@ -187,6 +216,7 @@ tokens: # The following entry defines the token "contributor_bob". It gives full access # to "rooms_and_buildings" for a user with the id "Bob". contributor_bob: + representation: bob user_id: Bob collections: rooms_and_buildings: @@ -198,6 +228,7 @@ tokens: # same incoming-zone, i.e. "new_rooms_and_buildings". That means they can read # incoming records that the other one posted. contributor_alice: + representation: alice user_id: Alice collections: rooms_and_buildings: @@ -205,14 +236,17 @@ tokens: incoming_label: new_rooms_and_buildings # The following entry defines a hashed token because the key `hashed` is set - # to `True`. A hashed token has the structure - # `-`. It will match an incoming token if the incoming token has - # the structure `-` and if sha256(``) equals ``. - # In this example, if the client presents the token `bob-hello`, he will be - # granted access because `sha256('hello')` equals - # `2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824` - bob-2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824: + # to `True`. A hashed token representation is the hex-digit representation of + # the sha-256 checksum of the plain token. + # In this example, if the client presents the token `hello`, he will be + # granted access because `dump-things-hash-token 'hello'` yields + # `90b1b286043f1b7612e423c74608f5ea2f676340507f0b67219b20d09fc4777b`, i.e. 
+ # sha256('hello') == 90b1b286043f1b7612e423c74608f5ea2f676340507f0b67219b20d09fc4777b + # is true. + hashed_token_1: + representation: 90b1b286043f1b7612e423c74608f5ea2f676340507f0b67219b20d09fc4777b hashed: True + user_id: Walter collections: rooms_and_buildings: mode: WRITE_COLLECTION @@ -250,6 +284,7 @@ collections: collection_with_default_record_dir+stl_backend: # This is a collection with the default backend, i.e. `record_dir+stl` and # the default authentication, i.e. config-based authentication. + schema: https://concepts.inm7.de/s/flat-data/unreleased.yaml default_token: anon_read curated: collection_1/curated @@ -268,6 +303,7 @@ collections: # example by the forgejo-instance at `https://forgejo.example.com`. # If there is more than one authentication source, they will be tried # in the order they are defined in the config file. + schema: https://concepts.inm7.de/s/flat-data/unreleased.yaml default_token: anon_read # We still need a default token curated: collection_2/curated @@ -313,6 +349,7 @@ collections: # permissions, user-id, and incoming from the config file. collection_with_explicit_record_dir+stl_backend: + schema: https://concepts.inm7.de/s/flat-data/unreleased.yaml default_token: anon_read curated: collection_3/curated backend: @@ -322,6 +359,7 @@ collections: type: record_dir+stl collection_with_sqlite_backend: + schema: https://concepts.inm7.de/s/flat-data/unreleased.yaml default_token: anon_read curated: collection_4/curated backend: @@ -330,9 +368,18 @@ collections: # that holds the URL of the schema that should # be used in this backend. 
type: sqlite - schema: https://concepts.inm7.de/s/flat-data/unreleased.yaml ``` +#### Reserved names + +The following collection names are reserved and must not be used: + +- collections +- tokens +- admin_tokens +- __dump_things__ + + #### Authentication and authorization To authenticate and authorize a user based on tokens, dumpthing-service uses @@ -683,6 +730,59 @@ A `CURATOR`-token required to access these endpoints. Details about the curation endpoints can be found in [this issue](https://codeberg.org/datalink/dump-things-server/issues/118). +#### Administration endpoints + +Operations on the endpoints described in this section require an administrator token. +If desired, use `dump-things-load-config` to read the configuration from a file and +generate respective POST-requests. `dump-things-load-config` can also be used to +generate a configuration from an old, i.e. dump-things version < 6, configuration file. + +##### Collections + +- `POST /collections`: create a new collection from the posted configuration object. + For a specification of the configuration object see the object `CollectionRequest` in the file `dump_things_service/collection_endpoints.py` + (Use `dump-things-load-config` to read the configuration from a file and generate respective POST-requests) + +- `GET /collections`: get information about the currently existing collections. + +- `GET /collections/`: get information about the collection with name ``. + +- `DELETE /collections/`: delete the collection with the given name. + Note: deleting a collection does not delete any records or any storage dir, it just removes the + collection from the internal state of the service. Recreating it (via `POST /collections`) will make + all data reachable again through the Web-API. + + +##### Tokens + +- `POST /tokens`: create a new token from the posted configuration object. 
+ For a specification of the configuration object see the object `TokenRequest` in the file `dump_things_service/token_endpoints.py` + NOTE: Before a token for configuration can be generated, the collection must exist. + +- `GET /tokens`: get information about the currently existing tokens. + +- `GET /tokens/`: get information about the token with name ``. + +- `DELETE /tokens/`: delete the token with the given name. + +- this endpoint (ending on `.../p/`) provides the same functionality as the endpoint `GET //records/` (without `.../p/`) but supports result pagination. In addition to the query parameters `format` and `matching`, it supports the query parameters `page` and `size`. + + +##### Admin Tokens + +- `POST /admin_tokens`: create a new admin token from the posted configuration object. + For a specification of the configuration object see the object `AdminTokenRequest` in the file `dump_things_service/token_endpoints.py` + Note that admin tokens are always stored as hashed values. + Therefore the representation in the request should be `sha256()`. + +- `GET /admin_tokens`: get information about the currently existing admin tokens. + +- `GET /admin_tokens/`: get information about the admin token with name ``. + +- `DELETE /admin_tokens/`: delete the admin token with the given name. + + + ### Tips & Tricks @@ -702,15 +802,16 @@ collections: incoming: datamgt tokens: - anon_read: + anon_read: # The name of the token (serves also as representation if no representation is defined) user_id: anonymous collections: - datamgt: - mode: READ_CURATED - incoming_label: "" + datamgt: # per collection token configuration; contains: + mode: READ_CURATED # - token mode + incoming_label: "" # - the label for the incoming area for this token and this collection, i.e., collection: "datamgt". 
trusted-submitter-token: user_id: trusted_submitter + representation: 00112233445566778899aabbccdd # The representation that the client has to send in an `x-dumpthings-token`-header (if not given, the token name will be the representation) collections: datamgt: mode: WRITE_COLLECTION @@ -763,6 +864,36 @@ If any backend is a `record_dir+stl` backend, a schema has to be supplied via th - `dump-things-create-merged-schema`: this command creates a new schema that statically contains all schemas that the original schema imports. The new schema is fully self-contained and does not reference any other schemas. +- `dump-things-hash-token`: this command will generate a hash from a plain-text token that can be used with the `--admin-token-hash` option. + (one could also use the shell command `sha256sum` to generate the hash, but using `dump-things-hash-token` will ensure that the right hash algorithm is used) + + +### Migrate to version 6 + +Migration to version 6 is simple. It involves the following steps: + + +1. Start the version 6 service on the store that you used and provide an +administrator token +``` +> dump-things-service --admin-token admin-1 +``` + +2. Use `dump-things-load-config` to load the old configuration from the +configuration file that you used in the old version. + +``` +> export DTS_ADMIN_TOKEN=admin-1 +> dump-things-load-config --send-to "https://" --old-config /.dumpthings.yaml +``` + +At this point the service should be running and be configured exactly as +before. The configuration is persisted and will be established next time the +service starts. 
+ + + + ### If things go wrong #### Delete a record manually diff --git a/dump_things_service/__about__.py b/dump_things_service/__about__.py index 2c06c79..98d739c 100644 --- a/dump_things_service/__about__.py +++ b/dump_things_service/__about__.py @@ -1 +1 @@ -__version__ = '5.6.1' +__version__ = '6.0.0' diff --git a/dump_things_service/__init__.py b/dump_things_service/__init__.py index 5fe7ad2..9bb65a0 100644 --- a/dump_things_service/__init__.py +++ b/dump_things_service/__init__.py @@ -6,11 +6,14 @@ from typing import ( from starlette.status import ( HTTP_200_OK, + HTTP_201_CREATED, HTTP_300_MULTIPLE_CHOICES, HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND, + HTTP_406_NOT_ACCEPTABLE, + HTTP_409_CONFLICT, HTTP_500_INTERNAL_SERVER_ERROR, HTTP_503_SERVICE_UNAVAILABLE, ) @@ -24,11 +27,14 @@ from starlette.status import ( __all__ = [ 'Format', 'HTTP_200_OK', + 'HTTP_201_CREATED', 'HTTP_300_MULTIPLE_CHOICES', 'HTTP_400_BAD_REQUEST', 'HTTP_401_UNAUTHORIZED', 'HTTP_403_FORBIDDEN', 'HTTP_404_NOT_FOUND', + 'HTTP_406_NOT_ACCEPTABLE', + 'HTTP_409_CONFLICT', 'HTTP_413_CONTENT_TOO_LARGE', 'HTTP_422_UNPROCESSABLE_CONTENT', 'HTTP_500_INTERNAL_SERVER_ERROR', @@ -36,6 +42,7 @@ __all__ = [ 'JSON', 'YAML', 'config_file_name', + 'reserved_collection_names', ] @@ -48,3 +55,13 @@ JSON = Union[dict[str, Any], list[Any], str, int, float, None] YAML = JSON config_file_name = '.dumpthings.yaml' + +dump_things_private_collection_name = '__dump_things__' + + +reserved_collection_names = ( + 'collections', + 'tokens', + 'admin_tokens', + dump_things_private_collection_name, +) diff --git a/dump_things_service/abstract_config.py b/dump_things_service/abstract_config.py new file mode 100644 index 0000000..ca4dc36 --- /dev/null +++ b/dump_things_service/abstract_config.py @@ -0,0 +1,497 @@ +import enum +import hashlib +import logging +from functools import partial +from pathlib import ( + Path, + PurePosixPath, +) +from typing import ( + Callable, + 
Iterable, + Literal, + cast, +) + +from fastapi import HTTPException +from pydantic import ( + BaseModel, + ConfigDict, + Field, ValidationError, +) +from yaml.scanner import ScannerError + +from dump_things_service import ( + HTTP_404_NOT_FOUND, + dump_things_private_collection_name, +) +from dump_things_service.audit.gitaudit import GitAuditBackend +from dump_things_service.backends.record_dir import ( + _RecordDirStore, + RecordDirStore, +) +from dump_things_service.exceptions import ConfigError + + +logger = logging.getLogger('dump_things_service') + +g_abstract_configuration = None + +dump_things_config_iri = 'dump_things:config' +dump_things_private_path = Path(dump_things_private_collection_name) +config_backend_path = dump_things_private_path / 'config_store' +config_audit_path = dump_things_private_path / 'config_audit' +config_backend = None +config_audit = None + + +class StrictModel(BaseModel): + model_config = ConfigDict( + extra='forbid', + use_enum_values=True, + ) + + +class ConfigAuthSpec(BaseModel): + type: Literal['config'] = 'config' + + +class ForgejoAuthSpec(BaseModel): + type: Literal['forgejo'] + url: str + organization: str + team: str + label_type: Literal['team', 'user'] + instance_id: str | None = None + repository: str | None = None + + +class TagSpec(BaseModel): + submitter_id_tag: str = 'http://purl.obolibrary.org/obo/NCIT_C54269' + submission_time_tag: str = 'http://semanticscience.org/resource/SIO_001083' + + +class MappingMethod(enum.Enum): + digest_md5 = 'digest-md5' + digest_md5_p3 = 'digest-md5-p3' + digest_md5_p3_p3 = 'digest-md5-p3-p3' + digest_sha1 = 'digest-sha1' + digest_sha1_p3 = 'digest-sha1-p3' + digest_sha1_p3_p3 = 'digest-sha1-p3-p3' + after_last_colon = 'after-last-colon' + + +class RecordDirBackendConfig(StrictModel): + model_config = ConfigDict(use_enum_values=True) + type: Literal['record_dir', 'record_dir+stl'] + mapping_method: str = MappingMethod.digest_md5.value + + +class SQLiteBackendConfig(StrictModel): + 
type: Literal['sqlite', 'sqlite+stl'] + + +class GitAuditBackendConfig(StrictModel): + type: Literal['gitaudit'] + path: Path + auto_flush_timeout: int = 60 + + +class CollectionConfig(BaseModel): + model_config = ConfigDict(extra='forbid', use_enum_values=True) + default_token: str + curated: PurePosixPath + schema_location: str = Field(alias='schema') + incoming: PurePosixPath | None = None + backend: RecordDirBackendConfig | SQLiteBackendConfig = RecordDirBackendConfig(type='record_dir+stl') + auth_sources: list[ForgejoAuthSpec | ConfigAuthSpec] = [ConfigAuthSpec()] + audit_backends: list[GitAuditBackendConfig] = [] + submission_tags: TagSpec = TagSpec() + use_classes: list[str] = [] + ignore_classes: list[str] = [] + + +class RecordDirConfigFileContent(BaseModel): + model_config = ConfigDict(extra='forbid') + type: Literal['records'] + version: Literal[1] + schema_location: str = Field(alias='schema') + format: Literal['yaml'] + idfx: MappingMethod + + +class TokenModes(enum.Enum): + READ_CURATED = 'READ_CURATED' + READ_COLLECTION = 'READ_COLLECTION' + WRITE_COLLECTION = 'WRITE_COLLECTION' + READ_SUBMISSIONS = 'READ_SUBMISSIONS' + WRITE_SUBMISSIONS = 'WRITE_SUBMISSIONS' + SUBMIT = 'SUBMIT' + SUBMIT_ONLY = 'SUBMIT_ONLY' + NOTHING = 'NOTHING' + CURATOR = 'CURATOR' + + +class TokenPermission(BaseModel): + curated_read: bool = False + incoming_read: bool = False + incoming_write: bool = False + curated_write: bool = False + zones_access: bool = False + + +class TokenCollectionConfig(StrictModel): + model_config = ConfigDict(extra='forbid', use_enum_values=True) + mode: TokenModes + incoming_label: str = '' + + +class TokenConfig(StrictModel): + user_id: str + collections: dict[str, TokenCollectionConfig] + hashed: bool = False + representation: str = '' + + +class AdminTokenConfig(StrictModel): + representation: str + + +class Configuration(StrictModel): + collections: dict[str, CollectionConfig] = {} + tokens: dict[str, TokenConfig] = {} + admin_tokens: dict[str, 
AdminTokenConfig] = {} + pid: str = dump_things_config_iri + + +mode_mapping = { + TokenModes.READ_CURATED: TokenPermission(curated_read=True), + TokenModes.READ_COLLECTION: TokenPermission( + curated_read=True, + incoming_read=True, + ), + TokenModes.WRITE_COLLECTION: TokenPermission( + curated_read=True, + incoming_read=True, + incoming_write=True, + ), + TokenModes.READ_SUBMISSIONS: TokenPermission(incoming_read=True), + TokenModes.WRITE_SUBMISSIONS: TokenPermission( + incoming_read=True, + incoming_write=True, + ), + TokenModes.SUBMIT: TokenPermission(curated_read=True, incoming_write=True), + TokenModes.SUBMIT_ONLY: TokenPermission(incoming_write=True), + TokenModes.NOTHING: TokenPermission(), + TokenModes.CURATOR: TokenPermission( + curated_read=True, + incoming_read=True, + incoming_write=True, + curated_write=True, + zones_access=True, + ), +} + + +def get_token_permissions(mode: str) -> TokenPermission: + return mode_mapping[TokenModes(mode)] + + +def get_config_backends( + store_path: Path, +) -> tuple[_RecordDirStore, GitAuditBackend]: + global config_audit + global config_backend + + config_path = store_path / config_backend_path + if not config_path.exists(): + config_path.mkdir(parents=True) + + if config_backend is None: + config_backend = RecordDirStore( + config_path, + mapping_functions[MappingMethod.digest_md5], + 'yaml' + ) + + audit_path = store_path / config_audit_path + if not audit_path.exists(): + audit_path.mkdir(parents=True) + + if config_audit is None: + config_audit = GitAuditBackend(audit_path) + return config_backend, config_audit + + +def read_config( + store_path: Path, + force_reload: bool = False, +) -> Configuration: + global g_abstract_configuration + + if not g_abstract_configuration or force_reload: + config_backend, _ = get_config_backends(store_path) + try: + record_info = config_backend.get_record_by_iri(dump_things_config_iri) + except ScannerError as sce: + msg = f'Configuration at {config_backend.root} not readable: 
{sce}' + raise ConfigError(msg) from sce + try: + g_abstract_configuration = ( + Configuration(**(record_info.json_object)) + if record_info + else Configuration() + ) + except ValidationError as ve: + msg = f'Faulty configuration at {config_backend.root}: {ve}' + raise ConfigError(msg) from ve + return g_abstract_configuration + + +def get_config() -> Configuration: + global g_abstract_configuration + + if not g_abstract_configuration: + msg = 'Configuration not yet loaded' + raise RuntimeError(msg) + return cast(Configuration, g_abstract_configuration) + + +def store_config( + store_path, + config: Configuration, +): + global g_abstract_configuration + + config_backend, audit_backend = get_config_backends(store_path) + json_object = config.model_dump(mode='json', exclude_none=True, by_alias=True) + json_object['pid'] = dump_things_config_iri + config_backend.add_record( + iri=dump_things_config_iri, + class_name='DumpThingsConfig', + json_object=json_object + ) + audit_backend.add_record( + record=json_object, + committer_id='__dump_things_server__', + ) + g_abstract_configuration = config + + +def tokens_for_collection( + config: Configuration, + collection: str, +) -> Iterable[TokenConfig]: + yield from ( + token + for token_name, token in config.tokens.items() + if collection in token.collections + ) + + +def check_collection( + abstract_config: Configuration, + collection: str, +): + if collection not in abstract_config.collections: + raise HTTPException( + status_code=HTTP_404_NOT_FOUND, + detail=f"No such collection: '{collection}'.", + ) + + +def check_label( + store_path: Path, + abstract_config: Configuration, + collection: str, + label: str, +): + from dump_things_service.utils import get_on_disk_labels + + """Check that a label exists in a collection configuration or on disk""" + if ( + label not in get_config_labels(abstract_config, collection) + and label not in get_on_disk_labels(store_path, abstract_config, collection) + ): + raise HTTPException( + 
status_code=HTTP_404_NOT_FOUND, + detail=f"No incoming label: '{label}' in collection: '{collection}'.", + ) + + +def get_config_labels( + abstract_config: Configuration, + collection: str, +) -> set[str]: + check_collection(abstract_config, collection) + return { + token.collections[collection].incoming_label + for token in tokens_for_collection(abstract_config, collection) + if token.collections[collection].incoming_label + } + + +def get_default_token_name( + abstract_config: Configuration, + collection: str +) -> str: + check_collection(abstract_config, collection) + return abstract_config.collections[collection].default_token + + +def get_token_info_by_representation( + abstract_config: Configuration, + token_representation: str, +) -> tuple[str, TokenConfig] | None: + """Get the name of the token given in `token_representation`""" + hashed_representation = hash_token_representation(token_representation) + for token_name, token_config in abstract_config.tokens.items(): + if token_config.hashed: + compare_representation = hashed_representation + else: + compare_representation = token_representation + if compare_representation == token_config.representation: + return token_name, token_config + return None + + +def hash_token_representation( + token_representation: str, +) -> str: + return hashlib.sha256(token_representation.encode()).hexdigest() + + +def get_token_config_by_name( + abstract_config: Configuration, + token_name: str, +) -> TokenConfig | None: + return abstract_config.tokens.get(token_name) + + +def get_token_infos_for_collection( + abstract_config: Configuration, + collection_name: str, +) -> Iterable[tuple[str, TokenConfig, TokenCollectionConfig]]: + + yield from { + (token_name, token_config, token_collection_config) + for token_name, token_config in abstract_config.tokens.items() + for token_collection_config in token_config.collections.get(collection_name) + if token_config is not None + } + + +def 
get_token_config_for_representation_and_collection( + abstract_config: Configuration, + collection_name: str, + token_representation: str, +) -> tuple[str, TokenConfig, TokenCollectionConfig] | None: + + token_info = get_token_info_by_representation( + abstract_config=abstract_config, + token_representation=token_representation, + ) + if token_info: + token_name, token_config = token_info + if collection_name in token_config.collections: + return token_name, token_config, token_config.collections[collection_name] + + return None + + +def get_collection_config_by_name( + abstract_config: Configuration, + collection_name: str, +) -> CollectionConfig: + collection_config = abstract_config.collections.get(collection_name) + if not collection_config: + raise HTTPException( + status_code=HTTP_404_NOT_FOUND, + detail=f"No such collection: '{collection_name}'", + ) + return collection_config + + +def get_default_token_config( + abstract_config: Configuration, + collection: str, +) -> TokenConfig | None: + + default_token_name = get_collection_config_by_name( + abstract_config, + collection, + ).default_token + + return get_token_config_by_name(abstract_config, default_token_name) + + +def get_default_token_representation( + abstract_config: Configuration, + collection: str, +) -> str | None: + default_token_config = get_default_token_config( + abstract_config, + collection, + ) + return default_token_config.representation if default_token_config else None + + +def get_mapping_function(record_dir_backend_config: RecordDirBackendConfig): + return mapping_functions[MappingMethod(record_dir_backend_config.mapping_method)] + + +def get_backend_and_extension(backend_type: str) -> tuple[str, str]: + elements = backend_type.split('+') + return (elements[0], elements[1]) if len(elements) > 1 else (elements[0], '') + + +def get_hex_digest(hasher: Callable, data: str) -> str: + hash_context = hasher(data.encode()) + return hash_context.hexdigest() + + +def mapping_digest_p3( + 
def mapping_after_last_colon(pid: str, suffix: str) -> Path:
    """Map a PID to a file name built from the part after its last colon

    :param pid: the PID that should be mapped
    :param suffix: file name suffix (without leading dot)
    :return: a relative path consisting of the escaped local part of `pid`,
        a dot, and `suffix`
    """
    local_part = pid.rpartition(':')[2]
    # `_` is the escape character, so it is doubled; slashes and dots in
    # the local part are replaced by `_s` and `_d`, respectively.
    escape_table = str.maketrans({'_': '__', '/': '_s', '.': '_d'})
    return Path(f'{local_part.translate(escape_table)}.{suffix}')
def authenticate_admin(
    instance_state: InstanceState,
    abstract_config: Configuration,
    api_key: str,
):
    """Check that `api_key` is a valid administration token

    The key is hashed and the hash is compared with the bootstrap token of
    the instance and with the representations of all admin tokens in the
    configuration.

    :param instance_state: provides the `bootstrap_token` of the instance
    :param abstract_config: provides the `admin_tokens`-mapping
    :param api_key: the plain token that was sent by the client
    :return: None, if the key matches the bootstrap token or an admin token
    :raises HTTPException: with status 401, if `api_key` is empty or does
        not match any known admin token
    """
    if not api_key:
        raise HTTPException(
            status_code=HTTP_401_UNAUTHORIZED,
            detail='Invalid admin token',
        )

    key_digest = hash_token_representation(api_key)

    # The bootstrap token takes precedence over configured admin tokens
    if key_digest == instance_state.bootstrap_token:
        logger.info('authenticate_admin: using bootstrap token')
        return

    for admin_token_name, admin_token in abstract_config.admin_tokens.items():
        if admin_token.representation != key_digest:
            continue
        logger.info(
            "authenticate_admin: using token '%s'",
            admin_token_name,
        )
        return

    raise HTTPException(
        status_code=HTTP_401_UNAUTHORIZED,
        detail='Invalid admin token',
    )
class ConfigAuthenticationSource(AuthenticationSource):
    """Authentication source that is backed by the service configuration"""

    def __init__(
        self,
        abstract_configuration: Configuration,
        collection_name: str,
    ):
        """
        :param abstract_configuration: configuration in which tokens are
            looked up
        :param collection_name: name of the collection for which tokens
            are authenticated
        """
        self.collection_name = collection_name
        self.abstract_configuration = abstract_configuration

    def authenticate(
        self,
        token_representation: str,
    ) -> AuthenticationInfo:
        """Resolve a token representation for the collection of this source

        :param token_representation: the token as provided by the client
        :return: permissions, user id, and incoming label of the token
        :raises InvalidTokenError: if the token is not known or not
            configured for the collection
        """
        lookup_result = get_token_config_for_representation_and_collection(
            self.abstract_configuration,
            self.collection_name,
            token_representation,
        )
        if lookup_result:
            # The first element (the token name) is not needed here
            _, token_config, per_collection_config = lookup_result
            permissions = get_token_permissions(per_collection_config.mode)
            return AuthenticationInfo(
                token_permission=permissions,
                user_id=token_config.user_id,
                incoming_label=per_collection_config.incoming_label,
            )

        msg = f'Token not valid for collection `{self.collection_name}`'
        raise InvalidTokenError(msg)
the provided parameters. @@ -105,23 +105,23 @@ class BackendResultList(LazyList): class StorageBackend(metaclass=ABCMeta): def __init__( - self, - order_by: Iterable[str] | None = None, + self, + order_by: Iterable[str] | None = None, ): self.order_by = order_by or ['pid'] @abstractmethod def get_uri( - self + self ) -> str: raise NotImplementedError @abstractmethod def add_record( - self, - iri: str, - class_name: str, - json_object: dict, + self, + iri: str, + class_name: str, + json_object: dict, ): raise NotImplementedError @@ -139,37 +139,37 @@ class StorageBackend(metaclass=ABCMeta): @abstractmethod def remove_record( - self, - iri: str, + self, + iri: str, ) -> bool: raise NotImplementedError @abstractmethod def get_record_by_iri( - self, - iri: str, + self, + iri: str, ) -> RecordInfo | None: raise NotImplementedError @abstractmethod def get_records_of_classes( - self, - class_names: Iterable[str], - pattern: str | None = None, + self, + class_names: Iterable[str], + pattern: str | None = None, ) -> BackendResultList: raise NotImplementedError @abstractmethod def get_all_records( - self, - pattern: str | None = None, + self, + pattern: str | None = None, ) -> BackendResultList: raise NotImplementedError def create_sort_key( - json_object: dict[str, Any], - order_by: Iterable[str], + json_object: dict[str, Any], + order_by: Iterable[str], ) -> str: return '-'.join( str(json_object.get(key)) if json_object.get(key) is not None else chr(0x10FFFF) diff --git a/dump_things_service/backends/mongo.py b/dump_things_service/backends/mongo.py deleted file mode 100644 index e69de29..0000000 diff --git a/dump_things_service/backends/record_dir.py b/dump_things_service/backends/record_dir.py index 320ec59..16e0258 100644 --- a/dump_things_service/backends/record_dir.py +++ b/dump_things_service/backends/record_dir.py @@ -27,10 +27,10 @@ from dump_things_service.backends.record_dir_index import RecordDirIndex if TYPE_CHECKING: from collections.abc import Iterable - from 
types import ModuleType __all__ = [ + '_RecordDirStore', 'RecordDirStore', ] @@ -45,12 +45,12 @@ class RecordDirResultList(BackendResultList): """ def generate_result( - self, - _: int, - iri: str, - class_name: str, - sort_key: str, - path: Path, + self, + _: int, + iri: str, + class_name: str, + sort_key: str, + path: Path, ) -> RecordInfo: """ Generate a JSON representation of the record at index `index`. @@ -76,11 +76,11 @@ class _RecordDirStore(StorageBackend): """Store records in a directory structure""" def __init__( - self, - root: Path, - pid_mapping_function: Callable, - suffix: str, - order_by: Iterable[str] | None = None, + self, + root: Path, + pid_mapping_function: Callable, + suffix: str, + order_by: Iterable[str] | None = None, ): super().__init__(order_by=order_by) if not root.is_absolute(): @@ -92,27 +92,27 @@ class _RecordDirStore(StorageBackend): self.index = RecordDirIndex(root, suffix) def get_uri( - self + self ) -> str: return f'file://{self.root!s}' def build_index( - self, - schema: str, + self, + schema: str, ): self.index.rebuild_index(schema, self.order_by) def build_index_if_needed( - self, - schema: str, + self, + schema: str, ): self.index.rebuild_if_needed(schema, self.order_by) def add_record( - self, - iri: str, - class_name: str, - json_object: dict, + self, + iri: str, + class_name: str, + json_object: dict, ): pid = json_object['pid'] @@ -148,8 +148,8 @@ class _RecordDirStore(StorageBackend): self.index.add_iri_info(iri, class_name, str(storage_path), sort_string) def get_record_by_iri( - self, - iri: str, + self, + iri: str, ) -> RecordInfo | None: index_entry = self.index.get_info_for_iri(iri) if index_entry is None: @@ -165,9 +165,9 @@ class _RecordDirStore(StorageBackend): ) def get_records_of_classes( - self, - class_names: list[str], - pattern: str | None = None, + self, + class_names: list[str], + pattern: str | None = None, ) -> RecordDirResultList: return RecordDirResultList().add_info( sorted( @@ -186,8 +186,8 @@ 
class _RecordDirStore(StorageBackend): ) def get_all_records( - self, - pattern: str | None = None, + self, + pattern: str | None = None, ) -> RecordDirResultList: return RecordDirResultList().add_info( sorted( @@ -205,8 +205,8 @@ class _RecordDirStore(StorageBackend): ) def remove_record( - self, - iri: str, + self, + iri: str, ) -> bool: index_entry = self.index.get_info_for_iri(iri) if index_entry is None: @@ -226,10 +226,10 @@ _existing_stores = {} def RecordDirStore( # noqa: N802 - root: Path, - pid_mapping_function: Callable, - suffix: str, - order_by: Iterable[str] | None = None, + root: Path, + pid_mapping_function: Callable, + suffix: str, + order_by: Iterable[str] | None = None, ) -> _RecordDirStore: """Get a record directory store for the given root directory.""" existing_store = _existing_stores.get(root) @@ -255,10 +255,3 @@ def RecordDirStore( # noqa: N802 raise ValueError(msg) return existing_store - - -def _get_schema_type( - class_name: str, - schema_module: ModuleType, -) -> str: - return getattr(schema_module, class_name).class_class_curie diff --git a/dump_things_service/backends/record_dir_index.py b/dump_things_service/backends/record_dir_index.py index 416c0ef..d872dd6 100644 --- a/dump_things_service/backends/record_dir_index.py +++ b/dump_things_service/backends/record_dir_index.py @@ -65,11 +65,11 @@ class IndexEntry(Base): class RecordDirIndex: def __init__( - self, - store_dir: Path, - suffix: str, - *, - echo: bool = False, + self, + store_dir: Path, + suffix: str, + *, + echo: bool = False, ): if not store_dir.is_absolute(): msg = f'Not an absolute path: {store_dir}' @@ -91,11 +91,11 @@ class RecordDirIndex: Base.metadata.create_all(self.engine) def add_iri_info( - self, - iri: str, - class_name: str, - path: str, - sort_key: str, + self, + iri: str, + class_name: str, + path: str, + sort_key: str, ): with Session(self.engine) as session, session.begin(): self.add_iri_info_with_session( @@ -107,12 +107,12 @@ class RecordDirIndex: ) def 
add_iri_info_with_session( - self, - session: Session, - iri: str, - class_name: str, - path: str, - sort_key: str, + self, + session: Session, + iri: str, + class_name: str, + path: str, + sort_key: str, ): existing_record = session.query(IndexEntry).filter_by(iri=iri).first() if existing_record: @@ -131,8 +131,8 @@ class RecordDirIndex: ) def get_info_for_iri( - self, - iri: str, + self, + iri: str, ) -> tuple | None: with Session(self.engine) as session, session.begin(): statement = select(IndexEntry).filter_by(iri=iri) @@ -142,8 +142,8 @@ class RecordDirIndex: return None def get_info_for_class( - self, - class_name: str, + self, + class_name: str, ) -> Generator[IndexEntry]: with Session(self.engine) as session, session.begin(): statement = select(IndexEntry).filter_by(class_name=class_name) @@ -152,7 +152,7 @@ class RecordDirIndex: yield row[0] def get_info_for_all_classes( - self, + self, ) -> Generator[IndexEntry]: statement = select(IndexEntry) with Session(self.engine) as session, session.begin(): @@ -161,8 +161,8 @@ class RecordDirIndex: yield row[0] def remove_iri_info( - self, - iri: str, + self, + iri: str, ) -> bool: statement = delete(IndexEntry).where(IndexEntry.iri == iri) with Session(self.engine) as session, session.begin(): @@ -170,9 +170,9 @@ class RecordDirIndex: return result.rowcount == 1 def rebuild_index( - self, - schema: str, - order_by: Iterable[str] | None = None, + self, + schema: str, + order_by: Iterable[str] | None = None, ): """Rebuild the index from the records in the directory.""" lgr.info('Building IRI index for records in %s', self.store_dir) @@ -223,15 +223,18 @@ class RecordDirIndex: self.needs_rebuild = False def rebuild_if_needed( - self, - schema: str, - order_by: Iterable[str] | None = None, + self, + schema: str, + order_by: Iterable[str] | None = None, ): if self.needs_rebuild: self.rebuild_index(schema=schema, order_by=order_by) self.needs_rebuild = False - def _get_class_name(self, path: Path) -> str: + def 
_get_class_name( + self, + path: Path, + ) -> str: """Get the class name from the path.""" rel_path = path.absolute().relative_to(self.store_dir) return rel_path.parts[0] diff --git a/dump_things_service/backends/schema_type_layer.py b/dump_things_service/backends/schema_type_layer.py index bc992c9..640519a 100644 --- a/dump_things_service/backends/schema_type_layer.py +++ b/dump_things_service/backends/schema_type_layer.py @@ -34,15 +34,16 @@ if TYPE_CHECKING: __all__ = [ + '_SchemaTypeLayer', 'SchemaTypeLayer', ] class SchemaTypeLayerResultList(BackendResultList): def __init__( - self, - origin_list: BackendResultList, - schema_model: ModuleType, + self, + origin_list: BackendResultList, + schema_model: ModuleType, ): super().__init__() self.schema_model = schema_model @@ -50,12 +51,12 @@ class SchemaTypeLayerResultList(BackendResultList): self.list_info = self.origin_list.list_info def generate_result( - self, - index: int, - iri: str, - class_name: str, - sort_key: str, - private: Any, + self, + index: int, + iri: str, + class_name: str, + sort_key: str, + private: Any, ) -> RecordInfo: origin_element = self.origin_list.generate_result( index, iri, class_name, sort_key, private @@ -72,9 +73,9 @@ class _SchemaTypeLayer(StorageBackend): """Proxy backend that removes `schema_type` from stored records""" def __init__( - self, - backend: StorageBackend, - schema: str, + self, + backend: StorageBackend, + schema: str, ): super().__init__() self.backend = backend @@ -86,10 +87,10 @@ class _SchemaTypeLayer(StorageBackend): return self.backend.get_uri() def add_record( - self, - iri: str, - class_name: str, - json_object: dict, + self, + iri: str, + class_name: str, + json_object: dict, ): # Remove the top level `schema_type` from the JSON object because we # don't want to store it in the files. 
We add `schema_type` after @@ -104,14 +105,14 @@ class _SchemaTypeLayer(StorageBackend): ) def remove_record( - self, - iri: str, + self, + iri: str, ) -> bool: return self.backend.remove_record(iri=iri) def get_record_by_iri( - self, - iri: str, + self, + iri: str, ) -> RecordInfo | None: origin_result = self.backend.get_record_by_iri(iri) if origin_result and 'schema_type' not in origin_result.json_object: @@ -122,9 +123,9 @@ class _SchemaTypeLayer(StorageBackend): return origin_result def get_records_of_classes( - self, - class_names: list[str], - pattern: str | None = None, + self, + class_names: list[str], + pattern: str | None = None, ) -> BackendResultList: return SchemaTypeLayerResultList( origin_list=self.backend.get_records_of_classes( @@ -135,8 +136,8 @@ class _SchemaTypeLayer(StorageBackend): ) def get_all_records( - self, - pattern: str | None = None, + self, + pattern: str | None = None, ) -> BackendResultList: return SchemaTypeLayerResultList( origin_list=self.backend.get_all_records(pattern), @@ -149,8 +150,8 @@ class _SchemaTypeLayer(StorageBackend): def _get_schema_type( - class_name: str, - schema_module: ModuleType, + class_name: str, + schema_module: ModuleType, ) -> str: return getattr(schema_module, class_name).class_class_curie @@ -160,8 +161,8 @@ _existing_layers = {} def SchemaTypeLayer( # noqa: N802 - backend: StorageBackend, - schema: str, + backend: StorageBackend, + schema: str, ) -> _SchemaTypeLayer: existing_layer, _ = _existing_layers.get(id(backend), (None, None)) if not existing_layer: diff --git a/dump_things_service/backends/sqlite.py b/dump_things_service/backends/sqlite.py index 7b10ddb..5dd8523 100644 --- a/dump_things_service/backends/sqlite.py +++ b/dump_things_service/backends/sqlite.py @@ -60,6 +60,12 @@ if TYPE_CHECKING: from collections.abc import Iterable from pathlib import Path + +__all__ = [ + '_SQLiteBackend', + 'SQLiteBackend', +] + logger = logging.getLogger('dump_things_service') old_record_file_name = 
'.sqlite-records.db' @@ -82,19 +88,19 @@ class Thing(Base): class SQLResultList(BackendResultList): def __init__( - self, - engine: Any, + self, + engine: Any, ): super().__init__() self.engine = engine def generate_result( - self, - _: int, - iri: str, - class_name: str, - sort_key: str, - db_id: int, + self, + _: int, + iri: str, + class_name: str, + sort_key: str, + db_id: int, ) -> RecordInfo: """ Generate a JSON representation of the record at index `index`. @@ -118,12 +124,15 @@ class SQLResultList(BackendResultList): class _SQLiteBackend(StorageBackend): def __init__( - self, - db_path: Path, - *, - order_by: Iterable[str] | None = None, - echo: bool = False, + self, + db_path: Path, + *, + order_by: Iterable[str] | None = None, + echo: bool = False, ) -> None: + assert db_path.is_absolute(), f'db_path not absolute {db_path}' + if db_path.exists(): + assert db_path.is_file(), f'db_path not a file {db_path}' super().__init__(order_by=order_by) self.db_path = db_path self.perform_file_name_conversion() @@ -152,10 +161,10 @@ class _SQLiteBackend(StorageBackend): shutil.move(str(old_path), str(self.db_path)) def add_record( - self, - iri: str, - class_name: str, - json_object: dict, + self, + iri: str, + class_name: str, + json_object: dict, ): with Session(self.engine) as session, session.begin(): self._add_record_with_session( @@ -166,8 +175,8 @@ class _SQLiteBackend(StorageBackend): ) def add_records_bulk( - self, - record_infos: Iterable[RecordInfo], + self, + record_infos: Iterable[RecordInfo], ): with Session(self.engine) as session, session.begin(): for record_info in record_infos: @@ -179,8 +188,8 @@ class _SQLiteBackend(StorageBackend): ) def remove_record( - self, - iri: str, + self, + iri: str, ) -> bool: statement = delete(Thing).where(Thing.iri == iri) with Session(self.engine) as session, session.begin(): @@ -188,11 +197,11 @@ class _SQLiteBackend(StorageBackend): return result.rowcount == 1 def _add_record_with_session( - self, - session: Session, 
- iri: str, - class_name: str, - json_object: dict, + self, + session: Session, + iri: str, + class_name: str, + json_object: dict, ): sort_key = create_sort_key(json_object, self.order_by) existing_record = session.query(Thing).filter_by(iri=iri).first() @@ -211,8 +220,8 @@ class _SQLiteBackend(StorageBackend): ) def get_record_by_iri( - self, - iri: str, + self, + iri: str, ) -> RecordInfo | None: with Session(self.engine) as session, session.begin(): statement = select(Thing).filter_by(iri=iri) @@ -227,9 +236,9 @@ class _SQLiteBackend(StorageBackend): return None def get_records_of_classes( - self, - class_names: Iterable[str], - pattern: str | None = None, + self, + class_names: Iterable[str], + pattern: str | None = None, ) -> SQLResultList: class_list = ', '.join(f"'{cn}'" for cn in class_names) @@ -262,8 +271,8 @@ class _SQLiteBackend(StorageBackend): ) def get_all_records( - self, - pattern: str | None = None, + self, + pattern: str | None = None, ) -> SQLResultList: if pattern is None: statement = text( @@ -297,7 +306,7 @@ _existing_sqlite_backends = {} def SQLiteBackend( # noqa: N802 - db_path: Path, *, order_by: Iterable[str] | None = None, echo: bool = False + db_path: Path, *, order_by: Iterable[str] | None = None, echo: bool = False ) -> _SQLiteBackend: existing_backend = _existing_sqlite_backends.get(db_path) if not existing_backend: diff --git a/dump_things_service/collection.py b/dump_things_service/collection.py new file mode 100644 index 0000000..ea386f0 --- /dev/null +++ b/dump_things_service/collection.py @@ -0,0 +1,626 @@ +import logging +import os +import shutil +from pathlib import Path +from typing import Any + +from datalad_core.runners import ( + call_git_oneline, + CommandError, +) +from fastapi import ( + Depends, + FastAPI, + HTTPException, +) +from pydantic import ( + BaseModel, + TypeAdapter, + ValidationError, +) +from starlette.responses import ( + JSONResponse, + PlainTextResponse, +) +from starlette.status import 
HTTP_401_UNAUTHORIZED + +from dump_things_service import ( + Format, + HTTP_400_BAD_REQUEST, + HTTP_403_FORBIDDEN, + HTTP_422_UNPROCESSABLE_CONTENT, +) +from dump_things_service.abstract_config import ( + CollectionConfig, + Configuration, + ConfigAuthSpec, + ForgejoAuthSpec, + RecordDirBackendConfig, + SQLiteBackendConfig, + read_config, + check_collection, + get_default_token_representation, +) +from dump_things_service.audit.gitaudit import GitAuditBackend +from dump_things_service.auth.config import ConfigAuthenticationSource +from dump_things_service.auth.forgejo import ForgejoAuthenticationSource +from dump_things_service.backends.record_dir_index import index_file_name +from dump_things_service.backends.sqlite import record_file_name as sqlite_db_filename +from dump_things_service.instance_state import ( + InstanceState, + InstanceStateCollectionInfo, + get_record_dir_config, + get_instance_state, + get_schema_info, + record_dir_config_file_name, +) +from dump_things_service.converter import FormatConverter +from dump_things_service.exceptions import ( + ConfigError, + ConfigCollisionError, + CurieResolutionError, +) +from dump_things_service.model import get_model_for_schema +from dump_things_service.utils import ( + combine_ttl, + create_store, + get_token_store, + join_default_token_permissions, + var_escape, + wrap_http_exception, +) + + +# This following lines are required for dynamic endpoint generation +from typing import Annotated # noqa 401 -- used by autogenerated code +from fastapi import Body # noqa 401 -- used by autogenerated code +from dump_things_service.api_key import api_key_header_scheme # noqa 401 -- used by autogenerated code +from dump_things_service.curated import store_curated_record # noqa 401 -- used by autogenerated code +from dump_things_service.incoming import store_incoming_record # noqa 401 -- used by autogenerated code +from dump_things_service.validate import validate_record # noqa 401 -- used by autogenerated code + + 
def create_collection(
    instance_state: InstanceState,
    configuration: Configuration,
    collection_name: str,
):
    """Create the collection `collection_name` as specified in `configuration`

    Reuse existing disk structures, if they are compatible. If they are not
    compatible, raise an error before anything is created on disk.

    After the directories exist, the curated store, schema information,
    active classes, validator, authentication sources, audit backends, and
    the dynamic endpoints of the collection are created and registered in
    `instance_state`.

    :param instance_state: the state of the running instance. It provides
        `store_path` and `fastapi_app` and is updated with the objects that
        are created for the collection
    :param configuration: the configuration that contains the specification
        of the collection in `configuration.collections[collection_name]`
    :param collection_name: name of the collection that should be created
    :return: None
    :raises ConfigError: if an existing curated store or audit directory is
        not compatible with the collection specification
    :raises OSError: if a directory cannot be created. Directories that
        were already created by this call are removed before the error is
        re-raised
    """

    collection_configuration = configuration.collections[collection_name]
    curated_path = Path(instance_state.store_path / collection_configuration.curated)
    incoming_path = (
        None
        if collection_configuration.incoming is None
        else Path(instance_state.store_path / collection_configuration.incoming)
    )
    audit_paths = [
        Path(instance_state.store_path / audit_backend.path)
        for audit_backend in collection_configuration.audit_backends
    ]

    # Check for compatibility of all existing stores before creating any
    # structures on disk.
    if curated_path.exists():
        check_store_compatibility(
            curated_path,
            collection_configuration.backend,
            collection_configuration.schema_location,
        )

    for audit_path in audit_paths:
        if audit_path.exists():
            check_git_audit_compatibility(audit_path)

    # We know now that all existing structures are compatible with the
    # collection specification. We record what we create in order to delete
    # it in case of an error.
    created_directories = []
    try:
        for directory in (curated_path, incoming_path, *audit_paths):
            if directory is not None and not directory.exists():
                directory.mkdir(parents=True)
                created_directories.append(directory)
    except (ConfigError, OSError):
        # `mkdir` reports failures as `OSError`. Delete all directories that
        # were created in this call, most recently created first, and do not
        # let a failing cleanup hide the original error.
        for directory in reversed(created_directories):
            shutil.rmtree(directory, ignore_errors=True)
        raise

    # Create the curated store
    curated_store = create_store(
        abstract_configuration=configuration,
        instance_state=instance_state,
        collection_name=collection_name,
    )

    instance_state.curated_stores[collection_name] = curated_store

    # Incoming stores are created on demand when a token is authenticated
    instance_state.incoming_stores[collection_name] = {}

    # Create the schema modules, schema view, and conversion objects
    schema_location = collection_configuration.schema_location
    instance_state.schema_info[schema_location] = get_schema_info(schema_location)

    # Determine the active classes based on the classes defined in the schema
    # and the configuration of the collection
    active_classes = set(instance_state.schema_info[schema_location].classes)
    if collection_configuration.use_classes:
        active_classes &= set(collection_configuration.use_classes)
    if collection_configuration.ignore_classes:
        active_classes -= set(collection_configuration.ignore_classes)
    instance_state.collections[collection_name] = InstanceStateCollectionInfo(
        active_classes=active_classes,
        tag_info={},
    )

    # Create a validator for the collection
    instance_state.validators[collection_name] = FormatConverter(
        schema=collection_configuration.schema_location,
        input_format=Format.json,
        output_format=Format.ttl,
    )

    # Create the authentication sources
    for authentication_spec in collection_configuration.auth_sources:
        create_authentication_source(
            configuration,
            collection_name,
            authentication_spec,
            instance_state,
        )

    # Create the audit-backends
    instance_state.audit_backends[collection_name] = []
    for audit_backend_config, audit_path in zip(
        collection_configuration.audit_backends,
        audit_paths,
    ):
        instance_state.audit_backends[collection_name].append(
            GitAuditBackend(
                path=audit_path,
                auto_flush_timeout=audit_backend_config.auto_flush_timeout,
            )
        )

    # Create the dynamic endpoints for record storing & validation, for
    # inbox-storing, and for curated area storing.
    create_endpoints_for_collection(
        instance_state,
        collection_name,
        collection_configuration,
        instance_state.fastapi_app,
    )
def check_store_compatibility(
    store_path: Path,
    backend_config: RecordDirBackendConfig | SQLiteBackendConfig,
    schema: str,
):
    """Check that an existing store matches the specs in `backend_config`

    A store path that does not exist is always compatible. For existing
    paths the check is delegated to the backend-specific check function.

    :param store_path: directory of the store
    :param backend_config: configuration of the backend that should use
        the store
    :param schema: schema of the collection, only used for record
        directory backends
    :return: None
    :raises ConfigError: if the type of `backend_config` is not supported
    """
    if not store_path.exists():
        return None

    if isinstance(backend_config, RecordDirBackendConfig):
        check_record_dir_compatibility(store_path, backend_config, schema)
        return None

    if isinstance(backend_config, SQLiteBackendConfig):
        check_sqlite_compatibility(store_path)
        return None

    msg = f"Unsupported backend config type: '{type(backend_config)}'"
    raise ConfigError(msg)
backend uses mapping method: '{stored_mapping_method}'" + raise ConfigCollisionError(msg) + return + + +def check_sqlite_compatibility( + store_path: Path, +): + sqlite_db_path = Path(store_path / sqlite_db_filename) + if not sqlite_db_path.exists(): + raise ConfigError('No sqlite database found in existing store') + return + + +def check_git_audit_compatibility( + audit_path: Path, +): + """Check if an existing audit path is compatible with a git audit store + + :param audit_path: directory to check; a non-existing or empty directory is accepted + :return: None; raises `ConfigError` if a non-empty directory is not a bare git repository + """ + + # Non-existing or empty directories are valid gitaudit-locations + if not audit_path.exists(): + return + if not tuple(os.scandir(audit_path)): + return + + # A non-empty directory should contain a bare git repository + try: + result = call_git_oneline( + ['rev-parse', '--is-bare-repository'], + cwd=audit_path, + force_c_locale=True, + ) + except CommandError as ce: + raise ConfigError(f'No git repository in gitaudit-path: {audit_path}') from ce + if result.strip().lower() != 'true': + raise ConfigError(f'No bare git repository in gitaudit-path: {audit_path}') + return + + +def create_endpoint( + operation_name: str, + operation_path: str, + instance_state: InstanceState, + collection_name: str, + collection_config: CollectionConfig, + template: str, + handler: str, + tag_group: str, + tag_name: str, + app: FastAPI, +): + logger.info( + f'Creating %s-endpoints for collection: "%s"', + operation_name, + collection_name, + ) + + instance_state.collections[collection_name].tag_info[tag_group] = tag_name + + # TODO: get schema_info from instance_state!? 
+ model, classes, model_var_name = get_model_for_schema(collection_config.schema_location) + globals()[model_var_name] = model + + active_classes = instance_state.collections[collection_name].active_classes + for class_name in active_classes: + endpoint_name = f'_endpoint_{var_escape(collection_name)}_{operation_name}_{class_name}' + endpoint_source = template.format( + name=endpoint_name, + model_var_name=model_var_name, + class_name=class_name, + collection=collection_name, + info=f"'{operation_name} {collection_name}/{class_name} objects'", + handler=handler, + ) + exec(endpoint_source, globals()) # noqa S102 + + # Create an API route for the endpoint + app.add_api_route( + path=f'/{collection_name}/{operation_path}/{class_name}', + endpoint=globals()[endpoint_name], + methods=['POST'], + name=f'{operation_name} "{class_name}" object (schema: {model.linkml_meta["id"]})', + response_model=None, + tags=[tag_name] + ) + + logger.info( + 'Creation of %d %s-endpoints completed.', + len(active_classes), + operation_name, + ) + + +def create_endpoints_for_collection( + instance_state: InstanceState, + collection_name: str, + collection_config: CollectionConfig, + app: FastAPI, +): + for ( + operation_name, + operation_path, + template, + handler, + tag_group, + tag_name, + ) in ( + ('store', 'record', _endpoint_template, 'store_record', 'write', f'Write records to collection "{collection_name}"'), + ('validate', 'validate/record', _endpoint_template, 'validate_record', 'validate', f'Validate records for collection "{collection_name}"'), + ('curated', 'curated/record', _endpoint_curated_template, 'store_curated_record', 'curated_write', f'Curated area: store records in curated area of collection "{collection_name}"'), + ('incoming', 'incoming/{label}/record', _endpoint_incoming_template, 'store_incoming_record', 'incoming_write', f'Incoming area: store records in incoming area "{{label}}" of collection "{collection_name}"'), + ): + create_endpoint( + 
operation_name=operation_name, + operation_path=operation_path, + instance_state=instance_state, + collection_name=collection_name, + collection_config=collection_config, + template=template, + handler=handler, + tag_group=tag_group, + tag_name=tag_name, + app=app, + ) + + +def delete_endpoints_for_collection( + instance_state: InstanceState, + collection_name: str, +): + + active_classes = instance_state.collections[collection_name].active_classes + + for operation_path in ( + 'record', + 'validate/record', + 'curated/record', + 'incoming/{label}/record' + ): + delete_endpoint( + collection_name=collection_name, + active_classes=active_classes, + operation_path=operation_path, + app=instance_state.fastapi_app, + ) + + +def delete_endpoint( + collection_name: str, + active_classes: set[str], + operation_path: str, + app: FastAPI, +): + + remove_paths_set = set( + f'/{collection_name}/{operation_path}/{class_name}' + for class_name in active_classes + ) + + remove_indices = [ + index + for index, api_route in enumerate(app.router.routes) + if api_route.path in remove_paths_set + ] + for index in sorted(remove_indices, reverse=True): + del app.router.routes[index] + + +def store_record( + collection: str, + data: BaseModel | str, + class_name: str, + model: Any, + input_format: Format, + api_key: str | None = Depends(api_key_header_scheme), +) -> JSONResponse | PlainTextResponse: + if input_format == Format.json and isinstance(data, str): + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail='Invalid JSON data provided.' + ) + + if input_format == Format.ttl and not isinstance(data, str): + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail='Invalid ttl data provided.' 
+ ) + + instance_state = get_instance_state() + abstract_config = read_config(instance_state.store_path) + check_collection(abstract_config, collection) + + token_representation = get_default_token_representation( + abstract_config, + collection, + ) if api_key is None else api_key + + if not token_representation: + raise HTTPException( + status_code=HTTP_401_UNAUTHORIZED, + detail=f'Not authorized to submit to collection "{collection}"', + ) + + # Get the token permissions and extend them by the default permissions. + # This call will also convert plaintext tokens into the hashed version of + # the token, if the token is hashed. This is necessary because we do not + # store the plaintext token, so all token-information is associated with + # the hashed representation of the token. + store, token_permissions, user_id = get_token_store( + abstract_config, + instance_state, + collection, + token_representation, + ) + final_permissions = join_default_token_permissions( + abstract_config, + instance_state, + token_permissions, + collection, + ) + if not final_permissions.incoming_write: + raise HTTPException( + status_code=HTTP_403_FORBIDDEN, + detail=f"Not authorized to submit to collection '{collection}'.", + ) + + if input_format == Format.ttl: + with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Conversion error'): + json_object = FormatConverter( + abstract_config.collections[collection].schema_location, + input_format=Format.ttl, + output_format=Format.json, + ).convert(data, class_name) + with wrap_http_exception(ValidationError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): + record = TypeAdapter(getattr(model, class_name)).validate_python(json_object) + else: + record = data + + with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): + instance_state.validators[collection].validate(record) + + with wrap_http_exception(CurieResolutionError): + 
stored_records = store.store_object(obj=record, submitter=user_id) + + if input_format == Format.ttl: + format_converter = FormatConverter( + abstract_config.collections[collection].schema_location, + input_format=Format.json, + output_format=Format.ttl, + ) + with wrap_http_exception(ValueError, header='Conversion error'): + return PlainTextResponse( + combine_ttl( + [ + format_converter.convert( + record, + class_name, + ) + for class_name, record in stored_records + ] + ), + media_type='text/turtle', + ) + return JSONResponse([record for _, record in stored_records]) diff --git a/dump_things_service/collection_endpoints.py b/dump_things_service/collection_endpoints.py new file mode 100644 index 0000000..6ae493d --- /dev/null +++ b/dump_things_service/collection_endpoints.py @@ -0,0 +1,240 @@ +import logging +from pathlib import ( + Path, + PurePosixPath, +) +from typing import Literal +from urllib.parse import quote + +from fastapi import ( + APIRouter, + Depends, + HTTPException, + Response, +) +from pydantic import BaseModel + +from dump_things_service import ( + HTTP_201_CREATED, + HTTP_404_NOT_FOUND, + HTTP_406_NOT_ACCEPTABLE, + HTTP_409_CONFLICT, + reserved_collection_names, +) +from dump_things_service.abstract_config import ( + Configuration, + CollectionConfig, + StrictModel, + store_config, + get_config, get_token_permissions, +) +from dump_things_service.admin import authenticate_admin +from dump_things_service.api_key import api_key_header_scheme +from dump_things_service.instance_state import get_instance_state, InstanceState +from dump_things_service.manifest import manifest_configuration +from dump_things_service.exceptions import ConfigError +from dump_things_service.utils import wrap_http_exception + + +logger = logging.getLogger('dump_things_service') + +router = APIRouter() + + +class ConfigAuthSpec(BaseModel): + type: Literal['config'] = 'config' + + +class ForgejoAuthSpec(BaseModel): + type: Literal['forgejo'] + url: str + organization: str + 
team: str + label_type: Literal['team', 'user'] + repository: str | None = None + + +class TagSpec(BaseModel): + submitter_id_tag: str = 'http://purl.obolibrary.org/obo/NCIT_C54269' + submission_time_tag: str = 'http://semanticscience.org/resource/SIO_001083' + + +from pydantic import ConfigDict, Field +from dump_things_service.abstract_config import RecordDirBackendConfig, SQLiteBackendConfig, GitAuditBackendConfig + +class CollectionRequest(CollectionConfig): + name: str + + +@router.post( + '/collections', + tags=['Administration interface'], + name='Create a new collection', + status_code=HTTP_201_CREATED, +) +async def create_collection( + response: Response, + body: CollectionRequest, + api_key: str = Depends(api_key_header_scheme), +): + + instance_state = get_instance_state() + abstract_config = get_config() + + # Check admin rights + authenticate_admin(instance_state, abstract_config, api_key) + + # Check for existing collection name + if body.name in abstract_config.collections: + raise HTTPException( + status_code=HTTP_409_CONFLICT, + detail=f"Collection with name '{body.name}' already exists.", + ) + + # Check for reserved collection names + if body.name in reserved_collection_names: + raise HTTPException( + status_code=HTTP_409_CONFLICT, + detail=f"Collection name '{body.name}' is reserved and cannot be created.", + ) + + # Check for distinct directories + for directory in (body.incoming, body.curated): + if directory: + ensure_unique_directory( + abstract_config, + instance_state, + directory, + ) + + # Check for incoming directory if any of the tokens allows writing + validate_incoming_paths(abstract_config, body) + + # Update the abstract configuration + abstract_config.collections[body.name] = body + + # Manifest the abstract configuration + with wrap_http_exception(ConfigError): + manifest_configuration(abstract_config, instance_state) + + # Persist the abstract configuration + store_config( + store_path=instance_state.store_path, + 
config=abstract_config, + ) + + response.headers['Location'] = f'/collections/{quote(body.name)}' + + +@router.get( + '/collections', + tags=['Administration interface'], + name='Get existing collections', +) +async def get_collections( + api_key: str = Depends(api_key_header_scheme), +) -> dict[str, CollectionConfig]: + + instance_state = get_instance_state() + abstract_config = get_config() + + # Check admin rights + authenticate_admin(instance_state, abstract_config, api_key) + return abstract_config.collections + + +@router.get( + '/collections/{collection_name}', + tags=['Administration interface'], + name='Get existing collection by name', +) +async def get_collection_with_name( + collection_name: str, + api_key: str = Depends(api_key_header_scheme), +) -> CollectionConfig: + + instance_state = get_instance_state() + abstract_config = get_config() + + # Check admin rights + authenticate_admin(instance_state, abstract_config, api_key) + + if collection_name not in abstract_config.collections: + raise HTTPException( + status_code=HTTP_404_NOT_FOUND, + detail=f"Collection with name '{collection_name}' does not exist.", + ) + return abstract_config.collections[collection_name] + + +@router.delete( + '/collections/{collection_name}', + tags=['Administration interface'], + name='Delete collection with name', +) +async def delete_collection( + collection_name: str, + api_key: str = Depends(api_key_header_scheme), +): + + instance_state = get_instance_state() + abstract_config = get_config() + + # Check admin rights + authenticate_admin(instance_state, abstract_config, api_key) + + if collection_name not in abstract_config.collections: + raise HTTPException( + status_code=HTTP_404_NOT_FOUND, + detail=f"Collection with name '{collection_name}' does not exist.", + ) + + # Update the abstract configuration + del abstract_config.collections[collection_name] + + # Manifest the abstract configuration + with wrap_http_exception(ConfigError): + 
manifest_configuration(abstract_config, instance_state) + + # Persist the abstract configuration + store_config( + store_path=instance_state.store_path, + config=abstract_config, + ) + + +def ensure_unique_directory( + abstract_config: Configuration, + instance_state: InstanceState, + existing_dir: PurePosixPath, +): + abs_existing_dir = (instance_state.store_path / Path(existing_dir)).absolute() + for collection_name, collection_config in abstract_config.collections.items(): + for collection_dir in collection_config.curated, collection_config.incoming: + abs_collection_dir = (instance_state.store_path / Path(collection_dir)).absolute() + if abs_collection_dir == abs_existing_dir: + raise HTTPException( + status_code=HTTP_409_CONFLICT, + detail=f"Directory '{collection_dir}' already used by collection '{collection_name}'.", + ) + + +def validate_incoming_paths( + abstract_config: Configuration, + collection_request: CollectionRequest, +): + for token_name, token_info in abstract_config.tokens.items(): + token_collection_info = token_info.collections.get(collection_request.name) + if token_collection_info: + token_permissions = get_token_permissions(token_collection_info.mode) + if token_permissions.incoming_write or token_permissions.zones_access: + if not collection_request.incoming: + detail = ( + f"Cannot add collection '{collection_request.name}' without " + f"`incoming` path, because at least token '{token_name}' " + f" has write access to the collection" + ) + raise HTTPException( + status_code=HTTP_406_NOT_ACCEPTABLE, + detail=detail, + ) diff --git a/dump_things_service/commands/check_pids.py b/dump_things_service/commands/check_pids.py index 9dcad65..620e647 100644 --- a/dump_things_service/commands/check_pids.py +++ b/dump_things_service/commands/check_pids.py @@ -5,15 +5,20 @@ from argparse import ArgumentParser from collections.abc import Iterable from pathlib import Path -from dump_things_service import config_file_name +from fastapi import FastAPI + 
+from dump_things_service.abstract_config import ( + get_config_labels, + read_config, +) from dump_things_service.backends.schema_type_layer import _SchemaTypeLayer from dump_things_service.backends.sqlite import _SQLiteBackend -from dump_things_service.config import get_config, process_config from dump_things_service.exceptions import CurieResolutionError +from dump_things_service.instance_state import create_instance_state +from dump_things_service.manifest import manifest_configuration from dump_things_service.store.model_store import _ModelStore from dump_things_service.utils import ( create_token_store, - get_config_labels, get_on_disk_labels, ) @@ -26,13 +31,6 @@ parser.add_argument( 'store', help='The root directory of the store.', ) -parser.add_argument( - '-c', - '--config', - metavar='CONFIG_FILE', - help="Read the configuration from 'CONFIG_FILE' instead of looking for " - 'it in the root directory of the store.', -) def show_backend(model_store: _ModelStore): @@ -62,30 +60,41 @@ def check_pids_in_stores( return result -def check_pids(): - - instance_config = get_config() +def check_pids( + store_path: Path, +): + abstract_config = read_config(store_path) + instance_state = create_instance_state( + store_path=store_path, + bootstrap_token='', + fastapi_app=FastAPI(), + ) + manifest_configuration(abstract_config, instance_state) result = 0 # Check pids in curated stores - result += check_pids_in_stores(instance_config.curated_stores.values()) + result += check_pids_in_stores(instance_state.curated_stores.values()) # Check pids in incoming stores. Incoming stores can be defined in the # configuration, or can be generated by external authentication sources. # In the latter case, they are manifest as directories in the incoming area # of a collection. 
- for collection, collection_info in instance_config.collections.items(): - - configured_labels = get_config_labels(instance_config, collection) - on_disk_labels = get_on_disk_labels(instance_config, collection) + for collection, collection_info in abstract_config.collections.items(): + configured_labels = get_config_labels(abstract_config, collection) + on_disk_labels = get_on_disk_labels( + store_path=store_path, + abstract_config=abstract_config, + collection=collection, + ) all_labels = configured_labels.union(on_disk_labels) token_stores = [ create_token_store( - instance_config, + abstract_config, + instance_state, collection, - instance_config.store_path / collection_info.incoming / label + instance_state.store_path / collection_info.incoming / label ) for label in all_labels ] @@ -96,16 +105,7 @@ def check_pids(): def main(): arguments = parser.parse_args() - - store_path = Path(arguments.store).absolute() - process_config( - store_path=store_path, - config_file=Path(arguments.config or (store_path / config_file_name)), - order_by=['pid'], - globals_dict=globals(), - ) - - result = check_pids() + result = check_pids(Path(arguments.store).absolute()) if result > 0: print(f'found {result} unresolvable pids', file=sys.stderr) return 1 diff --git a/dump_things_service/commands/copy_store.py b/dump_things_service/commands/copy_store.py index 6585625..8eeda94 100644 --- a/dump_things_service/commands/copy_store.py +++ b/dump_things_service/commands/copy_store.py @@ -17,7 +17,7 @@ from dump_things_service.backends.sqlite import ( from dump_things_service.backends.sqlite import ( record_file_name as sqlite_record_file_name, ) -from dump_things_service.config import get_backend_and_extension +from dump_things_service.abstract_config import get_backend_and_extension if TYPE_CHECKING: from dump_things_service.backends import StorageBackend diff --git a/dump_things_service/commands/hash_token.py b/dump_things_service/commands/hash_token.py new file mode 100644 index 
0000000..5f5d478 --- /dev/null +++ b/dump_things_service/commands/hash_token.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +import sys +from argparse import ArgumentParser + +from dump_things_service.abstract_config import hash_token_representation + + +parser = ArgumentParser( + prog='Hash a plain text token to create a hashed token in a dump-things server', + description='Hash a token and print the calculated hash value. The hash value ' + 'can be used to create a hashed token via the `/tokens`-endpoint ' + 'of a dump-things-server.', +) +parser.add_argument( + 'token', + type=str, + help='The plain text token', +) + + +def main(): + arguments = parser.parse_args() + + token = arguments.token.strip() + if any(map(lambda s: s.isspace(), token)): + print('Whitespace are not allowed in token', file=sys.stderr, flush=True) + return 1 + + print(hash_token_representation(token)) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/dump_things_service/commands/load_config.py b/dump_things_service/commands/load_config.py new file mode 100644 index 0000000..11e73ad --- /dev/null +++ b/dump_things_service/commands/load_config.py @@ -0,0 +1,236 @@ +from __future__ import annotations + +import os +import sys +from argparse import ArgumentParser +from itertools import count +from pathlib import Path + +import requests +import yaml + +from dump_things_service.instance_state import get_record_dir_config + + +parser = ArgumentParser( + prog='Establish a configuration in a running service', + description='Read a configuration from a dump-things configuration-file ' + 'and instantiate its elements on a running server. Objects that ' + 'already exist on the server are left unchanged. 
' + ' ' + 'An admin token has to be provided in the environment variable ' + '`DTS_ADMIN_TOKEN`.', +) +parser.add_argument( + 'config_file', + help='The path to the config file', +) +parser.add_argument( + '--send-to', + help='The base URL of the server API', +) +parser.add_argument( + '--old-format', + action='store_true', + help='If provided, assume that the configuration is in the old format ' + 'and convert it to the new format internally (in old format: tokens ' + 'had no `hashed`-attribute and no `representation`-attribute, the token ' + 'representation was the key of the token configuration, ' + 'collections had no `schema`-attribute, and `sqlite`-backends had ' + 'a `schema`-attribute).', +) +parser.add_argument( + '--store', + default=None, + help='If --old-format is provided, this option can be used to specify a ' + 'store directory. The store directory will be used to load `RecordDir` ' + 'configurations, if a collection defines are `RecordDir`-backend. ' + '(This option has no effect if no collection in the old configuration ' + 'uses a `RecordDir`-backend.)', +) + + +def main(): + arguments = parser.parse_args() + + with open(arguments.config_file) as config_file: + configuration = yaml.safe_load(config_file) + + assert configuration['type'] == 'collections', '`type`-entry missing in old config-file' + if arguments.old_format: + configuration = convert_to_new_format(configuration, arguments.store) + else: + if arguments.store: + print( + 'Warning: ignoring `--store` option because `--old-format` ' + 'is not provided.', + file=sys.stderr, + flush=True, + ) + + assert configuration['version'] == 2, '`version: 2` missing in config-file' + + if arguments.send_to: + admin_token = os.environ.get('DTS_ADMIN_TOKEN') + if not admin_token: + print( + 'An admin token not provided in the environment variable `DTS_ADMIN_TOKEN`', + file=sys.stderr, + flush=True, + ) + return 1 + + try: + establish_configuration( + configuration, + arguments.send_to[:-1] + if 
arguments.send_to.endswith('/') + else arguments.send_to, + admin_token, + ) + return 0 + except RuntimeError as rte: + print(f'{rte.args[0]}', file=sys.stderr, flush=True) + return 2 + + print( + yaml.dump( + data=configuration, + sort_keys=False, + allow_unicode=True, + default_flow_style=False, + ) + ) + return 0 + + +def convert_to_new_format( + old_configuration: dict, + store_path: str | Path, +) -> dict: + + assert old_configuration['version'] == 1, '`version: 1` missing in old config-file' + + counter = count(1) + new_tokens_dict = { + f'token_{next(counter)}': { + **old_token_config.copy(), + 'representation': token_representation, + 'hashed': False + } + for token_representation, old_token_config in old_configuration['tokens'].items() + } + + old_to_new_token_mapping = { + token_config['representation']: token_name + for token_name, token_config in new_tokens_dict.items() + } + + store_path = Path(store_path) + for collection_name, collection_config in old_configuration['collections'].items(): + backend = collection_config.get('backend') + if backend and backend['type'].startswith('sqlite'): + collection_config['schema'] = backend['schema'] + del backend['schema'] + elif not backend or backend['type'].startswith('record_dir'): + if store_path is None: + msg = '--store has to be provided to convert collection with record_dir-backends' + raise ValueError(msg) + record_dir_config = get_record_dir_config(store_path / collection_config['curated']) + collection_config['schema'] = record_dir_config.schema + backend = { + 'type': 'record_dir+stl' if not backend else backend['type'], + 'mapping_method': record_dir_config.idfx.value + } + collection_config['backend'] = backend + collection_config['default_token'] = old_to_new_token_mapping[collection_config['default_token']] + + new_configuration = { + 'type': old_configuration['type'], + 'version': 2, + 'tokens': new_tokens_dict, + 'collections': old_configuration['collections'], + 'admin_tokens': {}, + } + return 
new_configuration + + +def establish_configuration( + configuration: dict, + api_url: str, + admin_token: str, +): + create_collections(configuration, api_url, admin_token) + create_tokens(configuration, api_url, admin_token) + create_admin_tokens(configuration, api_url, admin_token) + + +def create_tokens( + configuration: dict, + api_url: str, + admin_token: str, +): + for token_name, token_config in configuration['tokens'].items(): + _post_data( + url=api_url + '/tokens', + data={ + **token_config, + 'name': token_name, + }, + token=admin_token, + content_class='token', + content_name=token_name, + ) + + +def create_collections( + configuration: dict, + api_url: str, + admin_token: str, +): + for collection_name, collection_config in configuration['collections'].items(): + _post_data( + url=api_url + '/collections', + data={ + **collection_config, + 'name': collection_name, + }, + token=admin_token, + content_class='collection', + content_name=collection_name, + ) + + +def create_admin_tokens( + configuration: dict, + api_url: str, + admin_token: str, +): + for admin_token_name, admin_token_config in configuration['admin_tokens'].items(): + _post_data( + url=api_url + '/admin_tokens', + data={ + **admin_token_config, + 'name': admin_token_name, + }, + token=admin_token, + content_class='admin token', + content_name=admin_token_name, + ) + + +def _post_data( + url: str, + data: dict, + token: str, + content_class: str, + content_name: str, +): + result = requests.post(url, headers={'x-dumpthings-token': token}, json=data,) + if result.status_code >= 300: + msg = f'Error uploading {content_class}: {content_name}: {result.text}' + raise RuntimeError(msg) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/dump_things_service/commands/rebuild_index.py b/dump_things_service/commands/rebuild_index.py index f909d22..0825dc4 100644 --- a/dump_things_service/commands/rebuild_index.py +++ b/dump_things_service/commands/rebuild_index.py @@ -8,7 +8,8 @@ import 
yaml from dump_things_service import config_file_name from dump_things_service.backends.record_dir_index import RecordDirIndex -from dump_things_service.config import CollectionDirConfig +from dump_things_service.abstract_config import RecordDirConfigFileContent + parser = ArgumentParser( prog='Rebuild the index of a `record_dir`-store', @@ -59,7 +60,7 @@ def process_config(arguments) -> tuple[Path, str, str]: config_path = ( Path(arguments.config) if arguments.config else store / config_file_name ) - config_object = CollectionDirConfig( + config_object = RecordDirConfigFileContent( **yaml.load(config_path.read_text(), Loader=yaml.SafeLoader) ) return ( diff --git a/dump_things_service/config.py b/dump_things_service/config.py deleted file mode 100644 index aa373ca..0000000 --- a/dump_things_service/config.py +++ /dev/null @@ -1,666 +0,0 @@ -from __future__ import annotations - -import dataclasses -import enum -import hashlib -import logging -from functools import partial -from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Literal, -) - -import yaml -from fastapi import HTTPException -from pydantic import ( - BaseModel, - ConfigDict, - Field, - ValidationError, -) -from yaml.scanner import ScannerError - -from dump_things_service import ( - HTTP_404_NOT_FOUND, - Format, -) -from dump_things_service.audit.gitaudit import GitAuditBackend -from dump_things_service.backends.record_dir import RecordDirStore -from dump_things_service.backends.schema_type_layer import SchemaTypeLayer -from dump_things_service.backends.sqlite import SQLiteBackend -from dump_things_service.backends.sqlite import ( - record_file_name as sqlite_record_file_name, -) -from dump_things_service.converter import FormatConverter, get_conversion_objects -from dump_things_service.exceptions import ( - ConfigError, - CurieResolutionError, -) -from dump_things_service.model import get_model_for_schema -from dump_things_service.resolve_curie import resolve_curie -from 
dump_things_service.store.model_store import ModelStore -from dump_things_service.token import ( - TokenPermission, - get_token_parts, - hash_token, -) -from dump_things_service.utils import check_collection - -if TYPE_CHECKING: - import types - -logger = logging.getLogger('dump_things_service') - -config_file_name = '.dumpthings.yaml' -ignored_files = {'.', '..', config_file_name} - - -_global_config_instance = None - - -class StrictModel(BaseModel): - model_config = ConfigDict(extra='forbid') - - -class MappingMethod(enum.Enum): - digest_md5 = 'digest-md5' - digest_md5_p3 = 'digest-md5-p3' - digest_md5_p3_p3 = 'digest-md5-p3-p3' - digest_sha1 = 'digest-sha1' - digest_sha1_p3 = 'digest-sha1-p3' - digest_sha1_p3_p3 = 'digest-sha1-p3-p3' - after_last_colon = 'after-last-colon' - - -class CollectionDirConfig(StrictModel): - type: Literal['records'] - version: Literal[1] - schema: str - format: Literal['yaml'] - idfx: MappingMethod - - -class TokenModes(enum.Enum): - READ_CURATED = 'READ_CURATED' - READ_COLLECTION = 'READ_COLLECTION' - WRITE_COLLECTION = 'WRITE_COLLECTION' - READ_SUBMISSIONS = 'READ_SUBMISSIONS' - WRITE_SUBMISSIONS = 'WRITE_SUBMISSIONS' - SUBMIT = 'SUBMIT' - SUBMIT_ONLY = 'SUBMIT_ONLY' - NOTHING = 'NOTHING' - CURATOR = 'CURATOR' - - -class TokenCollectionConfig(BaseModel): - model_config = ConfigDict(extra='forbid') - mode: TokenModes - incoming_label: str = Field(strict=True) - - -class TokenConfig(StrictModel): - user_id: str - collections: dict[str, TokenCollectionConfig] - hashed: bool = False - - -class BackendConfigRecordDir(StrictModel): - type: Literal['record_dir', 'record_dir+stl'] - - -class BackendConfigSQLite(StrictModel): - type: Literal['sqlite', 'sqlite+stl'] - schema: str - - -class ForgejoAuthConfig(StrictModel): - type: Literal['forgejo'] - url: str - organization: str - team: str - label_type: Literal['team', 'user'] - instance_id: str | None = None - repository: str | None = None - - -class ConfigAuthConfig(StrictModel): - type: 
Literal['config'] = 'config' - - -class GitAuditBackendConfig(StrictModel): - type: Literal['gitaudit'] - path: Path - auto_flush_timeout: int = 60 - - -class TagConfig(StrictModel): - submitter_id_tag: str = 'http://purl.obolibrary.org/obo/NCIT_C54269' - submission_time_tag: str = 'http://semanticscience.org/resource/SIO_001083' - - -class CollectionConfig(StrictModel): - default_token: str - curated: Path - incoming: Path | None = None - backend: BackendConfigRecordDir | BackendConfigSQLite | None = None - auth_sources: list[ForgejoAuthConfig | ConfigAuthConfig] = [ConfigAuthConfig()] - submission_tags: TagConfig = TagConfig() - use_classes: list[str] = dataclasses.field(default_factory=list) - ignore_classes: list[str] = dataclasses.field(default_factory=list) - audit_backends: list[GitAuditBackendConfig] = dataclasses.field(default_factory=list) - - -class GlobalConfig(StrictModel): - model_config = ConfigDict(strict=True) - - type: Literal['collections'] - version: Literal[1] - collections: dict[str, CollectionConfig] - tokens: dict[str, TokenConfig] - - -@dataclasses.dataclass -class InstanceConfig: - store_path: Path - collections: dict = dataclasses.field(default_factory=dict) - all_stores: dict = dataclasses.field(default_factory=dict) - curated_stores: dict = dataclasses.field(default_factory=dict) - incoming: dict = dataclasses.field(default_factory=dict) - zones: dict = dataclasses.field(default_factory=dict) - permissions: dict = dataclasses.field(default_factory=dict) - model_info: dict = dataclasses.field(default_factory=dict) - token_stores: dict = dataclasses.field(default_factory=dict) - schemas: dict = dataclasses.field(default_factory=dict) - conversion_objects: dict = dataclasses.field(default_factory=dict) - backend: dict = dataclasses.field(default_factory=dict) - auth_providers: dict = dataclasses.field(default_factory=dict) - tokens: dict = dataclasses.field(default_factory=dict) - hashed_tokens: dict = 
dataclasses.field(default_factory=dict) - validators: dict = dataclasses.field(default_factory=dict) - use_classes: dict = dataclasses.field(default_factory=dict) - maintenance_mode: set = dataclasses.field(default_factory=set) - audit_backends: dict = dataclasses.field(default_factory=dict) - -mode_mapping = { - TokenModes.READ_CURATED: TokenPermission(curated_read=True), - TokenModes.READ_COLLECTION: TokenPermission( - curated_read=True, - incoming_read=True, - ), - TokenModes.WRITE_COLLECTION: TokenPermission( - curated_read=True, - incoming_read=True, - incoming_write=True, - ), - TokenModes.READ_SUBMISSIONS: TokenPermission(incoming_read=True), - TokenModes.WRITE_SUBMISSIONS: TokenPermission( - incoming_read=True, - incoming_write=True, - ), - TokenModes.SUBMIT: TokenPermission(curated_read=True, incoming_write=True), - TokenModes.SUBMIT_ONLY: TokenPermission(incoming_write=True), - TokenModes.NOTHING: TokenPermission(), - TokenModes.CURATOR: TokenPermission( - curated_read=True, - incoming_read=True, - incoming_write=True, - curated_write=True, - zones_access=True, - ), -} - - -def get_hex_digest(hasher: Callable, data: str) -> str: - hash_context = hasher(data.encode()) - return hash_context.hexdigest() - - -def mapping_digest_p3( - hasher: Callable, - pid: str, - suffix: str, -) -> Path: - hex_digest = get_hex_digest(hasher, pid) - return Path(hex_digest[:3]) / (hex_digest[3:] + '.' + suffix) - - -def mapping_digest_p3_p3( - hasher: Callable, - pid: str, - suffix: str, -) -> Path: - hex_digest = get_hex_digest(hasher, pid) - return Path(hex_digest[:3]) / hex_digest[3:6] / (hex_digest[6:] + '.' + suffix) - - -def mapping_digest(hasher: Callable, pid: str, suffix: str) -> Path: - hex_digest = get_hex_digest(hasher, pid) - return Path(hex_digest + '.' 
+ suffix) - - -def mapping_after_last_colon(pid: str, suffix: str) -> Path: - plain_result = pid.split(':')[-1] - # Escape any colons and slashes in the pid - escaped_result = ( - plain_result.replace('_', '__').replace('/', '_s').replace('.', '_d') - ) - return Path(escaped_result + '.' + suffix) - - -mapping_functions = { - MappingMethod.digest_md5: partial(mapping_digest, hashlib.md5), - MappingMethod.digest_md5_p3: partial(mapping_digest_p3, hashlib.md5), - MappingMethod.digest_md5_p3_p3: partial(mapping_digest_p3_p3, hashlib.md5), - MappingMethod.digest_sha1: partial(mapping_digest, hashlib.sha1), - MappingMethod.digest_sha1_p3: partial(mapping_digest_p3, hashlib.sha1), - MappingMethod.digest_sha1_p3_p3: partial(mapping_digest_p3_p3, hashlib.sha1), - MappingMethod.after_last_colon: mapping_after_last_colon, -} - - -def get_mapping_function_by_name(mapping_function_name: str) -> Callable: - return mapping_functions[MappingMethod(mapping_function_name)] - - -def get_mapping_function(collection_config: CollectionDirConfig): - return mapping_functions[collection_config.idfx] - - -def get_permissions(mode: TokenModes) -> TokenPermission: - return mode_mapping[mode] - - -class Config: - @staticmethod - def get_config_from_file(path: Path) -> GlobalConfig: - try: - return GlobalConfig(**yaml.load(path.read_text(), Loader=yaml.SafeLoader)) - except ScannerError as e: - msg = f'YAML-error while reading config file {path}: {e}' - raise ConfigError(msg) from e - except TypeError: - msg = f'Error in yaml file {path}: content is not a mapping' - raise ConfigError(msg) from None - except ValidationError as e: - msg = f'Pydantic-error reading config file {path}: {e}' - raise ConfigError(msg) from e - - @staticmethod - def get_config(path: Path, file_name=config_file_name) -> GlobalConfig: - return Config.get_config_from_file(path / file_name) - - @staticmethod - def get_collection_dir_config( - path: Path, - file_name: str = config_file_name, - ) -> CollectionDirConfig: - 
config_path = path / file_name - if not config_path.exists(): - msg = f'Config file does not exist: {config_path}' - raise ConfigError(msg) - try: - return CollectionDirConfig( - **yaml.load(config_path.read_text(), Loader=yaml.SafeLoader) - ) - except ScannerError as e: - msg = f'YAML-error while reading config file {config_path}: {e}' - raise ConfigError(msg) from e - except ValidationError as e: - msg = f'Pydantic-error reading config file {config_path}: {e}' - raise ConfigError(msg) from e - - -def process_config( - store_path: Path, - config_file: Path, - order_by: list[str], - globals_dict: dict[str, Any], -) -> InstanceConfig: - global global_config_instance - - config_object = Config.get_config_from_file(config_file) - global_config_instance = process_config_object( - store_path=store_path, - config_object=config_object, - order_by=order_by, - globals_dict=globals_dict, - ) - return global_config_instance - - -def get_config(): - return global_config_instance - - -def process_config_object( - store_path: Path, - config_object: GlobalConfig, - order_by: list[str], - globals_dict: dict[str, Any], -): - from dump_things_service.auth.config import ConfigAuthenticationSource - from dump_things_service.auth.forgejo import ForgejoAuthenticationSource - - instance_config = InstanceConfig(store_path=store_path) - instance_config.collections = config_object.collections - - for collection_name, collection_info in config_object.collections.items(): - # Create the authentication providers - instance_config.auth_providers[collection_name] = [] - - auth_provider_list = [] - # Check for multiple providers - for auth_provider in collection_info.auth_sources: - if auth_provider.type == 'config': - key = ('config',) - elif auth_provider.type == 'forgejo': - key = ( - 'forgejo', - auth_provider.url, - auth_provider.organization, - auth_provider.team, - auth_provider.label_type, - auth_provider.repository, - ) - else: - msg = f'Unknown authentication provider type: 
{auth_provider.type}' - raise ConfigError(msg) - if key in auth_provider_list: - logger.warning('Ignoring duplicated authentication provider: %s', key) - continue - auth_provider_list.append(key) - - for auth_provider in auth_provider_list: - if auth_provider[0] == 'config': - instance_config.auth_providers[collection_name].append( - ConfigAuthenticationSource( - instance_config=instance_config, - collection=collection_name, - ) - ) - else: - instance_config.auth_providers[collection_name].append( - ForgejoAuthenticationSource(*auth_provider[1:]) - ) - - # Set the default backend if not specified - backend = collection_info.backend or BackendConfigRecordDir( - type='record_dir+stl' - ) - - instance_config.backend[collection_name] = backend - backend_name, extension = get_backend_and_extension(backend.type) - if backend_name == 'record_dir': - # Get the config from the curated directory - collection_config = Config.get_collection_dir_config( - store_path / collection_info.curated - ) - schema = collection_config.schema - elif backend.type == 'sqlite': - schema = backend.schema - else: - msg = f'Unsupported backend `{collection_info.backend}` for collection `{collection_name}`.' 
- raise ConfigError(msg) - - # Generate the collection model - model, classes, model_var_name = get_model_for_schema(schema) - instance_config.model_info[collection_name] = model, classes, model_var_name - globals_dict[model_var_name] = model - - # Generate the curated stores - if backend_name == 'record_dir': - curated_store_backend = RecordDirStore( - root=store_path / collection_info.curated, - pid_mapping_function=get_mapping_function(collection_config), - suffix=collection_config.format, - order_by=order_by, - ) - curated_store_backend.build_index_if_needed(schema=schema) - elif backend.type == 'sqlite': - curated_store_backend = SQLiteBackend( - db_path=store_path / collection_info.curated / sqlite_record_file_name, - ) - else: - msg = f'Unsupported backend `{collection_info.backend}` for collection `{collection_name}`.' - raise ConfigError(msg) - - if extension == 'stl': - curated_store_backend = SchemaTypeLayer( - backend=curated_store_backend, - schema=schema, - ) - - curated_store = ModelStore( - schema=schema, - backend=curated_store_backend, - tags={ - 'id': collection_info.submission_tags.submitter_id_tag, - 'time': collection_info.submission_tags.submission_time_tag, - } - ) - - instance_config.curated_stores[collection_name] = curated_store - - if collection_info.incoming: - instance_config.incoming[collection_name] = collection_info.incoming - - instance_config.schemas[collection_name] = schema - if schema not in instance_config.conversion_objects: - instance_config.conversion_objects[schema] = get_conversion_objects(schema) - - # We do not create stores for tokens here, but leave it to the token - # authentication routine. 
- instance_config.token_stores[collection_name] = {} - - # Generate audit backends - instance_config.audit_backends[collection_name] = [] - for audit_backend in collection_info.audit_backends: - instance_config.audit_backends[collection_name].append( - GitAuditBackend(audit_backend.path, audit_backend.auto_flush_timeout) - ) - - # Create validator for each collection - for collection_name, _ in config_object.collections.items(): - instance_config.validators[collection_name] = FormatConverter( - schema=instance_config.schemas[collection_name], - input_format=Format.json, - output_format=Format.ttl, - ) - - # Resolve classes-blacklist and -whitelist - for collection_name, collection_info in config_object.collections.items(): - - model_info = instance_config.model_info[collection_name] - - # If the whitelist is present, get all whitelisted classes - if collection_info.use_classes: - # Check that the whitelisted classes exist - undefined = [ - name - for name in collection_info.use_classes - if name not in model_info[1] - ] - if undefined: - msg = ( - 'used class(es): ' - + ', '.join(undefined) - + ' not defined in schema: ' - + model_info[0].linkml_meta.root['id'] - ) - raise ConfigError(msg) - use_classes = collection_info.use_classes - else: - use_classes = model_info[1] - - # Check for blacklisted classes - undefined = [ - name - for name in collection_info.ignore_classes - if name not in use_classes - ] - if undefined: - msg = ( - 'ignored class(es): ' - + ', '.join(undefined) - + ' not defined in schema or in `used_classes`: ' - + model_info[0].linkml_meta.root['id'] - ) - raise ConfigError(msg) - - instance_config.use_classes[collection_name] = [ - name - for name in use_classes - if name not in collection_info.ignore_classes - ] - - # Read info for tokens from the configuration - for token_name, token_info in config_object.tokens.items(): - for collection_name, token_collection_info in token_info.collections.items(): - - if collection_name not in 
instance_config.hashed_tokens: - instance_config.hashed_tokens[collection_name] = {} - - if token_info.hashed: - token_id, _ = get_token_parts(token_name) - if token_id == '': - msg = 'empty ID in hashed token' - raise ConfigError(msg) - if token_id in instance_config.hashed_tokens[collection_name]: - msg = f'duplicated ID in hashed token: {token_id}' - raise ConfigError(msg) - instance_config.hashed_tokens[collection_name][token_id] = token_name - - if collection_name not in instance_config.tokens: - instance_config.tokens[collection_name] = {} - - permissions = get_permissions(token_collection_info.mode) - instance_config.tokens[collection_name][token_name] = { - 'permissions': permissions, - 'user_id': token_info.user_id, - 'incoming_label': token_collection_info.incoming_label, - } - - # There is only a token store if the token has incoming read- or - # incoming write-permissions. If a token store exists, we ensure - # that an incoming path is set and an incoming label exists. - if permissions.incoming_read or permissions.incoming_write: - # Check that the incoming label is set for a token that has - # access rights to incoming records. 
- if not token_collection_info.incoming_label: - msg = f'Token `{token_name}` with mode {token_collection_info.mode} must not have an empty `incoming_label`' - raise ConfigError(msg) - - if any(c in token_collection_info.incoming_label for c in ('\\', '/')): - msg = ( - f'Incoming label for token `...` on collection ' - f'`{collection_name}` must not contain slashes or ' - f'backslashes: `{token_collection_info.incoming_label}`' - ) - raise ConfigError(msg) - - if collection_name not in instance_config.incoming: - msg = ( - 'Incoming location not defined for collection ' - f'`{collection_name}`, which has at least one token ' - f'with write access' - ) - raise ConfigError(msg) - - # Create all incoming zones - incoming_location = ( - store_path - / instance_config.collections[collection_name].incoming - / token_collection_info.incoming_label - ) - incoming_location.mkdir(parents=True, exist_ok=True) - - # Check that default tokens are defined - for collection_name, collection_info in config_object.collections.items(): - if collection_info.default_token not in instance_config.tokens[collection_name]: - msg = f'Unknown default token: `{collection_info.default_token}`' - raise ConfigError(msg) - - # Check that config authentication source is present if tokens are defined - # in the config file - for collection_name, _ in config_object.collections.items(): - config_tokens = instance_config.tokens.get(collection_name, {}) - if config_tokens: - if not any( - isinstance(auth_source, ConfigAuthenticationSource) - for auth_source in instance_config.auth_providers[collection_name] - ): - msg = ( - f'Collection `{collection_name}` has tokens defined in ' - 'configuration file, but no `config` authentication source' - ) - raise ConfigError(msg) - - # Check that hashed plain tokens do not clash with hashed tokens: - hashed_plain_tokens = { - hash_token(token) - for collection in instance_config.collections - for token in instance_config.tokens[collection] - if '-' in token - } 
- hashed_tokens = { - value - for token_dict in instance_config.hashed_tokens.values() - for value in token_dict.values() - } - if hashed_plain_tokens.intersection(hashed_tokens): - msg = 'plain tokens clash with hashed tokens' - raise ConfigError(msg) - - # Check tags - for collection_name, collection_info in config_object.collections.items(): - module = instance_config.model_info[collection_name][0] - try: - resolve_curie(module, collection_info.submission_tags.submission_time_tag) - except CurieResolutionError as e: - raise ConfigError(str(e)) from e - - return instance_config - - -def get_backend_and_extension(backend_type: str) -> tuple[str, str]: - elements = backend_type.split('+') - return (elements[0], elements[1]) if len(elements) > 1 else (elements[0], '') - - -def get_zone( - instance_config: InstanceConfig, - collection: str, - token: str, -) -> str | None: - """Get the zone for the given collection and token.""" - if collection not in instance_config.zones: - raise HTTPException( - status_code=HTTP_404_NOT_FOUND, - detail=f'No incoming zone defined for collection: {collection}', - ) - if token not in instance_config.zones[collection]: - raise HTTPException( - status_code=HTTP_404_NOT_FOUND, - detail=f'Missing incoming_label for given token in collection: {collection}', - ) - return instance_config.zones[collection][token] - - -def get_conversion_objects_for_collection( - instance_config: InstanceConfig, - collection_name: str, -) -> dict: - """Get the conversion objects for the given collection.""" - check_collection(instance_config, collection_name) - return instance_config.conversion_objects[instance_config.schemas[collection_name]] - - -def get_model_info_for_collection( - instance_config: InstanceConfig, - collection_name: str, -) -> tuple[types.ModuleType, dict[str, Any], str]: - check_collection(instance_config, collection_name) - return instance_config.model_info[collection_name] diff --git a/dump_things_service/converter.py 
b/dump_things_service/converter.py index 74d4348..ed85a8d 100644 --- a/dump_things_service/converter.py +++ b/dump_things_service/converter.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +from functools import cache from json import loads as json_loads from typing import ( TYPE_CHECKING, @@ -8,11 +9,11 @@ from typing import ( Callable, ) +from linkml_runtime import SchemaView from linkml.utils.datautils import ( get_dumper, get_loader, ) -from linkml_runtime import SchemaView from rdflib.term import ( URIRef, _toPythonMapping, @@ -24,9 +25,11 @@ from dump_things_service.lazy_list import LazyList from dump_things_service.model import ( get_model_for_schema, get_schema_model_for_schema, + get_schema_view, ) from dump_things_service.utils import cleaned_json + if TYPE_CHECKING: from types import ModuleType @@ -35,9 +38,6 @@ if TYPE_CHECKING: from dump_things_service.backends import RecordInfo -_cached_conversion_objects = {} - - class TypeValidator: def __init__( self, @@ -72,21 +72,27 @@ def add_type_validator( ) -def get_conversion_objects(schema: str): - if schema not in _cached_conversion_objects: - schema_view = SchemaView(schema) - _cached_conversion_objects[schema] = { - 'schema_module': get_schema_model_for_schema(schema), - 'schema_view': schema_view, - } - # Add types to support explicit type clauses in TTL - for type_definition in schema_view.all_types().values(): - uri = schema_view.expand_curie(type_definition.uri) - add_type_validator( - uri_ref=uri, - regex=type_definition.pattern, - ) - return _cached_conversion_objects[schema] +# Get conversion objects and prepare the conversion by adding type +# validators for explicit type expressions to the RDFLib loader. The +# latter is necessary to load the TTL in rdflib loader. 
+@cache +def get_conversion_objects(schema_location: str) -> dict: + schema_view = get_schema_view(schema_location) + result = { + 'schema_module': get_schema_model_for_schema(schema_location), + 'schema_view': schema_view, + } + + # Add types to support explicit type clauses in TTL + # TODO: this should probably be outside of a cached function and in a + # function with an appropriate name that indicates the side effect + for type_definition in schema_view.all_types().values(): + uri = schema_view.expand_curie(type_definition.uri) + add_type_validator( + uri_ref=uri, + regex=type_definition.pattern, + ) + return result class FormatConverter: diff --git a/dump_things_service/curated.py b/dump_things_service/curated.py index 7366343..dc79197 100644 --- a/dump_things_service/curated.py +++ b/dump_things_service/curated.py @@ -19,17 +19,18 @@ from fastapi_pagination import ( from dump_things_service import ( HTTP_401_UNAUTHORIZED, HTTP_404_NOT_FOUND, - HTTP_422_UNPROCESSABLE_CONTENT, + HTTP_422_UNPROCESSABLE_CONTENT, abstract_config, ) +from dump_things_service.abstract_config import check_collection, read_config, \ + get_config, get_token_config_for_representation_and_collection from dump_things_service.api_key import api_key_header_scheme from dump_things_service.backends.schema_type_layer import _SchemaTypeLayer -from dump_things_service.config import get_config from dump_things_service.exceptions import CurieResolutionError +from dump_things_service.instance_state import get_instance_state from dump_things_service.lazy_list import ModifierList from dump_things_service.utils import ( authenticate_token, check_bounds, - check_collection, cleaned_json, wrap_http_exception, ) @@ -39,7 +40,8 @@ if TYPE_CHECKING: from dump_things_service.backends import StorageBackend from dump_things_service.lazy_list import LazyList - from dump_things_service.store.model_store import ModelStore + from dump_things_service.store.model_store import _ModelStore + 
_endpoint_curated_template = """ async def {name}( @@ -79,8 +81,8 @@ async def read_curated_records_of_type( matching: str | None = None, api_key: str | None = Depends(api_key_header_scheme), ): - instance_config = get_config() - if class_name not in instance_config.use_classes[collection]: + instance_state = get_instance_state() + if class_name not in instance_state.collections[collection].active_classes: raise HTTPException( status_code=HTTP_404_NOT_FOUND, detail=f"No '{class_name}'-class in collection '{collection}'.", @@ -108,8 +110,8 @@ async def read_curated_records_of_type_paginated( api_key: str | None = Depends(api_key_header_scheme), ) -> Page[dict]: - instance_config = get_config() - if class_name not in instance_config.use_classes[collection]: + instance_state = get_instance_state() + if class_name not in instance_state.collections[collection].active_classes: raise HTTPException( status_code=HTTP_404_NOT_FOUND, detail=f"No '{class_name}'-class in collection '{collection}'.", @@ -207,10 +209,10 @@ async def _read_curated_records( pid: str | None, matching: str | None = None, api_key: str | None = None, - upper_bound: int = 1000, + upper_bound: int | None = 1000, ) -> LazyList | dict | None: - model_store, backend = await _get_store_and_backend(collection, api_key) + model_store, backend = _get_store_and_backend(collection, api_key) if pid: record_info = backend.get_record_by_iri(model_store.pid_to_iri(pid)) @@ -244,7 +246,7 @@ async def _delete_curated_record( api_key: str | None = None, ) -> bool: with wrap_http_exception(Exception): - model_store, backend = await _get_store_and_backend(collection, api_key) + model_store, backend = _get_store_and_backend(collection, api_key) result = backend.remove_record(model_store.pid_to_iri(pid)) if not result: raise HTTPException( @@ -255,10 +257,10 @@ async def _delete_curated_record( return True -async def _get_store_and_backend( +def _get_store_and_backend( collection: str, plain_token: str | None, -) -> 
tuple[ModelStore, StorageBackend]: +) -> tuple[_ModelStore, StorageBackend]: # A token is required if plain_token is None: @@ -267,13 +269,14 @@ async def _get_store_and_backend( detail='token required', ) - instance_config = get_config() + instance_state = get_instance_state() + abstract_config = read_config(instance_state.store_path) # Check that the collection exists - check_collection(instance_config, collection) + check_collection(abstract_config=abstract_config, collection=collection) # Get token permissions - auth_info = authenticate_token(instance_config, collection, plain_token) + auth_info = authenticate_token(instance_state, collection, plain_token) permissions = auth_info.token_permission if permissions.curated_write is False: raise HTTPException( @@ -282,90 +285,26 @@ async def _get_store_and_backend( ) # Get the curated model store - model_store = instance_config.curated_stores[collection] + model_store = instance_state.curated_stores[collection] backend = model_store.backend if isinstance(backend, _SchemaTypeLayer): return model_store, backend.backend return model_store, backend -def create_curated_endpoints( - app: FastAPI, - tag_info: list[dict[str, str]], - placeholder: str, - global_dict: dict, +def store_curated_record( + collection: str, + data: BaseModel, + class_name: str, + author_id: str | None = None, + api_key: str | None = Depends(api_key_header_scheme), ): - # Create endpoints for all classes in all collections - logger.info('Creating dynamic curated endpoints...') - serial_number = count() - - instance_config = get_config() - generated_tags = [] - - for collection, ( - model, - classes, - model_var_name, - ) in instance_config.model_info.items(): - - tag_name = f'Curated area: write records to curated area of collection "{collection}"' - - if model_var_name not in global_dict: - global_dict[model_var_name] = model - - for class_name in instance_config.use_classes[collection]: - - # Create an endpoint to dump data of type `class_name` 
of schema - # `application`. - endpoint_name = f'_endpoint_curated_{next(serial_number)}' - - endpoint_source = _endpoint_curated_template.format( - name=endpoint_name, - model_var_name=model_var_name, - class_name=class_name, - collection=collection, - info=f"'store {collection}/{class_name} objects'", - ) - exec(endpoint_source, global_dict) # noqa S102 - - # Create an API route for the endpoint - app.add_api_route( - path=f'/{collection}/curated/record/{class_name}', - endpoint=global_dict[endpoint_name], - methods=['POST'], - name=f'curated area: store "{class_name}" object (schema: {model.linkml_meta["id"]})', - response_model=None, - tags=[tag_name] - ) - - generated_tags.append({ - 'name': tag_name, - 'description': f'(requires **curator token**)', - }) - - index = tag_info.index({'name': placeholder, 'description': ''}) - tag_info[index:index + 1] = generated_tags - - logger.info( - 'Creation of %d curated endpoints completed.', - next(serial_number), - ) - - -async def store_curated_record( - collection: str, - data: BaseModel, - class_name: str, - author_id: str | None = None, - api_key: str | None = Depends(api_key_header_scheme), -): - - instance_config = get_config() + instance_state = get_instance_state() with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): - instance_config.validators[collection].validate(data) + instance_state.validators[collection].validate(data) pid = data.pid - model_store, backend = await _get_store_and_backend(collection, api_key) + model_store, backend = _get_store_and_backend(collection, api_key) json_object = cleaned_json( data.model_dump(exclude_none=True, mode='json'), @@ -379,9 +318,14 @@ async def store_curated_record( json_object, ) - for audit_backend in instance_config.audit_backends[collection]: + _, token_config, _ = get_token_config_for_representation_and_collection( + abstract_config=get_config(), + token_representation=api_key, + collection_name=collection, 
+ ) + for audit_backend in instance_state.audit_backends[collection]: audit_backend.add_record( record=json_object, - committer_id=instance_config.tokens[collection][api_key]['user_id'], + committer_id=token_config.user_id, author_id=author_id, ) diff --git a/dump_things_service/dynamic_endpoints.py b/dump_things_service/dynamic_endpoints.py deleted file mode 100644 index 913160a..0000000 --- a/dump_things_service/dynamic_endpoints.py +++ /dev/null @@ -1,137 +0,0 @@ -import logging -from itertools import count - -from fastapi import FastAPI - -from dump_things_service.config import InstanceConfig - -logger = logging.getLogger('dump_things_service') - - -_endpoint_template = """ -async def {name}( - data: {model_var_name}.{class_name} | Annotated[str, Body(media_type='text/plain')], - api_key: str = Depends(api_key_header_scheme), - format: Format = Format.json, -) -> JSONResponse | PlainTextResponse: - logger.info('{name}(%s, %s, %s, %s)', repr(data), repr('{class_name}'), repr({model_var_name}), repr(format)) - return {handler}('{collection}', data, '{class_name}', {model_var_name}, format, api_key) -""" - - -def create_store_endpoints( - app: FastAPI, - instance_config: InstanceConfig, - tag_info: list[dict[str, str]], - placeholder: str, - global_dict: dict, -): - # Create endpoints for all classes in all collections - logger.info('Creating dynamic store_record endpoints...') - serial_number = count() - - generated_tags = [] - - for collection, ( - model, - classes, - model_var_name, - ) in instance_config.model_info.items(): - - tag_name = f'Write records to collection "{collection}"' - - global_dict[model_var_name] = model - for class_name in instance_config.use_classes[collection]: - - # Create an endpoint to dump data of type `class_name` in version - # `version` of schema `application`. 
- endpoint_name = f'_endpoint_{next(serial_number)}' - - endpoint_source = _endpoint_template.format( - name=endpoint_name, - model_var_name=model_var_name, - class_name=class_name, - collection=collection, - info=f"'store {collection}/{class_name} objects'", - handler='store_record', - ) - exec(endpoint_source, global_dict) # noqa S102 - - # Create an API route for the endpoint - app.add_api_route( - path=f'/{collection}/record/{class_name}', - endpoint=global_dict[endpoint_name], - methods=['POST'], - name=f'store "{class_name}" object (schema: {model.linkml_meta["id"]})', - response_model=None, - tags=[tag_name] - ) - - generated_tags.append({ - 'name': tag_name, - 'description': '', - }) - - index = tag_info.index({'name': placeholder, 'description': ''}) - tag_info[index:index + 1] = generated_tags - - logger.info('Creation of %d endpoints completed.', next(serial_number)) - - -def create_validate_endpoints( - app: FastAPI, - instance_config: InstanceConfig, - tag_info: list[dict[str, str]], - placeholder: str, - global_dict: dict, -): - # Create endpoints for all classes in all collections - logger.info('Creating dynamic validate_record endpoints...') - serial_number = count() - - generated_tags = [] - - for collection, ( - model, - classes, - model_var_name, - ) in instance_config.model_info.items(): - - tag_name = f'Validate records for collection "{collection}"' - - global_dict[model_var_name] = model - for class_name in instance_config.use_classes[collection]: - - # Create an endpoint to dump data of type `class_name` in version - # `version` of schema `application`. 
- endpoint_name = f'_endpoint_validate_{next(serial_number)}' - - endpoint_source = _endpoint_template.format( - name=endpoint_name, - model_var_name=model_var_name, - class_name=class_name, - collection=collection, - info=f"'validate {collection}/{class_name} objects'", - handler='validate_record', - ) - exec(endpoint_source, global_dict) # noqa S102 - - # Create an API route for the endpoint - app.add_api_route( - path=f'/{collection}/validate/record/{class_name}', - endpoint=global_dict[endpoint_name], - methods=['POST'], - name=f'Validate "{class_name}" object (schema: {model.linkml_meta["id"]})', - response_model=None, - tags=[tag_name] - ) - - generated_tags.append({ - 'name': tag_name, - 'description': '', - }) - - index = tag_info.index({'name': placeholder, 'description': ''}) - tag_info[index:index + 1] = generated_tags - - logger.info('Creation of %d endpoints completed.', next(serial_number)) diff --git a/dump_things_service/exceptions.py b/dump_things_service/exceptions.py index aa703e7..8a3b8ae 100644 --- a/dump_things_service/exceptions.py +++ b/dump_things_service/exceptions.py @@ -2,5 +2,9 @@ class ConfigError(Exception): pass +class ConfigCollisionError(ConfigError): + pass + + class CurieResolutionError(Exception): pass diff --git a/dump_things_service/export/__init__.py b/dump_things_service/export/__init__.py deleted file mode 100644 index 7f1c00b..0000000 --- a/dump_things_service/export/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .json import export_json -from .tree import export_tree - -exporter_info = { - 'json': export_json, - 'tree': export_tree, -} diff --git a/dump_things_service/export/json.py b/dump_things_service/export/json.py deleted file mode 100644 index 122ee52..0000000 --- a/dump_things_service/export/json.py +++ /dev/null @@ -1,138 +0,0 @@ -import json -import sys -from pathlib import Path -from typing import TextIO - -from dump_things_service.config import InstanceConfig -from dump_things_service.lazy_list import LazyList 
-from dump_things_service.model import get_classes -from dump_things_service.store.model_store import ModelStore - -level_width = 2 - - -# The _lookahead function is taken from: -# https://stackoverflow.com/questions/1630320/what-is-the-pythonic-way-to-detect-the-last-element-in-a-for-loop -# with small changes -def _lookahead(iterable): - """Pass through all values from the given iterable, augmented by the - information if there are more values to come after the current one - (True), or if it is the last value (False). - """ - # Get an iterator and pull the first value. - it = iter(iterable) - try: - last = next(it) - except StopIteration: - return - # Run the iterator to exhaustion (starting from the second value). - for val in it: - # Report the *previous* value (more to come). - yield last, False - last = val - # Report the last value. - yield last, True - - -def export_json( - instance_config: InstanceConfig, - destination: str, -): - if destination == '-': - output = sys.stdout - else: - output = Path(destination).open('wt', encoding='utf-8') # noqa: SIM115 - - output.write('{\n') - for collection, is_last in _lookahead(instance_config.collections): - output.write(f'{level_width * " "}"{collection}": {{\n') - export_collection(instance_config, collection, 2 * level_width, output) - if is_last: - output.write(f'\n{level_width * " "}}}\n') - else: - output.write(f'\n{level_width * " "}}},\n') - output.write('}\n') - - -def export_collection( - instance_config: InstanceConfig, - collection: str, - indent: int, - output: TextIO, -): - output.write(f'{indent * " "}"schema": "{instance_config.schemas[collection]}",\n') - output.write(f'{indent * " "}"curated": {{\n') - append_classes( - instance_config.curated_stores[collection], indent + level_width, output - ) - output.write(f'\n{indent * " "}}}') - - # Determine stores for incoming zones - zones = { - label: instance_config.token_stores[token]['collections'] - .get(collection, {}) - .get('store') - for token, 
label in instance_config.zones.get(collection, {}).items() - if instance_config.token_stores[token]['collections'] - .get(collection, {}) - .get('store') - is not None - } - - if zones: - # Put a comma between "curated" and "incoming". - output.write(f',\n{indent * " "}"incoming": {{\n') - indent_zone = indent + level_width - indent_classes = indent_zone + level_width - for (zone, store), is_last in _lookahead(zones.items()): - output.write(f'{indent_zone * " "}"{zone}": {{\n') - append_classes(store, indent_classes, output) - if is_last: - output.write(f'\n{(indent + level_width) * " "}}}') - else: - output.write(f'\n{(indent + level_width) * " "}}},\n') - - # End the "incoming" dictionary - output.write(f'\n{indent * " "}}}') - - -def append_classes( - store: ModelStore, - indent: int, - output: TextIO, -): - """Append instances of all classes to the file""" - class_names = get_classes(store.model) - - first = True - for class_name in class_names: - # We know that pure `Thing` instances are not stored in the store. 
- if class_name == 'Thing': - continue - - class_instances = store.get_objects_of_class( - class_name, include_subclasses=False - ) - if class_instances: - if not first: - output.write(',\n') - first = False - output.write(f'{indent * " "}"{class_name}": [\n') - append_instances( - class_instances, - output, - indent + level_width, - ) - output.write(f'\n{indent * " "}]') - - -def append_instances( - instances: LazyList, - output: TextIO, - indent: int, -): - for instance, is_last in _lookahead(instances): - json_string = json.dumps(instance.json_object, ensure_ascii=False) - output.write(f'{(indent + level_width) * " "}{json_string}') - if not is_last: - output.write(',\n') diff --git a/dump_things_service/export/tree.py b/dump_things_service/export/tree.py deleted file mode 100644 index d12855f..0000000 --- a/dump_things_service/export/tree.py +++ /dev/null @@ -1,115 +0,0 @@ -from pathlib import Path - -import yaml - -from dump_things_service.config import ( - InstanceConfig, - get_mapping_function_by_name, -) -from dump_things_service.model import get_classes -from dump_things_service.store.model_store import ModelStore - -idfx = get_mapping_function_by_name('digest-md5-p3-p3') - - -def export_tree( - instance_config: InstanceConfig, - destination: str, -): - destination = Path(destination) - if destination.exists() and not destination.is_dir(): - msg = 'The export_tree destination path must be a directory.' 
- raise ValueError(msg) - - destination.mkdir(parents=True, exist_ok=True) - for collection in instance_config.collections: - export_collection( - instance_config, - collection, - destination, - ) - - -def export_collection( - instance_config: InstanceConfig, - collection: str, - destination: Path, -): - collection_destination = destination / collection - collection_destination.mkdir(parents=True, exist_ok=True) - - config_content = ( - 'type: records\n' - 'version: 1\n' - f'schema: {instance_config.schemas[collection]}\n' - 'format: yaml\n' - 'idfx: digest-md5-p3-p3\n' - ) - - curated_destination = collection_destination / 'curated' - curated_destination.mkdir(parents=True, exist_ok=True) - (curated_destination / '.dumpthings.yaml').write_text(config_content) - exported_stores = { - id(instance_config.curated_stores[collection]): curated_destination - } - export_classes(instance_config.curated_stores[collection], curated_destination) - - # Determine stores for incoming zones - zones = { - label: instance_config.token_stores[token]['collections'] - .get(collection, {}) - .get('store') - for token, label in instance_config.zones.get(collection, {}).items() - if instance_config.token_stores[token]['collections'] - .get(collection, {}) - .get('store') - is not None - } - - if zones: - incoming_destination = collection_destination / 'incoming' - for zone, store in zones.items(): - zone_destination = incoming_destination / zone - if id(store) in exported_stores: - # Already exported this store, make `zone_destination` a link - # to the existing export. 
- zone_destination.parent.mkdir(parents=True, exist_ok=True) - zone_destination.symlink_to(exported_stores[id(store)]) - continue - exported_stores[id(store)] = zone_destination = ( - collection_destination / 'incoming' / zone - ) - zone_destination.mkdir(parents=True, exist_ok=True) - (zone_destination / '.dumpthings.yaml').write_text(config_content) - export_classes(store, zone_destination) - - -def export_classes( - store: ModelStore, - destination: Path, -): - class_names = get_classes(store.model) - for class_name in class_names: - # We know that pure `Thing` instances are not stored in the store. - if class_name == 'Thing': - continue - - record_infos = store.get_objects_of_class(class_name, include_subclasses=False) - if record_infos: - class_destination = destination / class_name - class_destination.mkdir(parents=True, exist_ok=True) - for record_info in record_infos: - json_object = record_info.json_object - instance_destination = class_destination / idfx( - json_object['pid'], - 'yaml', - ) - instance_destination.parent.mkdir(parents=True, exist_ok=True) - instance_destination.write_text( - yaml.dump( - data=json_object, - sort_keys=False, - allow_unicode=True, - default_flow_style=False, - ) - ) diff --git a/dump_things_service/incoming.py b/dump_things_service/incoming.py index ce123fc..2d888cd 100644 --- a/dump_things_service/incoming.py +++ b/dump_things_service/incoming.py @@ -1,13 +1,11 @@ from __future__ import annotations import logging -from itertools import count from typing import TYPE_CHECKING from fastapi import ( APIRouter, Depends, - FastAPI, HTTPException, ) from fastapi_pagination import ( @@ -21,19 +19,22 @@ from dump_things_service import ( HTTP_404_NOT_FOUND, HTTP_422_UNPROCESSABLE_CONTENT, ) +from dump_things_service.abstract_config import ( + check_collection, + check_label, + get_config_labels, + get_config, +) from dump_things_service.api_key import api_key_header_scheme from dump_things_service.backends.schema_type_layer import 
_SchemaTypeLayer -from dump_things_service.config import get_config from dump_things_service.exceptions import CurieResolutionError +from dump_things_service.instance_state import get_instance_state from dump_things_service.lazy_list import ModifierList from dump_things_service.utils import ( authenticate_token, check_bounds, - check_collection, - check_label, cleaned_json, create_token_store, - get_config_labels, get_on_disk_labels, wrap_http_exception, ) @@ -43,28 +44,7 @@ if TYPE_CHECKING: from dump_things_service.backends import StorageBackend from dump_things_service.lazy_list import LazyList - from dump_things_service.store.model_store import ModelStore - -_endpoint_incoming_template = """ -async def {name}( - data: {model_var_name}.{class_name}, - label: str, - api_key: str = Depends(api_key_header_scheme), -) -> JSONResponse: - logger.info( - '{name}(%s, %s, %s)', - repr(data), - repr(label), - repr({model_var_name}), - ) - return await store_incoming_record( - '{collection}', - label, - data, - '{class_name}', - api_key, - ) -""" + from dump_things_service.store.model_store import _ModelStore logger = logging.getLogger('dump_things_service') @@ -83,8 +63,10 @@ async def incoming_read_labels( ) -> list[str]: # Authorize api_key await authorize_zones(collection, api_key) + + instance_state = get_instance_state() configured_labels = get_config_labels(get_config(), collection) - on_disk_labels = get_on_disk_labels(get_config(), collection) + on_disk_labels = get_on_disk_labels(instance_state.store_path, get_config(), collection) return list(configured_labels.union(on_disk_labels)) @@ -100,8 +82,8 @@ async def incoming_read_records_of_type( matching: str | None = None, api_key: str | None = Depends(api_key_header_scheme), ): - instance_config = get_config() - if class_name not in instance_config.use_classes[collection]: + instance_state = get_instance_state() + if class_name not in instance_state.collections[collection].active_classes: raise HTTPException( 
status_code=HTTP_404_NOT_FOUND, detail=f"No '{class_name}'-class in collection '{collection}'.", @@ -131,8 +113,8 @@ async def incoming_read_records_of_type_paginated( api_key: str | None = Depends(api_key_header_scheme), ) -> Page[dict]: - instance_config = get_config() - if class_name not in instance_config.use_classes[collection]: + instance_state = get_instance_state() + if class_name not in instance_state.collections[collection].active_classes: raise HTTPException( status_code=HTTP_404_NOT_FOUND, detail=f"No '{class_name}'-class in collection '{collection}'.", @@ -293,49 +275,58 @@ async def _get_store_and_backend( collection: str, label: str, plain_token: str | None, -) -> tuple[ModelStore, StorageBackend]: +) -> tuple[_ModelStore, StorageBackend]: # Authorize api_key await authorize_zones(collection, plain_token) # Check that the incoming zone exists - instance_config = get_config() - check_label(instance_config, collection, label) + instance_state = get_instance_state() + abstract_config = get_config() + check_label(instance_state.store_path, abstract_config, collection, label) # Create a store (or get an already created store) for collection # `collection` and storage dir `store_dir`. store_dir = ( - instance_config.store_path - / instance_config.incoming[collection] - / label + instance_state.store_path + / abstract_config.collections[collection].incoming + / label ) # `create_token_store` will cache and return already created stores with # the same collection and storage dir. model_store = create_token_store( - instance_config=instance_config, + abstract_configuration=abstract_config, + instance_state=instance_state, collection_name=collection, store_dir=store_dir, ) + xxx = """ # For consistency, associate the store with all matching tokens from the - # configuration file. + # configuration file. 
That means with all tokens that have the same + # input matching_tokens = [ - token - for token, token_info in instance_config.tokens[collection].items() - if token_info['incoming_label'] == label + token_name + for token_name, token_info in abstract_config.tokens.items() + if (collection, label) in [ + (collection_name, token_collection_info.incoming_label) + for collection_name, token_collection_info in token_info.items() + ] ] + for matching_token in matching_tokens: # Associate the store with all matching tokens in the configuration. # Note: there are stores that are not associated with a token in - # the configuration. These are stores that belong to a token that - # are authenticated with an external authentication source. - token_info = instance_config.tokens[collection][matching_token] - instance_config.token_stores[collection][matching_token] = ( + # the abstract configuration. These are stores that belong to a token + # that is authenticated with an external authentication source. 
+ token_info = instance_state.tokens[collection][matching_token] + instance_state.token_stores[collection][matching_token] = ( model_store, matching_token, token_info['permissions'], token_info['user_id'], ) + """ backend = model_store.backend if isinstance(backend, _SchemaTypeLayer): @@ -354,12 +345,13 @@ async def authorize_zones( detail='token required', ) - instance_config = get_config() + abstract_config = get_config() + instance_state = get_instance_state() # Check that the collection exists - check_collection(instance_config, collection) + check_collection(abstract_config, collection) - auth_info = authenticate_token(instance_config, collection, plain_token) + auth_info = authenticate_token(instance_state, collection, plain_token) permissions = auth_info.token_permission if permissions.zones_access is False: raise HTTPException( @@ -368,69 +360,6 @@ async def authorize_zones( ) -def create_incoming_endpoints( - app: FastAPI, - tag_info: list[dict[str, str]], - placeholder: str, - global_dict: dict, -): - # Create endpoints for all classes in all collections - logger.info('Creating dynamic incoming endpoints...') - serial_number = count() - - instance_config = get_config() - generated_tags = [] - - for collection, ( - model, - classes, - model_var_name, - ) in instance_config.model_info.items(): - - tag_name = f'Incoming area: write records to the given incoming area of collection "{collection}"' - - if model_var_name not in global_dict: - global_dict[model_var_name] = model - - for class_name in instance_config.use_classes[collection]: - - # Create an endpoint to dump data of type `class_name` of schema - # `model`. 
- endpoint_name = f'_endpoint_incoming_{next(serial_number)}' - - endpoint_source = _endpoint_incoming_template.format( - name=endpoint_name, - model_var_name=model_var_name, - class_name=class_name, - collection=collection, - info=f"'store {collection}/{class_name} objects'", - ) - exec(endpoint_source, global_dict) # noqa S102 - - # Create an API route for the endpoint - app.add_api_route( - path=f'/{collection}/incoming/{{label}}/record/{class_name}', - endpoint=global_dict[endpoint_name], - methods=['POST'], - name=f'incoming area: store "{class_name}" object (schema: {model.linkml_meta["id"]})', - response_model=None, - tags=[tag_name] - ) - - generated_tags.append({ - 'name': tag_name, - 'description': f'(requires **curator token**)', - }) - - index = tag_info.index({'name': placeholder, 'description': ''}) - tag_info[index:index + 1] = generated_tags - - logger.info( - 'Creation of %d incoming endpoints completed.', - next(serial_number), - ) - - async def store_incoming_record( collection: str, label: str, @@ -439,9 +368,9 @@ async def store_incoming_record( api_key: str | None = Depends(api_key_header_scheme), ): - instance_config = get_config() + instance_state = get_instance_state() with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): - instance_config.validators[collection].validate(data) + instance_state.validators[collection].validate(data) pid = data.pid model_store, backend = await _get_store_and_backend( diff --git a/dump_things_service/instance_state.py b/dump_things_service/instance_state.py new file mode 100644 index 0000000..359a185 --- /dev/null +++ b/dump_things_service/instance_state.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +import dataclasses +import logging +from functools import cache +from pathlib import Path +from types import ModuleType +from typing import ( + Any, + Callable, +) + +import yaml +from fastapi import FastAPI +from linkml_runtime import SchemaView 
+from pydantic import ValidationError +from yaml.scanner import ScannerError + +from dump_things_service.abstract_config import ( + RecordDirConfigFileContent, + MappingMethod, + mapping_functions, +) + +from dump_things_service.converter import get_conversion_objects +from dump_things_service.exceptions import ConfigError +from dump_things_service.model import ( + get_model_for_schema, + get_schema_model_for_schema, + get_schema_view, +) + + +logger = logging.getLogger('dump_things_service') + +record_dir_config_file_name = '.dumpthings.yaml' +ignored_files = {'.', '..', record_dir_config_file_name} + + +@dataclasses.dataclass +class PydanticModuleInfo: + module: ModuleType + module_var_name: str + + +@dataclasses.dataclass +class SchemaInfo: + schema_view: SchemaView + classes: list[str] + pydantic_module_info: PydanticModuleInfo + python_module: ModuleType + conversion_objects: tuple[Any, Any] + + +@dataclasses.dataclass +class InstanceStateCollectionInfo: + active_classes: set[str] + tag_info: dict[str, str] + + +@cache +def get_schema_info(schema_location: str): + module, classes, module_var_name = get_model_for_schema(schema_location) + return SchemaInfo( + schema_view=get_schema_view(schema_location), + classes=classes, + pydantic_module_info=PydanticModuleInfo( + module=module, + module_var_name=module_var_name, + ), + python_module=get_schema_model_for_schema(schema_location), + conversion_objects=get_conversion_objects(schema_location), + ) + + +@dataclasses.dataclass +class InstanceState: + # foundational information from command line or initialization code + store_path: Path + bootstrap_token: str | None + + # Dynamically created elements + fastapi_app: FastAPI + + # Influenced by maintainer interface + maintenance_mode: set = dataclasses.field(default_factory=set) + + # Created based on abstract configuration + collections: dict[str, InstanceStateCollectionInfo] = dataclasses.field(default_factory=dict) + tokens: dict = 
dataclasses.field(default_factory=dict) + auth_sources: dict[str, list] = dataclasses.field(default_factory=dict) + audit_backends: dict[str, list] = dataclasses.field(default_factory=dict) + curated_stores: dict = dataclasses.field(default_factory=dict) + incoming_stores: dict = dataclasses.field(default_factory=dict) + schema_info: dict[str, SchemaInfo] = dataclasses.field(default_factory=dict) + validators: dict = dataclasses.field(default_factory=dict) + order_by: list[str] = dataclasses.field(default_factory=list) + + +g_instance_state:InstanceState | None = None + + +def create_instance_state( + store_path: Path, + bootstrap_token: str, + fastapi_app: FastAPI, +) -> InstanceState: + global g_instance_state + + if g_instance_state: + logger.warning('create_instance_state() already called') + else: + g_instance_state = InstanceState( + store_path=store_path, + bootstrap_token=bootstrap_token, + fastapi_app=fastapi_app, + ) + return g_instance_state + + +def get_instance_state() -> InstanceState: + global g_instance_state + + if not g_instance_state: + msg = 'get_instance_state() called before create_instance_state()' + raise RuntimeError(msg) + return g_instance_state + + +def get_record_dir_config( + path: Path, + file_name: str = record_dir_config_file_name, +) -> RecordDirConfigFileContent: + config_path = path / file_name + if not config_path.exists(): + msg = f'Config file does not exist: {config_path}' + raise ConfigError(msg) + try: + return RecordDirConfigFileContent( + **yaml.load(config_path.read_text(), Loader=yaml.SafeLoader) + ) + except ScannerError as e: + msg = f'YAML-error while reading config file {config_path}: {e}' + raise ConfigError(msg) from e + except ValidationError as e: + msg = f'Pydantic-error reading config file {config_path}: {e}' + raise ConfigError(msg) from e + + +def get_mapping_function_by_name(mapping_function_name: str) -> Callable: + return mapping_functions[MappingMethod(mapping_function_name)] + + +def 
get_mapping_function(collection_config: RecordDirConfigFileContent): + return mapping_functions[collection_config.idfx] diff --git a/dump_things_service/lazy_list.py b/dump_things_service/lazy_list.py index 91ca11a..83eb0de 100644 --- a/dump_things_service/lazy_list.py +++ b/dump_things_service/lazy_list.py @@ -177,7 +177,7 @@ class PriorityList(LazyList): """ def __init__( - self, + self, ): super().__init__() self.seen = set() diff --git a/dump_things_service/main.py b/dump_things_service/main.py index 39fccec..d2cb0ed 100644 --- a/dump_things_service/main.py +++ b/dump_things_service/main.py @@ -2,23 +2,23 @@ from __future__ import annotations # noqa: I001 -- the patches have to be impor import argparse import logging +import os +import sys from pathlib import Path -from typing import ( - Annotated, # noqa F401 -- used by generated code - Any, - TYPE_CHECKING, -) +from typing import TYPE_CHECKING +from dump_things_service.abstract_config import store_config +from dump_things_service.commands.load_config import convert_to_new_format +from dump_things_service.manifest import manifest_configuration # Perform the patching before importing any third-party libraries -from dump_things_service.patches import enabled # noqa: F401 +from dump_things_service.patches import enabled # noqa F401 -- used by generated code +import yaml import uvicorn from fastapi import ( - Body, # noqa F401 -- used by generated code Depends, FastAPI, HTTPException, - Response, # noqa F401 -- used by generated code ) from fastapi.middleware.cors import CORSMiddleware from fastapi_pagination import ( @@ -29,11 +29,9 @@ from fastapi_pagination import ( from fastapi_pagination.utils import disable_installed_extensions_check from pydantic import ( BaseModel, - TypeAdapter, - ValidationError, + Field, ) from starlette.responses import ( - JSONResponse, PlainTextResponse, RedirectResponse, ) @@ -42,51 +40,38 @@ from dump_things_service import ( HTTP_400_BAD_REQUEST, HTTP_403_FORBIDDEN, 
HTTP_404_NOT_FOUND, - HTTP_422_UNPROCESSABLE_CONTENT, Format, - config_file_name, ) from dump_things_service.__about__ import __version__ -from dump_things_service.api_key import api_key_header_scheme -from dump_things_service.config import ( - get_config, - process_config, +from dump_things_service.abstract_config import ( + Configuration, + check_collection, + hash_token_representation, + read_config, ) +from dump_things_service.api_key import api_key_header_scheme from dump_things_service.converter import ( FormatConverter, ConvertingList, ) -from dump_things_service.curated import ( - create_curated_endpoints, - router as curated_router, - store_curated_record, # noqa F401 -- used by generated code -) +from dump_things_service.curated import router as curated_router from dump_things_service.exceptions import CurieResolutionError -from dump_things_service.incoming import ( - create_incoming_endpoints, - router as incoming_router, - store_incoming_record, # noqa F401 -- used by generated code -) -from dump_things_service.dynamic_endpoints import ( - create_store_endpoints, - create_validate_endpoints, -) +from dump_things_service.incoming import router as incoming_router +from dump_things_service.instance_state import create_instance_state, \ + InstanceState from dump_things_service.lazy_list import ( PriorityList, ModifierList, ) -from dump_things_service.model import ( - get_classes, - get_subclasses, +from dump_things_service.model import get_subclasses +from dump_things_service.collection_endpoints import router as collection_router +from dump_things_service.token_endpoints import ( + hash_matcher, + router as token_router, ) from dump_things_service.utils import ( authenticate_token, check_bounds, - check_collection, - combine_ttl, - get_default_token_name, - get_token_store, - join_default_token_permissions, process_token, wrap_http_exception, ) @@ -102,7 +87,7 @@ class MaintenanceRequest(BaseModel): class ServerCollectionResponse(BaseModel): name: str - 
schema: str + schema_location: str = Field(alias='schema') classes: list[str] @@ -124,11 +109,24 @@ parser = argparse.ArgumentParser() parser.add_argument('--host', default='0.0.0.0') # noqa S104 parser.add_argument('--port', default=8000, type=int) parser.add_argument('--origins', action='append', default=[]) +parser.add_argument( + '--admin-token-hash', + type=str, + default='', + help='The sha256 hash of an initial admin token that will allow to add or ' + 'remove tokens, collections, and additional admin tokens (64 ' + 'characters hex-digit). NOTE: an admin token in plaintext is read ' + 'from the environment variable `DTS_ADMIN_TOKEN` if it is set, and ' + 'if this option is not provided.', +) parser.add_argument( '-c', '--config', metavar='CONFIG_FILE', - help="Read the configuration from 'CONFIG_FILE' instead of looking for it in the data store root directory. ", + help="Read the configuration from 'CONFIG_FILE' if no persisted " + "configuration is found in the data store root directory, and " + "initialize the persistent configuration and the service state with " + "the values in 'CONFIG_FILE'.", ) parser.add_argument( '--root-path', @@ -142,7 +140,7 @@ parser.add_argument( ) parser.add_argument( 'store', - help='The root of the data stores, it should contain a global_store and token_stores.', + help='The root of the data store, it should contain a global_store and token_stores.', ) @@ -162,250 +160,139 @@ Curators store data in an incoming area or in the curated area and read data from any incoming area or the curated area. -For more information refer to the [README-file](https://github.com/christian-monch/dump-things-server?tab=readme-ov-file#dump-things-service) +For more information refer to the [README-file](https://hub.psychoinformatics.de/orinoco/dump-things-server) of the project. 
""" -tag_info = [ - { - 'name': 'Server management', - 'description': 'General server operations', - }, - { - 'name': 'Read records', - 'description': 'Read records from the given collection', - }, - { - 'name': 'placeholder_write', - 'description': '', - }, - { - 'name': 'placeholder_validate', - 'description': '', - }, - { - 'name': 'Delete records', - 'description': 'Delete records from the incoming area associated with the authorization token', - }, - { - 'name': 'Curated area: read records', - 'description': 'Read records only from the curated area of the given collection (requires **curator token**)', - }, - { - 'name': 'placeholder_curated_write', - 'description': '', - }, - { - 'name': 'Curated area: delete records', - 'description': 'Delete records from the curated area of the given collection (requires **curator token**)', - }, - { - 'name': 'Incoming area: read labels', - 'description': 'Read labels of all incoming areas for the given collection (requires **curator token**)', - }, - { - 'name': 'Incoming area: read records', - 'description': 'Read records from the given incoming area of the given collection (requires **curator token**)', - }, - { - 'name': 'placeholder_incoming_write', - 'description': '', - }, - { - 'name': 'Incoming area: delete records', - 'description': 'Delete records from the given incoming area of the given collection (requires **curator token**)', - }, -] - - arguments = parser.parse_args() + +# Try to get bootstrap token from environment if an admin token hash is +# not provided via option +if not arguments.admin_token_hash: + if 'DTS_ADMIN_TOKEN' in os.environ: + arguments.admin_token_hash = hash_token_representation( + os.environ.get('DTS_ADMIN_TOKEN', ''), + ) +else: + # Validate the hash token format + if not hash_matcher.match(arguments.admin_token_hash): + print( + 'Hashed admin token is not a 64-digits hex-number', + file=sys.stderr, + flush=True, + ) + sys.exit(1) + + # Set the log level numeric_level = getattr(logging, 
arguments.log_level.upper(), None) if not isinstance(numeric_level, int): logger.error( - 'Invalid log level: %s, defaulting to level "WARNING"', arguments.log_level + 'Invalid log level: %s, defaulting to level "WARNING"', + arguments.log_level, ) else: logger.setLevel(level=numeric_level) - store_path = Path(arguments.store).resolve() if not store_path.exists(): logger.error(f'Store path does not exist: {store_path}') raise SystemExit(1) -config_path = ( - Path(arguments.config).resolve() if arguments.config else store_path / config_file_name -) -if not config_path.exists(): - logger.error(f'Config file does not exist: {config_path}') - raise SystemExit(1) - - -process_config( - store_path=store_path, - config_file=config_path, - order_by=['pid'], - globals_dict=globals(), -) -g_instance_config = get_config() - - disable_installed_extensions_check() + app = FastAPI( title='Dump Things Service', description=description, version=__version__, - openapi_tags=tag_info ) + app.include_router(curated_router) app.include_router(incoming_router) +app.include_router(token_router) +app.include_router(collection_router) + +# Add CORS origins +app.add_middleware( + CORSMiddleware, + allow_origins=arguments.origins, + allow_credentials=True, + allow_methods=['*'], + allow_headers=['*'], +) + +# Add pagination +add_pagination(app) -def store_record( - collection: str, - data: BaseModel | str, - class_name: str, - model: Any, - input_format: Format, - api_key: str | None = Depends(api_key_header_scheme), -) -> JSONResponse | PlainTextResponse: - if input_format == Format.json and isinstance(data, str): - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail='Invalid JSON data provided.' 
+g_instance_state = create_instance_state( + store_path=store_path, + bootstrap_token=arguments.admin_token_hash, + fastapi_app=app, +) + + +g_configuration = read_config(store_path) + + +def initialize_from_config_file( + instance_state: InstanceState, + config_file: str, +) -> Configuration: + with open(config_file) as f: + config_dict = yaml.safe_load(f) + + config_version = config_dict['version'] + if config_version == 1: + logger.info( + 'Converting version 1 configuration at %s', + arguments.config, + ) + config_dict = convert_to_new_format( + config_dict, + instance_state.store_path, + ) + elif config_version != 2: + msg = f'Invalid version in config file: {config_version}' + raise ValueError(msg) + + return Configuration(**config_dict) + + +# If the configuration is empty, check for configuration option +if not ( + g_configuration.admin_tokens + or g_configuration.collections + or g_configuration.tokens +): + if arguments.config: + logger.info( + 'Initializing empty persisted configuration from %s', + arguments.config, + ) + g_configuration = initialize_from_config_file( + g_instance_state, + arguments.config, + ) + # Persist the configuration + store_config( + store_path=g_instance_state.store_path, + config=g_configuration, ) - if input_format == Format.ttl and not isinstance(data, str): - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail='Invalid ttl data provided.' - ) - check_collection(g_instance_config, collection) - - token = ( - get_default_token_name(g_instance_config, collection) - if api_key is None - else api_key - ) - - # Get the token permissions and extend them by the default permissions. - # This call will also convert plaintext tokens into the hashed version of - # the token, if the token is hashed. This is necessary because we do not - # store the plaintext token, so all token-information is associated with - # the hashed representation of the token. 
- store, token, token_permissions, user_id = get_token_store( - g_instance_config, - collection, - token, - ) - final_permissions = join_default_token_permissions( - g_instance_config, token_permissions, collection - ) - if not final_permissions.incoming_write: - raise HTTPException( - status_code=HTTP_403_FORBIDDEN, - detail=f"Not authorized to submit to collection '{collection}'.", - ) - - if input_format == Format.ttl: - with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Conversion error'): - json_object = FormatConverter( - g_instance_config.schemas[collection], - input_format=Format.ttl, - output_format=Format.json, - ).convert(data, class_name) - with wrap_http_exception(ValidationError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): - record = TypeAdapter(getattr(model, class_name)).validate_python(json_object) - else: - record = data - - with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): - g_instance_config.validators[collection].validate(record) - - with wrap_http_exception(CurieResolutionError): - stored_records = store.store_object(obj=record, submitter=user_id) - - if input_format == Format.ttl: - format_converter = FormatConverter( - g_instance_config.schemas[collection], - input_format=Format.json, - output_format=Format.ttl, - ) - with wrap_http_exception(ValueError, header='Conversion error'): - return PlainTextResponse( - combine_ttl( - [ - format_converter.convert( - record, - class_name, - ) - for class_name, record in stored_records - ] - ), - media_type='text/turtle', - ) - return JSONResponse([record for _, record in stored_records]) +manifest_configuration( + configuration=g_configuration, + instance_state=g_instance_state, +) -def validate_record( - collection: str, - data: BaseModel | str, - class_name: str, - model: Any, - input_format: Format, - api_key: str | None = Depends(api_key_header_scheme), -) -> JSONResponse: - 
if input_format == Format.json and isinstance(data, str): - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail='Invalid JSON data provided.' - ) - - if input_format == Format.ttl and not isinstance(data, str): - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail='Invalid ttl data provided.' - ) - - check_collection(g_instance_config, collection) - - token = ( - get_default_token_name(g_instance_config, collection) - if api_key is None - else api_key - ) - - store, token, token_permissions, user_id = get_token_store( - g_instance_config, - collection, - token, - ) - final_permissions = join_default_token_permissions( - g_instance_config, token_permissions, collection - ) - if not final_permissions.incoming_write: - raise HTTPException( - status_code=HTTP_403_FORBIDDEN, - detail=f"Not authorized to validate records for collection '{collection}'.", - ) - - if input_format == Format.ttl: - with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Conversion error'): - json_object = FormatConverter( - g_instance_config.schemas[collection], - input_format=Format.ttl, - output_format=Format.json, - ).convert(data, class_name) - with wrap_http_exception(ValidationError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): - TypeAdapter(getattr(model, class_name)).validate_python(json_object) - else: - # Try to convert it into TTL to detect potential errors before storing - # the record - with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): - g_instance_config.validators[collection].validate(data) - - return JSONResponse(True) +g_instance_state.fastapi_app.openapi_schema = None +g_instance_state.fastapi_app.setup() +add_pagination(g_instance_state.fastapi_app) @app.get('/', response_class=RedirectResponse) @@ -424,10 +311,10 @@ async def server() -> ServerResponse: collections = [ ServerCollectionResponse( name=collection_name, - 
schema=g_instance_config.schemas[collection_name], - classes=g_instance_config.model_info[collection_name][1], + schema=g_configuration.collections[collection_name].schema_location, + classes=g_instance_state.schema_info[g_configuration.collections[collection_name].schema_location].classes, ) - for collection_name in g_instance_config.collections + for collection_name in g_configuration.collections ] ) @@ -441,7 +328,6 @@ async def maintenance( body: MaintenanceRequest, api_key: str | None = Depends(api_key_header_scheme), ): - if api_key is None: raise HTTPException( status_code=HTTP_400_BAD_REQUEST, @@ -453,8 +339,8 @@ async def maintenance( # Try to authenticate the token with the authentication providers that # are associated with the collection. - check_collection(g_instance_config, collection) - auth_info = authenticate_token(g_instance_config, collection, api_key) + check_collection(g_configuration, collection) + auth_info = authenticate_token(g_instance_state, collection, api_key) permissions = auth_info.token_permission if not ( @@ -468,9 +354,9 @@ async def maintenance( ) if active: - g_instance_config.maintenance_mode.add(collection) + g_instance_state.maintenance_mode.add(collection) else: - g_instance_config.maintenance_mode.remove(collection) + g_instance_state.maintenance_mode.remove(collection) return @@ -485,10 +371,10 @@ async def read_record_with_pid( format: Format = Format.json, # noqa A002 api_key: str = Depends(api_key_header_scheme), ): - check_collection(g_instance_config, collection) + check_collection(g_configuration, collection) final_permissions, token_store = await process_token( - g_instance_config, api_key, collection + g_configuration, g_instance_state, api_key, collection ) class_name, json_object = None, None @@ -498,7 +384,7 @@ async def read_record_with_pid( if not json_object and final_permissions.curated_read: with wrap_http_exception(CurieResolutionError, header='CURIE error:'): - class_name, json_object = 
g_instance_config.curated_stores[ + class_name, json_object = g_instance_state.curated_stores[ collection ].get_object_by_pid(pid) @@ -507,7 +393,7 @@ async def read_record_with_pid( if format == Format.ttl: converter = FormatConverter( - schema=g_instance_config.schemas[collection], + schema=g_configuration.collections[collection].schema_location, input_format=Format.json, output_format=format, ) @@ -623,9 +509,9 @@ async def _read_all_records( detail=f'Conversion error: {e}', ) from e - check_collection(g_instance_config, collection) + check_collection(g_configuration, collection) final_permissions, token_store = await process_token( - g_instance_config, api_key, collection + g_configuration, g_instance_state, api_key, collection ) result_list = PriorityList() @@ -636,7 +522,7 @@ async def _read_all_records( result_list.add_list(token_store_list) if final_permissions.curated_read: - curated_store_list = g_instance_config.curated_stores[ + curated_store_list = g_instance_state.curated_stores[ collection ].get_all_objects( matching=matching, @@ -651,7 +537,7 @@ async def _read_all_records( if format == Format.ttl: result_list = ConvertingList( result_list, - g_instance_config.schemas[collection], + g_configuration.collections[collection].schema_location, input_format=Format.json, output_format=format, exception_handler=convert_to_http_exception, @@ -678,16 +564,17 @@ async def _read_records_of_type( detail=f'Conversion error: {e}', ) from e - check_collection(g_instance_config, collection) - model = g_instance_config.model_info[collection][0] - if class_name not in g_instance_config.use_classes[collection]: + check_collection(g_configuration, collection) + schema_location = g_configuration.collections[collection].schema_location + model = g_instance_state.schema_info[schema_location].pydantic_module_info.module + if class_name not in g_instance_state.collections[collection].active_classes: raise HTTPException( status_code=HTTP_404_NOT_FOUND, detail=f"No 
'{class_name}'-class in collection '{collection}'.", ) final_permissions, token_store = await process_token( - g_instance_config, api_key, collection + g_configuration, g_instance_state, api_key, collection ) result_list = PriorityList() @@ -703,7 +590,7 @@ async def _read_records_of_type( if final_permissions.curated_read: for search_class_name in get_subclasses(model, class_name): - curated_store_list = g_instance_config.curated_stores[ + curated_store_list = g_instance_state.curated_stores[ collection ].get_objects_of_class( class_name=search_class_name, @@ -719,7 +606,7 @@ async def _read_records_of_type( if format == Format.ttl: result_list = ConvertingList( result_list, - g_instance_config.schemas[collection], + schema_location, input_format=Format.json, output_format=format, exception_handler=convert_to_http_exception, @@ -742,9 +629,9 @@ async def delete_record( pid: str, api_key: str = Depends(api_key_header_scheme), ): - check_collection(g_instance_config, collection) + check_collection(g_configuration, collection) final_permissions, token_store = await process_token( - g_instance_config, api_key, collection + g_configuration, g_instance_state, api_key, collection ) if not final_permissions.incoming_write: @@ -764,29 +651,6 @@ async def delete_record( return True -# Create dynamic endpoints and rebuild the app to include all dynamically -# created endpoints. 
-create_store_endpoints(app, g_instance_config, tag_info, 'placeholder_write', globals()) -create_validate_endpoints(app, g_instance_config, tag_info, 'placeholder_validate', globals()) -create_curated_endpoints(app, tag_info, 'placeholder_curated_write', globals()) -create_incoming_endpoints(app, tag_info, 'placeholder_incoming_write', globals()) -app.openapi_schema = None -app.setup() - - -# Add CORS origins -app.add_middleware( - CORSMiddleware, - allow_origins=arguments.origins, - allow_credentials=True, - allow_methods=['*'], - allow_headers=['*'], -) - -# Add pagination -add_pagination(app) - - def main(): uvicorn.run( app, diff --git a/dump_things_service/manifest.py b/dump_things_service/manifest.py new file mode 100644 index 0000000..71cb722 --- /dev/null +++ b/dump_things_service/manifest.py @@ -0,0 +1,209 @@ +import logging + +from fastapi_pagination import add_pagination + +from dump_things_service.abstract_config import ( + Configuration, + TokenConfig, +) +from dump_things_service.collection import ( + create_collection, + delete_endpoints_for_collection, +) +from dump_things_service.instance_state import InstanceState + + +logger = logging.getLogger('dump_things_service') + +tag_groups = [ + 'write', + 'validate', + 'curated_write', + 'incoming_write', +] + +openapi_tags_template = [ + { + 'name': 'Server management', + 'description': 'General server operations', + }, + { + 'name': 'Read records', + 'description': 'Read records from the given collection', + }, + 'write', + 'validate', + { + 'name': 'Delete records', + 'description': 'Delete records from the incoming area associated with the authorization token', + }, + 'curated_write', + { + 'name': 'Curated area: delete records', + 'description': 'Delete records from the curated area of the given collection (requires **curator token**)', + }, + { + 'name': 'Incoming area: read labels', + 'description': 'Read labels of all incoming areas for the given collection (requires **curator token**)', + }, + 
{ + 'name': 'Incoming area: read records', + 'description': 'Read records from the given incoming area of the given collection (requires **curator token**)', + }, + 'incoming_write', + { + 'name': 'Incoming area: delete records', + 'description': 'Delete records from the given incoming area of the given collection (requires **curator token**)', + }, +] + + + +def manifest_configuration( + configuration: Configuration, + instance_state: InstanceState, +): + """Interpret the configuration and instantiate respective objects + + For every collection in the configuration this method will: + + - create a `ModelStore`-instance with correct `Backend`-instances and + check for compatibility with existing data + - create schema-related objects + - add schema class-specific http-endpoints to: + -- validate records + -- create records in the user's inbox + -- create records in the curated area + -- create records in a specific incoming area + + Objects and endpoints that belong to a non-existing configuration are + deleted. + + If objects for a collection do already exist, they are kept unmodified + and are not validated. That means changes of existing configuration objects + are not possible. To modify a collection or token configuration, the + configuration has to be deleted and created in the new state. + + In case of an error in the configuration, no objects will be created for + the respective collection or token. + + Tokens can be updated. Collections cannot be updated; to modify the + configuration of a collection, the collection must be deleted and + recreated with the modified configuration. Deleting a collection will + not delete the data of the collection. If the collection is recreated with + the same backend- and directory-configuration, the data will be accessible + in the new collection as well. It is, however, not recommended to update + the schema of a collection. This will most likely break the service on this + collection.
+ + When collections are deleted, some tokens might still refer to them; this + is silently ignored. This makes it possible to delete a collection and delete or + modify the token later. + + Default-tokens are not validated when a new collection is created. This + allows creating a collection first and then the default token. The cost + is that a "default-token unknown" error might be created when accessing + a collection. + """ + + # Determine the changes in collections. + existing_collections = set(instance_state.collections) + configured_collections = set(configuration.collections) + new_collection_names = configured_collections - existing_collections + deleted_collection_names = existing_collections - configured_collections + + # Delete collection objects of collections that are no longer in the + # configuration (we do not delete the collection from token-objects here + # because token-objects are all re-created below). + for collection_name in deleted_collection_names: + delete_endpoints_for_collection(instance_state, collection_name) + delete_collection(instance_state, collection_name) + + # Create the internal representation objects for collections that have been + # added to the configuration. + for collection_name in new_collection_names: + create_collection( + instance_state, + configuration, + collection_name, + ) + + # Delete all token objects and recreate the tokens. This ensures that + # modified token scope and permissions are set for all tokens.
+ for token_name in list(instance_state.tokens): + delete_token(instance_state, token_name) + + for token_name, token_configuration in configuration.tokens.items(): + create_token( + instance_state, + token_name, + token_configuration, + ) + + if new_collection_names or deleted_collection_names: + instance_state.fastapi_app.openapi_schema = None + instance_state.fastapi_app.openapi_tags = create_openapi_tags( + instance_state, + openapi_tags_template, + ) + instance_state.fastapi_app.setup() + add_pagination(instance_state.fastapi_app) + + # We do not create any incoming areas for configuration-file tokens + # here. The reason is that the configuration does not fully determine + # the possible incoming areas because incoming areas come from + # authentication sources and the configuration-file authentication source + # is just one possible authentication source. Other authentication sources + # have unknown means to create incoming area labels. + # Incoming areas are therefore created when a write request for a token + # is authorized. + + +def create_token( + instance_state: InstanceState, + token_name: str, + token_configuration: TokenConfig, +): + instance_state.tokens[token_name] = token_configuration + + +def delete_token( + instance_state: InstanceState, + token_name: str, +): + instance_state.tokens.pop(token_name) + + +def delete_collection( + instance_state: InstanceState, + collection_name: str, +): + instance_state.collections.pop(collection_name) + + # TODO: remove further collection-related information from + # instance_state. Maybe all collection-specific information + # should go into the instance_state.collection[x]-object!? + # That would allow to remove it easily. + + +def create_openapi_tags( + instance_state: InstanceState, + openapi_tags_template: list[dict | str], +) -> list[dict]: + + # Collect tag name lists for all tag groups that we have defined. 
+ tag_group_info = { + tag_group: sorted( + [ + {'name': collection_info.tag_info[tag_group]} + for collection_info in instance_state.collections.values() + ], + key=lambda x: x['name'] + ) + for tag_group in tag_groups + } + result = openapi_tags_template.copy() + for tag_group, tag_list in tag_group_info.items(): + index = result.index(tag_group) + result[index:index + 1] = tag_list + return result diff --git a/dump_things_service/model.py b/dump_things_service/model.py index e8fb7c0..9155490 100644 --- a/dump_things_service/model.py +++ b/dump_things_service/model.py @@ -3,8 +3,8 @@ from __future__ import annotations import dataclasses # noqa F401 -- used by generated code import logging import sys +from functools import cache from itertools import count -from types import ModuleType from typing import ( TYPE_CHECKING, Any, @@ -35,10 +35,6 @@ lgr = logging.getLogger('dump_things_service') serial_number = count() _model_counter = count() -_model_cache = {} -_schema_model_cache = {} -_schema_view_cache = {} - # Pydantic module generation might require a higher recursion limit than the # default. Add a mechanism to increase it as needed, up to a maximum. @@ -58,6 +54,9 @@ def get_subclasses( class_name: str, ) -> list[str]: """get names of all subclasses (includes class_name itself)""" + + # TODO: this could also be implemented via SchemaView: + # return schema_view.class_children(class_name, mixins=False) super_class = getattr(model, class_name) return [ name @@ -66,6 +65,21 @@ def get_subclasses( ] +# TODO: shall we use the following code? +# The code below would use schema-definitions to determine classes and not +# go through the pydantic module generation.
+@cache +def get_subclasses_2( + collection_name: str, + class_name: str, +) -> list[str]: + from dump_things_service.instance_state import get_instance_state + + instance_state = get_instance_state() + schema_view = instance_state.schema_info[collection_name].schema_view + return schema_view.class_children(class_name, mixins=False) + + def compile_module_with_increasing_recursion_limit( pydantic_generator: PydanticGenerator, schema_location: str, @@ -99,33 +113,29 @@ def compile_module_with_increasing_recursion_limit( return module +@cache def get_model_for_schema( schema_location: str, ) -> tuple[ModuleType, list[str], str]: - if schema_location not in _model_cache: - lgr.info(f'Building model for schema {schema_location}.') - pydantic_generator = PydanticGenerator(schema_location) - model = compile_module_with_increasing_recursion_limit( - pydantic_generator, - schema_location, - ) - classes = get_classes(model) - model_var_name = f'model_{next(_model_counter)}' - _model_cache[schema_location] = model, classes, model_var_name - return _model_cache[schema_location] + lgr.info(f'Building pydantic modulr for schema {schema_location}') + pydantic_generator = PydanticGenerator(schema_location) + model = compile_module_with_increasing_recursion_limit( + pydantic_generator, + schema_location, + ) + classes = get_classes(model) + model_var_name = f'model_{next(_model_counter)}' + return model, classes, model_var_name +@cache def get_schema_view(schema_location: str) -> SchemaView: - if schema_location not in _schema_view_cache: - _schema_view_cache[schema_location] = SchemaView(schema_location) - return _schema_view_cache[schema_location] + return SchemaView(schema_location) +@cache def get_schema_model_for_schema( schema_location: str, ) -> ModuleType: - if schema_location not in _schema_model_cache: - _schema_model_cache[schema_location] = PythonGenerator( - schema_location - ).compile_module() - return _schema_model_cache[schema_location] + lgr.info(f'Building python 
module for schema {schema_location}') + return PythonGenerator(schema_location).compile_module() diff --git a/dump_things_service/store/model_store.py b/dump_things_service/store/model_store.py index 2242abc..cb72c03 100644 --- a/dump_things_service/store/model_store.py +++ b/dump_things_service/store/model_store.py @@ -17,7 +17,7 @@ if TYPE_CHECKING: from pydantic import BaseModel from dump_things_service.backends import ( - RecordInfo, + _RecordInfo, StorageBackend, ) from dump_things_service.lazy_list import LazyList @@ -29,10 +29,10 @@ submitter_namespace = 'http://purl.obolibrary.org/obo/' class _ModelStore: def __init__( - self, - schema: str, - backend: StorageBackend, - tags: dict[str, str] + self, + schema: str, + backend: StorageBackend, + tags: dict[str, str] ): self.schema = schema self.model = get_model_for_schema(self.schema)[0] @@ -43,9 +43,9 @@ class _ModelStore: return self.backend.get_uri() def store_object( - self, - obj: BaseModel, - submitter: str, + self, + obj: BaseModel, + submitter: str, ) -> Iterable[tuple[str, dict]]: if obj.__class__.__name__ == 'Thing': msg = f'Cannot store `Thing` instance: {obj}.' 
@@ -65,15 +65,15 @@ class _ModelStore: ] def pid_to_iri( - self, - pid: str, + self, + pid: str, ): return resolve_curie(self.model, pid) def _store_flat_object( - self, - obj: BaseModel, - submitter: str, + self, + obj: BaseModel, + submitter: str, ) -> dict: iri = self.pid_to_iri(obj.pid) class_name = obj.__class__.__name__ @@ -93,9 +93,9 @@ class _ModelStore: return json_object def annotate( - self, - json_object: dict, - submitter: str, + self, + json_object: dict, + submitter: str, ) -> None: """Add submitter IRI to the record annotations, use CURIE if possible""" json_object['annotations'] = self.homogenize_annotations(json_object) @@ -112,8 +112,8 @@ class _ModelStore: } def get_curie( - self, - curie_or_iri: str, + self, + curie_or_iri: str, ) -> str: if is_curie(curie_or_iri): return curie_or_iri @@ -130,8 +130,8 @@ class _ModelStore: return curie_or_iri def extract_inlined( - self, - record: BaseModel, + self, + record: BaseModel, ) -> list[BaseModel]: # The trivial case: no relations if not hasattr(record, 'relations') or record.relations is None: @@ -158,14 +158,14 @@ class _ModelStore: return [new_record, *extracted_sub_records] def get_object_by_pid( - self, - pid: str, + self, + pid: str, ) -> tuple[str, dict] | tuple[None, None]: return self.get_object_by_iri(self.pid_to_iri(pid)) def get_object_by_iri( - self, - iri: str, + self, + iri: str, ) -> tuple[str, dict] | tuple[None, None]: record_info = self.backend.get_record_by_iri(iri) if record_info: @@ -173,12 +173,12 @@ class _ModelStore: return None, None def get_objects_of_class( - self, - class_name: str, - matching: str | None, - *, - include_subclasses: bool = True, - ) -> LazyList[RecordInfo]: + self, + class_name: str, + matching: str | None, + *, + include_subclasses: bool = True, + ) -> LazyList[_RecordInfo]: """ Get all objects of a specific class. 
@@ -196,9 +196,9 @@ class _ModelStore: return self.backend.get_records_of_classes(class_names, matching) def get_all_objects( - self, - matching: str | None = None, - ) -> LazyList[RecordInfo]: + self, + matching: str | None = None, + ) -> LazyList[_RecordInfo]: """ Get all objects of a specific class. @@ -208,8 +208,8 @@ class _ModelStore: return self.backend.get_all_records(matching) def delete_object( - self, - pid: str, + self, + pid: str, ) -> bool: return self.backend.remove_record(self.pid_to_iri(pid)) @@ -218,15 +218,18 @@ _existing_model_stores = {} def ModelStore( # noqa: N802 - schema: str, - backend: StorageBackend, - tags: dict[str, str], + schema: str, + backend: StorageBackend, + tags: dict[str, str], ) -> _ModelStore: - """ - Create a unique model store for the given schema and backend. + """Create a unique model store for the given schema and backend. + + Raise `ValueError` if a store with a different schema already exists for + the given backend. :param schema: The schema to use for the model store. :param backend: The storage backend to use. + :param tags: Tags that will be used for annotations :return: An instance of _ModelStore. """ existing_model_store, _ = _existing_model_stores.get(id(backend), (None, None)) @@ -235,4 +238,10 @@ def ModelStore( # noqa: N802 # We store a pointer to the backend in the value to ensure that the # backend object exists while we use its `id` as a key. _existing_model_stores[id(backend)] = existing_model_store, backend + else: + # Check that the schemas are compatible, if the backend is reused. 
+ if existing_model_store.schema != schema: + msg = 'Backend is already used in a ModelStore with a different schema' + raise ValueError(msg) + return existing_model_store diff --git a/dump_things_service/tests/__init__.py b/dump_things_service/tests/__init__.py index e69de29..d5185ae 100644 --- a/dump_things_service/tests/__init__.py +++ b/dump_things_service/tests/__init__.py @@ -0,0 +1,4 @@ +from pathlib import Path + +# Path to a local simple test schema +schema_file = Path(__file__).parent / 'testschema.yaml' diff --git a/dump_things_service/tests/create_store.py b/dump_things_service/tests/create_store.py index 2248fe6..5f038c0 100644 --- a/dump_things_service/tests/create_store.py +++ b/dump_things_service/tests/create_store.py @@ -4,18 +4,16 @@ from typing import TYPE_CHECKING import yaml +from dump_things_service.backends.record_dir import RecordDirStore from dump_things_service.backends.sqlite import ( SQLiteBackend, -) -from dump_things_service.backends.sqlite import ( record_file_name as sqlite_record_file_name, ) -from dump_things_service.config import ( - BackendConfigRecordDir, +from dump_things_service.abstract_config import ( + RecordDirBackendConfig, CollectionConfig, - GlobalConfig, + Configuration, MappingMethod, - config_file_name, mapping_functions, ) from dump_things_service.model import get_model_for_schema @@ -25,6 +23,8 @@ if TYPE_CHECKING: from pathlib import Path +config_file_name = '.dumpthings.yaml' + collection_config_template = """type: records version: 1 schema: {schema} @@ -58,21 +58,11 @@ faulty_yaml = ': : -: : :' def create_store( root_dir: Path, - config: GlobalConfig, + abstract_config: Configuration, per_collection_info: dict[str, tuple[str, str]], default_entries: dict[str, list[tuple[str, str, str]]] | None = None, ): - # Create the global config file - config_text = yaml.safe_dump( - config.model_dump(mode='json', exclude_none=True), - allow_unicode=True, - sort_keys=False, - ) - with open(root_dir / config_file_name, 'w') 
as f: - f.write(config_text) - - # Create all collection directories - for collection_name, collection_config in config.collections.items(): + for collection_name, collection_config in abstract_config.collections.items(): create_collection( root_dir=root_dir, collection_config=collection_config, @@ -102,7 +92,7 @@ def create_collection( curated_dir.mkdir(parents=True, exist_ok=True) if collection_config.backend is None: - collection_config.backend = BackendConfigRecordDir(type='record_dir+stl') + collection_config.backend = RecordDirBackendConfig(type='record_dir+stl') if collection_config.backend.type == 'record_dir+stl': # Add the collection level config file diff --git a/dump_things_service/tests/fixtures.py b/dump_things_service/tests/fixtures.py index 491553e..21f3b87 100644 --- a/dump_things_service/tests/fixtures.py +++ b/dump_things_service/tests/fixtures.py @@ -1,13 +1,31 @@ import sys -from pathlib import Path +from pathlib import ( + Path, + PurePosixPath, +) +from types import ModuleType import pytest import yaml -from dump_things_service import config_file_name -from dump_things_service.config import GlobalConfig +from dump_things_service.abstract_config import ( + GitAuditBackendConfig, + SQLiteBackendConfig, + TokenCollectionConfig, + TokenModes, hash_token_representation, TagSpec, +) +from dump_things_service.backends import StorageBackend +from dump_things_service.backends.record_dir import RecordDirStore +from dump_things_service.backends.sqlite import ( + SQLiteBackend, + record_file_name as sqlite_db_filename, +) +from dump_things_service.collection_endpoints import CollectionRequest +from dump_things_service.instance_state import get_mapping_function_by_name +from dump_things_service.model import get_model_for_schema +from dump_things_service.resolve_curie import resolve_curie +from dump_things_service.token_endpoints import TokenRequest from dump_things_service.tests.create_store import ( - create_store, pid, pid_curated, pid_trr, @@ -16,322 
+34,409 @@ from dump_things_service.tests.create_store import ( test_record_trr, ) + # String representation of curated- and incoming-path curated = 'curated' incoming = 'incoming' # Path to a local simple test schema -schema_path = Path(__file__).parent / 'testschema.yaml' +test_schema_location = str((Path(__file__).parent / 'testschema.yaml').absolute()) +flat_social_schema_location = 'https://concepts.datalad.org/s/flat-social/unreleased.yaml' -# The global configuration file, all collections and -# staging areas share the same directories. All tokens -# of the same collection share an "incoming_label". -global_config_text = f""" -type: collections -version: 1 -collections: - collection_1: - default_token: basic_access - curated: {curated}/in_token_1 - incoming: {incoming} - backend: - type: record_dir+stl - auth_sources: - - type: config - submission_tags: - submitter_id_tag: oxo:NCIT_C54269 - submission_time_tag: https://time - audit_backends: - - type: gitaudit - path: {{audit_store_path}} - auto_flush_timeout: 2 - collection_2: - default_token: basic_access - curated: {curated}/collection_2 - incoming: incoming_2 - backend: - type: record_dir+stl - collection_3: - default_token: basic_access - curated: {curated}/collection_3 - incoming: incoming_3 - backend: - type: record_dir+stl - collection_4: - default_token: basic_access - curated: {curated}/collection_4 - incoming: incoming_4 - backend: - type: record_dir+stl - collection_5: - default_token: basic_access - curated: {curated}/collection_5 - incoming: incoming_5 - backend: - type: record_dir+stl - collection_6: - default_token: basic_access - curated: {curated}/collection_6 - incoming: incoming_6 - backend: - type: record_dir+stl - collection_7: - default_token: basic_access - curated: {curated}/collection_7 - incoming: incoming_7 - backend: - type: record_dir+stl - collection_8: - default_token: basic_access - curated: {curated}/collection_8 - incoming: incoming_8 - backend: - type: sqlite - schema: 
{schema_path} - collection_dlflatsocial-1: - default_token: basic_access - curated: {curated}/collection_dlflatsocial-1 - incoming: {incoming}/collection_dlflatsocial-1 - backend: - type: record_dir+stl - collection_dlflatsocial-2: - default_token: basic_access - curated: {curated}/collection_dlflatsocial-2 - incoming: {incoming}/collection_dlflatsocial-2 - backend: - type: sqlite - schema: https://concepts.datalad.org/s/flat-social/unreleased.yaml - use_classes: - - Organization - - Person - - Project - ignore_classes: - - Organization - - Project +# The test store is created empty and collections are added via the admin +# web interface. +g_default_collections = [ + CollectionRequest( + name=f'collection_{i}', + default_token='test_default_token', + curated=PurePosixPath(f'{curated}/collection_{i}'), + schema=test_schema_location, + incoming=PurePosixPath(f'{incoming}/collection_{i}'), + ) + for i in range(1, 8) +] -tokens: - basic_access: - user_id: anonymous - collections: - collection_1: - mode: READ_CURATED - incoming_label: '' - collection_2: - mode: READ_CURATED - incoming_label: '' - collection_3: - mode: READ_CURATED - incoming_label: '' - collection_4: - mode: READ_CURATED - incoming_label: '' - collection_5: - mode: READ_CURATED - incoming_label: '' - collection_6: - mode: READ_CURATED - incoming_label: '' - collection_7: - mode: READ_CURATED - incoming_label: '' - collection_8: - mode: READ_CURATED - incoming_label: '' - collection_dlflatsocial-1: - mode: READ_CURATED - incoming_label: '' - collection_dlflatsocial-2: - mode: READ_CURATED - incoming_label: '' - cmo-33b726a7e2b9eaf1f8f124049822ade31cb6516a4d8221634b01d13d793bfe16: - hashed: True - user_id: cmo - collections: - collection_1: - mode: WRITE_COLLECTION - incoming_label: cmo - # The plaintext of the following is `token-1`: - token-6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b: - hashed: True - user_id: test_user_1 - collections: - collection_1: - mode: WRITE_COLLECTION - 
incoming_label: in_token_1 - collection_dlflatsocial-1: - mode: WRITE_COLLECTION - incoming_label: in_token_1 - collection_dlflatsocial-2: - mode: WRITE_COLLECTION - incoming_label: in_token_1 - token_1_xxooo: - user_id: test_user_1_read_collection - collections: - collection_1: - mode: READ_COLLECTION - incoming_label: modes - token_1_xxxoo: - user_id: test_user_1_write_collection - collections: - collection_1: - mode: WRITE_COLLECTION - incoming_label: modes - token_1_oxooo: - user_id: test_user_1_read_submissions - collections: - collection_1: - mode: READ_SUBMISSIONS - incoming_label: modes - token_1_oxxoo: - user_id: test_user_1_write_submissions - collections: - collection_1: - mode: WRITE_SUBMISSIONS - incoming_label: modes - token_1_xoxoo: - user_id: test_user_1_submit - collections: - collection_1: - mode: SUBMIT - incoming_label: modes - token_1_ooxoo: - user_id: test_user_1_submit_only - collections: - collection_1: - mode: SUBMIT_ONLY - incoming_label: modes - token_1_ooxoo: - user_id: test_user_1_submit_only - collections: - collection_1: - mode: SUBMIT_ONLY - incoming_label: modes - token_1_xoooo: - user_id: test_user_1_read_curated - collections: - collection_1: - mode: READ_CURATED - incoming_label: modes - token_1_ooooo: - user_id: test_user_1_nothing - collections: - collection_1: - mode: NOTHING - incoming_label: modes - token_1_xxxxx: - user_id: test_user_1_curated - collections: - collection_1: - mode: CURATOR - incoming_label: modes - collection_8: - mode: CURATOR - incoming_label: modes - token_admin: - user_id: test_admin - collections: - collection_1: - mode: CURATOR - incoming_label: admin_1 - collection_2: - mode: CURATOR - incoming_label: admin_2 - collection_3: - mode: CURATOR - incoming_label: admin_3 - collection_4: - mode: CURATOR - incoming_label: admin_4 - collection_5: - mode: CURATOR - incoming_label: admin_common - collection_6: - mode: CURATOR - incoming_label: admin_common - collection_7: - mode: CURATOR - incoming_label: 
admin_common - collection_8: - mode: CURATOR - incoming_label: admin_common - token-2: - user_id: test_user_2 - collections: - collection_2: - mode: WRITE_COLLECTION - incoming_label: in_token-2 - token-8: - user_id: test_user_8 - collections: - collection_8: - mode: WRITE_COLLECTION - incoming_label: test_user_8 -""" +g_default_collections[6].submission_tags = TagSpec( + submitter_id_tag='abc:id', + submission_time_tag='abc:time', +) + +g_default_collections.append( + CollectionRequest( + name=f'collection_8', + default_token='test_default_token', + curated=PurePosixPath(f'{curated}/collection_8'), + schema=test_schema_location, + incoming=PurePosixPath(f'{incoming}/collection_8'), + backend=SQLiteBackendConfig( + type='sqlite', + ), + submission_tags=TagSpec( + submitter_id_tag='no_default_id_tag', + submission_time_tag='no_default_time_tag', + ) + ) +) + +g_default_collections.extend([ + CollectionRequest( + name='collection_dlflatsocial-1', + schema=flat_social_schema_location, + default_token='test_default_token', + curated=PurePosixPath(f'{curated}/collection_dlflatsocial-1'), + incoming=PurePosixPath(f'{incoming}/collection_dlflatsocial-1'), + ), + CollectionRequest( + name='collection_dlflatsocial-2', + schema=flat_social_schema_location, + default_token='test_default_token', + curated=PurePosixPath(f'{curated}/collection_dlflatsocial-2'), + incoming=PurePosixPath(f'{incoming}/collection_dlflatsocial-2'), + backend=SQLiteBackendConfig( + type='sqlite', + ), + use_classes=[ + 'Organization', + 'Person', + 'Project', + ], + ignore_classes=[ + 'Organization', + 'Project', + ], + ), +]) + +g_default_tokens = [ + TokenRequest( + name='test_default_token', + user_id='basic_access_user', + hashed=False, + representation='basic_access', + collections={ + **{ + f'collection_{i}': TokenCollectionConfig( + mode=TokenModes.READ_CURATED, + ) + for i in range(1, 9) + }, + **{ + f'collection_dlflatsocial-{i}': TokenCollectionConfig( + mode=TokenModes.READ_CURATED, + ) + 
for i in range(1, 3) + }, + }, + ), + TokenRequest( + name='Test token for some collections', + user_id='test_user_1', + hashed=False, + representation='token-1', + collections={ + collection_name: TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + incoming_label='in_token_1', + ) + for collection_name in ( + 'collection_1', + 'collection_dlflatsocial-1', + 'collection_dlflatsocial-2', + ) + }, + ), + TokenRequest( + name='Test token for collection_2', + user_id='test_user_2', + hashed=False, + representation='token-2', + collections={ + f'collection_2': TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + incoming_label='in_token-2', + ) + }, + ), + TokenRequest( + name='Test token for collection_8', + user_id='test_user_8', + hashed=False, + representation='token-8', + collections={ + f'collection_8': TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + incoming_label='test_user_8', + ) + }, + ), + TokenRequest( + name='Test token for all collections', + user_id='user_all', + hashed=False, + representation='token-all', + collections={ + **{ + f'collection_{i}': TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + incoming_label='token-all:user_all', + ) + for i in range(1, 9) + }, + **{ + f'collection_dlflatsocial-{i}': TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + incoming_label='token-all:user_all', + ) + for i in range(1, 3) + }, + }, + ), + TokenRequest( + name='Test Curator Token', + user_id='test_curator', + representation='token_curator', + collections={ + f'collection_{i}': TokenCollectionConfig( + mode=TokenModes.CURATOR, + incoming_label=f'admin_{i}' if i < 5 else 'admin_common', + ) + for i in range(1, 9) + }, + ), + TokenRequest( + name='Test Hashed Token', + user_id='test_hashed', + representation='token-hashed', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + incoming_label='token-hashed-1', + ), + }, + ), + TokenRequest( + name='Test XX000 
(READ_COLLECTION)', + user_id='test_user_1_read_collection', + representation='token_1_xxooo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.READ_COLLECTION, + incoming_label='modes', + ), + }, + ), + TokenRequest( + name='Test XXX00 (WRITE_COLLECTION)', + user_id='test_user_1_write_collection', + representation='token_1_xxxoo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + incoming_label='modes', + ), + } + ), + TokenRequest( + name='Test 0X000 (READ_SUBMISSIONS)', + user_id='test_user_1_read_submissions', + representation='token_1_oxooo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.READ_SUBMISSIONS, + incoming_label='modes', + ), + }, + ), + TokenRequest( + name='Test 0XX00 (WRITE_SUBMISSIONS)', + user_id='test_user_1_write_submissions', + representation='token_1_oxxoo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.WRITE_SUBMISSIONS, + incoming_label='modes', + ), + }, + ), + TokenRequest( + name='Test X0X00 (SUBMIT)', + user_id='test_user_1_submit', + representation='token_1_xoxoo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.SUBMIT, + incoming_label='modes', + ), + }, + ), + TokenRequest( + name='Test 00X00 (SUBMIT_ONLY)', + user_id='test_user_1_submit_only', + representation='token_1_ooxoo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.SUBMIT_ONLY, + incoming_label='modes', + ), + }, + ), + TokenRequest( + name='Test X0000 (READ_CURATED)', + user_id='test_user_1_read_curated', + representation='token_1_xoooo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.READ_CURATED, + incoming_label='modes', + ), + }, + ), + TokenRequest( + name='Test 00000 (NOTHING)', + user_id='test_user_1_nothing', + representation='token_1_ooooo', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.NOTHING, + incoming_label='modes', + ), + 
}, + ), + TokenRequest( + name='Test XXXXX (CURATOR)', + user_id='test_user_1_curator', + representation='token_1_xxxxx', + collections={ + 'collection_1': TokenCollectionConfig( + mode=TokenModes.CURATOR, + incoming_label='modes', + ), + 'collection_8': TokenCollectionConfig( + mode=TokenModes.CURATOR, + incoming_label='modes', + ), + }, + ), +] + +g_default_entries = { + f'collection_{i}': [('Person', pid, test_record)] for i in range(1, 9) +} +for collection_id in range(1, 9): + g_default_entries[f'collection_{collection_id}'].extend( + [ + ('Person', pid_curated, test_record_curated), + ( + 'Person', + 'abc:mode_test', + 'pid: abc:mode_test\ngiven_name: mode_curated\nschema_type: abc:Person\n', + ), + ] + ) + +g_default_entries['collection_dlflatsocial-1'] = [('Person', pid_trr, test_record_trr)] +g_default_entries['collection_dlflatsocial-2'] = [('Person', pid_trr, test_record_trr)] @pytest.fixture(scope='session') def dump_stores_simple(tmp_path_factory): tmp_path = tmp_path_factory.mktemp('dump_store') audit_store_path = tmp_path_factory.mktemp('audit_store') - - final_config_text = global_config_text.format(audit_store_path=str(audit_store_path)) - (tmp_path / config_file_name).write_text(final_config_text) - - default_entries = { - f'collection_{i}': [('Person', pid, test_record)] for i in range(1, 9) - } - for collection_id in (1, 8): - default_entries[f'collection_{collection_id}'].extend( - [ - ('Person', pid_curated, test_record_curated), - ( - 'Person', - 'abc:mode_test', - 'pid: abc:mode_test\ngiven_name: mode_curated\nschema_type: abc:Person\n', - ), - ] - ) - default_entries['collection_dlflatsocial-1'] = [('Person', pid_trr, test_record_trr)] - default_entries['collection_dlflatsocial-2'] = [('Person', pid_trr, test_record_trr)] - create_store( - root_dir=tmp_path, - config=GlobalConfig(**yaml.safe_load(final_config_text)), - per_collection_info={ - 'collection_1': (str(schema_path), 'digest-md5'), - 'collection_2': (str(schema_path), 
'digest-md5-p3'), - 'collection_3': (str(schema_path), 'digest-sha1'), - 'collection_4': (str(schema_path), 'digest-sha1-p3'), - 'collection_5': (str(schema_path), 'after-last-colon'), - 'collection_6': (str(schema_path), 'digest-md5-p3-p3'), - 'collection_7': (str(schema_path), 'digest-sha1-p3-p3'), - 'collection_8': (str(schema_path), 'digest-md5'), - 'collection_dlflatsocial-1': ( - 'https://concepts.datalad.org/s/flat-social/unreleased.yaml', - 'digest-md5', - ), - 'collection_dlflatsocial-2': ( - 'https://concepts.datalad.org/s/flat-social/unreleased.yaml', - 'digest-md5', - ), - }, - default_entries=default_entries, - ) - return tmp_path + return tmp_path, audit_store_path @pytest.fixture(scope='session') def fastapi_app_simple(dump_stores_simple): + tmp_path, audit_tmp_path = dump_stores_simple + + admin_token = 'admin-1' old_sys_argv = sys.argv - sys.argv = ['test-runner', str(dump_stores_simple)] + sys.argv = [ + 'test-runner', + '--admin-token-hash', hash_token_representation(admin_token), + str(tmp_path), + ] from dump_things_service.main import app sys.argv = old_sys_argv - return app, dump_stores_simple + return app, tmp_path, audit_tmp_path, admin_token @pytest.fixture(scope='session') def fastapi_client_simple(fastapi_app_simple): from fastapi.testclient import TestClient - return TestClient(fastapi_app_simple[0]), fastapi_app_simple[1] + test_client = TestClient(fastapi_app_simple[0]) + store_path = fastapi_app_simple[1] + audit_path = fastapi_app_simple[2] + admin_token = fastapi_app_simple[3] + + # Add an audit backend to the first collection in g_default_collections + assert g_default_collections[0].name == 'collection_1' + g_default_collections[0].audit_backends = [ + GitAuditBackendConfig( + type='gitaudit', + path=Path(audit_path), + auto_flush_timeout=2, + ) + ] + + # Add collections via the Web-API + for collection_config in g_default_collections: + response = test_client.post( + '/collections', + json=collection_config.model_dump( + 
exclude_unset=True, + mode='json', + by_alias=True, + ), + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == 201 + + # Add tokens via Web-API + for token_config in g_default_tokens: + response = test_client.post( + '/tokens', + json=token_config.model_dump(exclude_unset=True, mode='json'), + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == 201 + + # Add default content via backend instances + for collection_config in g_default_collections: + curated_path = Path(store_path / collection_config.curated) + backend_config = collection_config.backend + if backend_config.type.startswith('sqlite'): + backend = SQLiteBackend(curated_path / sqlite_db_filename) + else: + backend = RecordDirStore( + curated_path, + pid_mapping_function=get_mapping_function_by_name( + backend_config.mapping_method, + ), + suffix='yaml', + ) + pydantic_module = get_model_for_schema(collection_config.schema_location)[0] + add_records_to_backend( + backend, + pydantic_module, + g_default_entries[collection_config.name], + ) + return test_client, store_path, admin_token + + +def add_records_to_backend( + backend: StorageBackend, + pydantic_module: ModuleType, + record_infos: list[tuple[str, str, str]], +): + for class_name, record_pid, yaml_stream in record_infos: + json_object = yaml.load(yaml_stream, Loader=yaml.SafeLoader ) + assert record_pid == json_object['pid'] + backend.add_record( + iri=resolve_curie(pydantic_module, json_object['pid']), + class_name=class_name, + json_object=json_object, + ) diff --git a/dump_things_service/tests/test_auth.py b/dump_things_service/tests/test_auth.py index e63438d..efba9be 100644 --- a/dump_things_service/tests/test_auth.py +++ b/dump_things_service/tests/test_auth.py @@ -4,8 +4,8 @@ import json import pytest +from dump_things_service.abstract_config import TokenPermission from dump_things_service.auth.forgejo import ForgejoAuthenticationSource -from dump_things_service.token import 
TokenPermission user_1 = { 'id': 1, diff --git a/dump_things_service/tests/test_basic.py b/dump_things_service/tests/test_basic.py index 31068b1..140db21 100644 --- a/dump_things_service/tests/test_basic.py +++ b/dump_things_service/tests/test_basic.py @@ -1,7 +1,7 @@ -from pathlib import Path import pytest # F401 +from . import schema_file from .. import ( HTTP_200_OK, HTTP_400_BAD_REQUEST, @@ -18,8 +18,6 @@ from .create_store import ( ) from .test_utils import basic_write_locations -# Path to a local simple test schema -schema_file = Path(__file__).parent / 'testschema.yaml' extra_record = { 'schema_type': 'abc:Person', @@ -41,11 +39,11 @@ unicode_record = { def test_search_by_pid(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 9): response = test_client.get( f'/collection_{i}/record?pid={pid}', - headers={'x-dumpthings-token': 'basic_access'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK assert response.json() == { @@ -56,7 +54,7 @@ def test_search_by_pid(fastapi_client_simple): def test_get_all(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 9): response = test_client.get( f'/collection_{i}/records/', @@ -74,7 +72,7 @@ def test_get_all(fastapi_client_simple): def test_delete(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.post( '/collection_1/record/Person', @@ -113,10 +111,10 @@ def test_delete(fastapi_client_simple): def test_hashed_token(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( f'/collection_1/record?pid={pid}', - headers={'x-dumpthings-token': 'cmo-cmo'}, + headers={'x-dumpthings-token': 'token-hashed'}, ) assert response.status_code == HTTP_200_OK assert response.json() == { @@ 
-124,19 +122,20 @@ def test_hashed_token(fastapi_client_simple): 'pid': pid, 'given_name': given_name, } + # Ensure that the hashed token value is not compared verbatim response = test_client.get( f'/collection_1/record?pid={pid}', - headers={'x-dumpthings-token': 'cmo-33b726a7e2b9eaf1f8f124049822ade31cb6516a4d8221634b01d13d793bfe16'}, + headers={'x-dumpthings-token': '25d3fc9469f4971012815cb3ab8f9db3f50c0d63'}, ) assert response.status_code == HTTP_401_UNAUTHORIZED def test_search_by_class(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 9): response = test_client.get( f'/collection_{i}/records/Thing', - headers={'x-dumpthings-token': 'basic_access'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK json_result = response.json() @@ -170,7 +169,7 @@ def test_search_by_class(fastapi_client_simple): def test_search_by_pid_no_token(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 9): response = test_client.get( f'/collection_{i}/record?pid={pid}', @@ -184,7 +183,7 @@ def test_search_by_pid_no_token(fastapi_client_simple): def test_store_record(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple # Store a record in two collections for i, token in basic_write_locations: @@ -213,13 +212,10 @@ def test_store_record(fastapi_client_simple): f'/collection_{i}/records/Person', headers={'x-dumpthings-token': 'basic_access'}, ) - assert response.json() == [ - { - 'schema_type': 'abc:Person', - 'pid': pid, - 'given_name': given_name, - } - ] + # The following assertion works because the stored records don't contain + # annotations. If they would, the result would have to be cleaned + # before the containment-check. 
+ assert extra_record not in response.json() # Check that subclasses are retrieved for i, token in basic_write_locations: @@ -255,7 +251,7 @@ def test_store_record(fastapi_client_simple): def test_encoding(fastapi_client_simple): - test_client, store_path = fastapi_client_simple + test_client, store_path, _ = fastapi_client_simple # Store a record with non-ASCII characters in collections via the API. that # will trigger the YAML-dumping, which should be checked @@ -275,12 +271,13 @@ def test_encoding(fastapi_client_simple): def test_global_store_write_fails(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 9): # Since we provide no token, the default token will be used. This will # only allow reading from curated, not posting. response = test_client.post( - f'/collection_{i}/record/Person', json={'pid': extra_record['pid']} + f'/collection_{i}/record/Person', + json={'pid': extra_record['pid']}, ) assert response.status_code == HTTP_403_FORBIDDEN @@ -306,7 +303,7 @@ def test_token_store_adding(fastapi_client_simple): def test_funky_pid(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple record_pid = 'dlflatsocial:contributors/someone' for i, token in basic_write_locations: response = test_client.post( @@ -326,7 +323,7 @@ def test_funky_pid(fastapi_client_simple): def test_token_store_priority(fastapi_client_simple): - test_client, store_dir = fastapi_client_simple + test_client, store_dir, _ = fastapi_client_simple # Post a record with the same pid as the global store's test record, but # with different content. 
@@ -355,7 +352,7 @@ def test_token_store_priority(fastapi_client_simple): def test_unknown_token(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple # Check that fetching with an unknown token is handled gracefully response = test_client.get( @@ -374,7 +371,7 @@ def test_unknown_token(fastapi_client_simple): def test_curie_expansion(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple # Check that the pid is expanded correctly response = test_client.get( @@ -389,7 +386,7 @@ def test_curie_expansion(fastapi_client_simple): def test_server(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( '/server', @@ -433,25 +430,25 @@ def test_server(fastapi_client_simple): def test_ignore_classes(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for class_name in ('Organization', 'Project'): response = test_client.post( f'/collection_dlflatsocial-1/record/{class_name}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json={'pid': f'dlflatsocial:c_{class_name}'}, ) assert response.status_code == HTTP_200_OK response = test_client.post( f'/collection_dlflatsocial-2/record/{class_name}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json={'pid': f'dlflatsocial:c_{class_name}'}, ) assert response.status_code == HTTP_404_NOT_FOUND def test_maintenance(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple # Ensure that only curators can put a collection in maintenance mode response = test_client.post( @@ -464,14 +461,14 @@ def test_maintenance(fastapi_client_simple): # Ensure unknown collections are caught in maintenance mode response = test_client.post( '/maintenance', - 
headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, json={'collection': 'collection_x', 'active': True}, ) assert response.status_code == HTTP_404_NOT_FOUND response = test_client.post( '/maintenance', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, json={'collection': 'collection_1', 'active': True}, ) @@ -491,7 +488,7 @@ def test_maintenance(fastapi_client_simple): # Deactivate maintenance mode response = test_client.post( '/maintenance', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, json={'collection': 'collection_1', 'active': False}, ) assert response.status_code == HTTP_200_OK diff --git a/dump_things_service/tests/test_collection_administration.py b/dump_things_service/tests/test_collection_administration.py new file mode 100644 index 0000000..6e5ada4 --- /dev/null +++ b/dump_things_service/tests/test_collection_administration.py @@ -0,0 +1,240 @@ +from pathlib import ( + Path, + PurePosixPath, +) + +from starlette.testclient import TestClient + +from dump_things_service import ( + HTTP_201_CREATED, + HTTP_200_OK, + HTTP_404_NOT_FOUND, + HTTP_401_UNAUTHORIZED, +) +from dump_things_service.abstract_config import ( + TokenCollectionConfig, + TokenModes, + hash_token_representation, +) +from dump_things_service.collection_endpoints import CollectionRequest +from dump_things_service.token_endpoints import ( + TokenRequest, + AdminTokenRequest, +) +from dump_things_service.utils import cleaned_json + +# String representation of curated- and incoming-path +curated = 'admin_test_curated' +incoming = 'admin_test_incoming' + +# Path to a local simple test schema +test_schema_location = str((Path(__file__).parent / 'testschema.yaml').absolute()) + +new_collection_name = 'admin_test_collection' +new_token_name = 'admin_test_token' +new_token_representation = 'admin_test_token' +new_collection_request = 
CollectionRequest( + name=new_collection_name, + default_token='test_default_token', + curated=PurePosixPath(f'{curated}/admin_test_collection'), + schema=test_schema_location, + incoming=PurePosixPath(f'{incoming}/admin_test_collection'), +) + +new_token_request = TokenRequest( + name=new_token_name, + user_id='admin_test_token_user', + hashed=False, + representation=new_token_representation, + collections={ + new_collection_name: TokenCollectionConfig( + mode=TokenModes.WRITE_COLLECTION, + incoming_label=f'{new_collection_name}_label', + ) + }, +) + +new_admin_token_name='New_Admin_Token' +plain_new_admin_token = 'admin-XXX' +new_admin_token_request = AdminTokenRequest( + name=new_admin_token_name, + representation=hash_token_representation(plain_new_admin_token), +) + + +def _name_in_openapi_paths( + test_client: TestClient, + name: str, +) -> bool: + response = test_client.get('/openapi.json') + open_api = response.json() + for path in open_api['paths'].keys(): + if name in path: + return True + return False + + +def test_collection_adding(fastapi_client_simple): + test_client, _, admin_token = fastapi_client_simple + + # Check that the collection does not yet exist + response = test_client.get( + f'/collections/{new_collection_name}', + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_404_NOT_FOUND + assert not _name_in_openapi_paths(test_client, new_collection_name) + + # Add a new collection + response = test_client.post( + '/collections', + headers={'x-dumpthings-token': admin_token}, + json=new_collection_request.model_dump(mode='json', by_alias=True), + ) + assert response.status_code == HTTP_201_CREATED + assert _name_in_openapi_paths(test_client, new_collection_name) + + response = test_client.get( + f'/collections/{new_collection_name}', + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_200_OK + new_collection_config = new_collection_request.model_dump(mode='json', 
by_alias=True) + del new_collection_config['name'] + assert response.json() == new_collection_config + + # Add a token to the collection + response = test_client.post( + '/tokens', + headers={'x-dumpthings-token': admin_token}, + json=new_token_request.model_dump(mode='json'), + ) + assert response.status_code == HTTP_201_CREATED + + # Read the token back + response = test_client.get( + f'/tokens/{new_token_name}', + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_200_OK + assert response.json() == { + 'name': new_token_request.name, + 'user_id': new_token_request.user_id, + 'collections': new_token_request.model_dump(mode='json')['collections'], + } + + new_record = { + 'pid': 'http://example.com/admin-test-1', + 'given_name': 'Admin Test 1', + 'schema_type': 'abc:Person', + } + + # Add a record to the collection + response = test_client.post( + f'/{new_collection_name}/record/Person', + headers={'x-dumpthings-token': new_token_representation}, + json=new_record, + ) + assert response.status_code == HTTP_200_OK + + # Read the record back + response = test_client.get( + f'/{new_collection_name}/records/Person', + headers={'x-dumpthings-token': new_token_representation}, + ) + assert response.status_code == HTTP_200_OK + assert cleaned_json(response.json()[0], ('annotations',)) == new_record + + # Remove the token + response = test_client.delete( + f'/tokens/{new_token_name}', + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_200_OK + + # Check that posting is not possible with the removed token + response = test_client.post( + f'/{new_collection_name}/record/Person', + headers={'x-dumpthings-token': new_token_representation}, + json=new_record, + ) + assert response.status_code == HTTP_401_UNAUTHORIZED + + # Remove the collection + response = test_client.delete( + f'/collections/{new_collection_name}', + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == 
HTTP_200_OK + + # Check that the collection endpoints are not found + response = test_client.get( + f'/{new_collection_name}/records/Person', + headers={'x-dumpthings-token': new_token_representation}, + ) + assert response.status_code == HTTP_404_NOT_FOUND + + # Check that the openapi document is adjusted + assert not _name_in_openapi_paths(test_client, new_collection_name) + + +def test_collection_reading(fastapi_client_simple): + test_client, _, admin_token = fastapi_client_simple + + # Check that the new admin token is not yet working + response = test_client.get( + f'/collections', + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_200_OK + response_object = response.json() + assert isinstance(response_object, dict) + assert len(response_object) == 10 + + +def test_admin_token_management(fastapi_client_simple): + test_client, _, admin_token = fastapi_client_simple + + # Check that the new admin token is not yet working + response = test_client.get( + f'/collections/collection_1', + headers={'x-dumpthings-token': plain_new_admin_token}, + ) + assert response.status_code == HTTP_401_UNAUTHORIZED + + # Add a new admin token + response = test_client.post( + '/admin_tokens', + headers={'x-dumpthings-token': admin_token}, + json=new_admin_token_request.model_dump(mode='json'), + ) + assert response.status_code == HTTP_201_CREATED + + # Try the new token + response = test_client.get( + f'/collections/collection_1', + headers={'x-dumpthings-token': plain_new_admin_token}, + ) + assert response.status_code == HTTP_200_OK + + # Check that the token shows up in the token list + response = test_client.get( + f'/admin_tokens', + headers={'x-dumpthings-token': plain_new_admin_token}, + ) + assert response.status_code == HTTP_200_OK + assert new_admin_token_name in response.json() + + # Delete the new admin token + response = test_client.delete( + f'/admin_tokens/{new_admin_token_name}', + headers={'x-dumpthings-token': 
plain_new_admin_token}, + ) + assert response.status_code == HTTP_200_OK + + response = test_client.get( + f'/admin_tokens', + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_200_OK + assert new_admin_token_name not in response.json() diff --git a/dump_things_service/tests/test_config.py b/dump_things_service/tests/test_config.py index 011ff73..5b72fad 100644 --- a/dump_things_service/tests/test_config.py +++ b/dump_things_service/tests/test_config.py @@ -1,188 +1,224 @@ +import hashlib +from pathlib import PurePosixPath import pytest -import yaml -from pydantic import ValidationError -from yaml.scanner import ScannerError -from dump_things_service.config import ( - ConfigError, - GlobalConfig, - process_config, - process_config_object, +from dump_things_service import ( + HTTP_200_OK, + HTTP_201_CREATED, + HTTP_406_NOT_ACCEPTABLE, + HTTP_409_CONFLICT, +) +from dump_things_service.abstract_config import ( + TokenCollectionConfig, + TokenModes, + dump_things_config_iri, + dump_things_private_collection_name, + get_config_backends, + read_config, +) +from dump_things_service.collection_endpoints import CollectionRequest +from dump_things_service.exceptions import ConfigError +from dump_things_service.tests import schema_file +from dump_things_service.token_endpoints import TokenRequest + + +collection_request_pattern = CollectionRequest( + name='', + schema=str(schema_file), + default_token='test_default_token', + curated=PurePosixPath('curate_dir'), + incoming=PurePosixPath(f'incoming_dir'), ) -def test_scanner_error_detection(tmp_path): - config_file_path = tmp_path / 'config.yaml' - config_file_path.write_text('type: col: le\n:xxx:') - global_dict = {} - with pytest.raises(ConfigError) as e: - process_config(tmp_path, config_file_path, [], global_dict) - assert isinstance(e.value.__cause__, ScannerError) +def test_illegal_collection_name_detection(fastapi_client_simple): + test_client, _, admin_token = fastapi_client_simple - 
-def test_structure_error_detection(tmp_path): - config_file_path = tmp_path / 'config.yaml' - config_file_path.write_text('type: colle\n') - global_dict = {} - with pytest.raises(ConfigError) as e: - process_config(tmp_path, config_file_path, [], global_dict) - assert isinstance(e.value.__cause__, ValidationError) - - -def test_missing_incoming_detection(tmp_path): - config_object = GlobalConfig( - **yaml.load( - """ -type: collections -version: 1 -collections: - collection_1: - default_token: basic_access - curated: curated/collection_1 - -tokens: - basic_access: - user_id: anonymous - collections: - collection_1: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - """, - Loader=yaml.SafeLoader, + for name in ( + 'collections', + 'tokens', + 'admin_tokens', + dump_things_private_collection_name, + ): + response = test_client.post( + f'/collections', + json={ + **collection_request_pattern.model_dump(mode='json', by_alias=True), + 'name': name, + }, + headers={'x-dumpthings-token': admin_token}, ) + assert response.status_code == HTTP_409_CONFLICT + + +def test_collection_dir_reuse_detection(fastapi_client_simple): + test_client, _, admin_token = fastapi_client_simple + + for curated_path, incoming_path in ( + ('curated/collection_1', 'incoming/XXXX'), + ('curated/XXXX', 'incoming/collection_1'), + ('curated/collection_1', 'incoming/collection_2'), + ): + response = test_client.post( + f'/collections', + json={ + **collection_request_pattern.model_dump(mode='json', by_alias=True), + 'curated': curated_path, + 'incoming': incoming_path, + }, + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_409_CONFLICT + + +def test_scanner_error_detection(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp('config_scanner_test') + + config_backend, audit_backend = get_config_backends(tmp_path) + config_backend.add_record( + iri=dump_things_config_iri, + class_name='DumpThingsConfig', + json_object={'pid': 
dump_things_config_iri} ) - global_dict = {} + md5_hexdigest = hashlib.md5(dump_things_config_iri.encode()).hexdigest() + config_file_path = config_backend.root / 'DumpThingsConfig' / f'{md5_hexdigest}.yaml' + config_file_path.write_text('collections: ::: -\n sdsdfsdf: xxx') with pytest.raises(ConfigError): - process_config_object(tmp_path, config_object, [], global_dict) + read_config(tmp_path, force_reload=True) -def test_submission_tags_handling(dump_stores_simple): - config_object = GlobalConfig( - **yaml.load( - """ -type: collections -version: 1 -collections: - collection_1: - default_token: basic_access - curated: curated/in_token_1 - incoming: contributions - submission_tags: - submitter_id_tag: no_default_id_tag - submission_time_tag: no_default_time_tag - collection_2: - default_token: basic_access - curated: curated/collection_2 - incoming: contributions -tokens: - basic_access: - user_id: anonymous - collections: - collection_1: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - collection_2: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - """, - Loader=yaml.SafeLoader, - ) +def test_structure_error_detection(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp('config_scanner_test') + + config_backend, audit_backend = get_config_backends(tmp_path) + config_backend.add_record( + iri=dump_things_config_iri, + class_name='DumpThingsConfig', + json_object={'pid': dump_things_config_iri} ) - global_dict = {} - config = process_config_object(dump_stores_simple, config_object, [], global_dict) - # Check for specified tags in collection `collection_1` - assert config.collections['collection_1'].submission_tags.submission_time_tag == 'no_default_time_tag' - assert config.collections['collection_1'].submission_tags.submitter_id_tag == 'no_default_id_tag' - # Check for default tags in collection `collection_2` - assert config.collections['collection_2'].submission_tags.submission_time_tag == 
'http://semanticscience.org/resource/SIO_001083' - assert config.collections['collection_2'].submission_tags.submitter_id_tag == 'http://purl.obolibrary.org/obo/NCIT_C54269' + md5_hexdigest = hashlib.md5(dump_things_config_iri.encode()).hexdigest() + config_file_path = config_backend.root / 'DumpThingsConfig' / f'{md5_hexdigest}.yaml' + config_file_path.write_text('type: 1\n') + with pytest.raises(ConfigError): + read_config(tmp_path, force_reload=True) -def test_submission_tags_resolving(dump_stores_simple): - config_object = GlobalConfig( - **yaml.load( - """ -type: collections -version: 1 -collections: - collection_1: - default_token: basic_access - curated: curated/in_token_1 - incoming: contributions - submission_tags: - submitter_id_tag: abc:id - submission_time_tag: abc:time -tokens: - basic_access: - user_id: anonymous - collections: - collection_1: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - """, - Loader=yaml.SafeLoader, - ) +def test_missing_incoming_detection(fastapi_client_simple): + test_client, _, admin_token = fastapi_client_simple + + # Add a collection without incoming + collection_request = CollectionRequest( + name='missing_incoming_detection_test', + default_token='Test XXXXX (CURATOR)', + curated=PurePosixPath('missing_incoming_detection'), + schema=str(schema_file), ) - global_dict = {} - process_config_object(dump_stores_simple, config_object, [], global_dict) + response = test_client.post( + '/collections', + json=collection_request.model_dump(mode='json', by_alias=True), + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_201_CREATED - -def test_submission_tags_resolving_error(dump_stores_simple): - config_object = GlobalConfig( - **yaml.load( - """ -type: collections -version: 1 -collections: - collection_1: - default_token: basic_access - curated: curated/in_token_1 - incoming: contributions - submission_tags: - submitter_id_tag: non-existing:id - collection_2: - default_token: 
basic_access - curated: curated/in_token_1 - incoming: contributions - submission_tags: - submission_time_tag: non-existing:time - collection_3: - default_token: basic_access - curated: curated/in_token_1 - incoming: contributions - submission_tags: - submitter_id_tag: http://something/non-existing - collection_4: - default_token: basic_access - curated: curated/in_token_1 - incoming: contributions - submission_tags: - submission_time_tag: http://something/non-existing -tokens: - basic_access: - user_id: anonymous - collections: - collection_1: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - collection_2: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - collection_3: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - collection_4: - mode: WRITE_COLLECTION - incoming_label: incoming_anonymous - """, - Loader=yaml.SafeLoader, - ) + # Add a write token that references the collection, expect this to + # fail because the collection does not contain an incoming path + token_request = TokenRequest( + name='missing-incoming-token', + user_id='missing_incoming_user', + collections={ + 'missing_incoming_detection_test': TokenCollectionConfig( + mode=TokenModes.CURATOR, + incoming_label='', + ) + } ) - global_dict = {} - with pytest.raises(ConfigError) as e: - process_config_object(dump_stores_simple, config_object, [], global_dict) + # Check that a write token for a collection without incoming path cannot + # be created. 
+ response = test_client.post( + '/tokens', + json=token_request.model_dump(mode='json', by_alias=True), + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_406_NOT_ACCEPTABLE + + # Remove the collection without incoming path + response = test_client.delete( + '/collections/missing_incoming_detection_test', + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_200_OK + + # Add a collection with incoming path + collection_request.incoming = PurePosixPath('missing_incoming_detection_test_incoming') + response = test_client.post( + '/collections', + json=collection_request.model_dump(mode='json', by_alias=True), + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_201_CREATED + + # Check that a write token for a collection with an incoming path but a + # missing label cannot be created. + response = test_client.post( + '/tokens', + json=token_request.model_dump(mode='json', by_alias=True), + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_406_NOT_ACCEPTABLE + + # Check that a write token for a collection with an incoming path can be created + token_request.collections['missing_incoming_detection_test'] = TokenCollectionConfig( + mode=TokenModes.CURATOR, + incoming_label='test_incoming_label', + ) + response = test_client.post( + '/tokens', + json=token_request.model_dump(mode='json', by_alias=True), + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_201_CREATED + + # Remove the collection with the incoming path + response = test_client.delete( + '/collections/missing_incoming_detection_test', + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_200_OK + + # Check that a creation attempt for the collection without incoming path fails + collection_request.incoming = None + response = test_client.post( + '/collections', + 
json=collection_request.model_dump(mode='json', by_alias=True), + headers={'x-dumpthings-token': admin_token}, + ) + assert response.status_code == HTTP_406_NOT_ACCEPTABLE + + +def test_submission_tags_handling(fastapi_client_simple): + test_client, _, admin_token = fastapi_client_simple + + response = test_client.get( + '/collections/collection_8', + headers={'x-dumpthings-token': admin_token}, + ) + json_obj = response.json() + assert json_obj['submission_tags'] == { + 'submitter_id_tag': 'no_default_id_tag', + 'submission_time_tag': 'no_default_time_tag', + } + + response = test_client.get( + '/collections/collection_1', + headers={'x-dumpthings-token': admin_token}, + ) + json_obj = response.json() + assert json_obj['submission_tags'] == { + 'submitter_id_tag': 'http://purl.obolibrary.org/obo/NCIT_C54269', + 'submission_time_tag': 'http://semanticscience.org/resource/SIO_001083', + } diff --git a/dump_things_service/tests/test_curated.py b/dump_things_service/tests/test_curated.py index 7eebee2..47737a8 100644 --- a/dump_things_service/tests/test_curated.py +++ b/dump_things_service/tests/test_curated.py @@ -9,7 +9,8 @@ from dump_things_service import ( HTTP_200_OK, HTTP_404_NOT_FOUND, ) -from dump_things_service.config import get_config +from dump_things_service.instance_state import get_instance_state + delete_record = { 'schema_type': 'abc:Person', @@ -25,7 +26,7 @@ def test_read_curated_records( paginate, class_name, ): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( f'/collection_1/curated/records/{paginate}{class_name}', @@ -40,7 +41,7 @@ def test_read_curated_records( assert len(json_object) == 3 for pattern, count in (('%25wolf%25', 1), ('%25cura%25', 2)): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( f'/collection_8/curated/records/{paginate}{class_name}?matching={pattern}', headers={'x-dumpthings-token': 'token_1_xxxxx'}, 
@@ -58,7 +59,7 @@ pytest.mark.parametrize( ('abc:mode_test', 'abc:some_timee@x.com', 'abc:curated'), ) def test_read_curated_records_by_pid(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( '/no_such_collection/curated/records/', @@ -68,7 +69,7 @@ def test_read_curated_records_by_pid(fastapi_client_simple): def test_unknown_collection(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( '/no_such_collection/curated/records/', @@ -78,7 +79,7 @@ def test_unknown_collection(fastapi_client_simple): def test_curated_delete(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.post( '/collection_8/curated/record/Person', @@ -116,12 +117,12 @@ def test_curated_delete(fastapi_client_simple): def test_audit_backend(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple record_id = 'abc:audit-trailed' names = 'Frederick', 'Johny' - tokens = 'token_1_xxxxx', 'token_admin' - user_names = 'test_user_1_curated', 'test_admin' + tokens = 'token_1_xxxxx', 'token_curator' + user_names = 'test_user_1_curator', 'test_curator' json_objects = tuple( { 'schema_type': 'abc:Person', @@ -139,8 +140,8 @@ def test_audit_backend(fastapi_client_simple): ) assert response.status_code == HTTP_200_OK - config_instance = get_config() - audit_backend = config_instance.audit_backends['collection_1'][0] + instance_state = get_instance_state() + audit_backend = instance_state.audit_backends['collection_1'][0] changes = audit_backend.get_audit_log(record_id) assert len(changes) == 2 @@ -152,12 +153,11 @@ def test_audit_backend(fastapi_client_simple): def test_audit_backend_auto_flush(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple record_id = 
'abc:audit-trailed' names = 'Robert', 'Anton' - tokens = 'token_1_xxxxx', 'token_admin' - user_names = 'test_user_1_curated', 'test_admin' + tokens = 'token_1_xxxxx', 'token_curator' json_objects = tuple( { 'schema_type': 'abc:Person', @@ -175,8 +175,8 @@ def test_audit_backend_auto_flush(fastapi_client_simple): ) assert response.status_code == HTTP_200_OK - config_instance = get_config() - audit_backend = config_instance.audit_backends['collection_1'][0] + instance_state = get_instance_state() + audit_backend = instance_state.audit_backends['collection_1'][0] assert audit_backend.current_change_set, 'expected unpersisted changes in audit log' diff --git a/dump_things_service/tests/test_extract_inline.py b/dump_things_service/tests/test_extract_inline.py index 4928dc7..6cb09b4 100644 --- a/dump_things_service/tests/test_extract_inline.py +++ b/dump_things_service/tests/test_extract_inline.py @@ -212,13 +212,13 @@ def test_dont_extract_empty_things_locally(): # relations @pytest.mark.xfail def test_inline_extraction_on_service(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 3): # Deposit JSON record response = test_client.post( f'/collection_dlflatsocial-{i}/record/Person', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json=inlined_json_record, ) assert response.status_code == HTTP_200_OK, 'Response content:' + response.text @@ -231,7 +231,7 @@ def test_inline_extraction_on_service(fastapi_client_simple): for record_pid in (entry[0] for entry in tree): response = test_client.get( f'/collection_dlflatsocial-{i}/record?pid={record_pid}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK records.append(response.json()) @@ -247,7 +247,7 @@ def test_inline_extraction_on_service(fastapi_client_simple): ): records = test_client.get( 
f'/collection_dlflatsocial-{i}/records/{class_name}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ).json() for pid in pids: assert any(record['pid'] == pid for record in records) @@ -257,14 +257,14 @@ def test_inline_extraction_on_service(fastapi_client_simple): # relations @pytest.mark.xfail def test_inline_ttl_processing(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 3): # Deposit TTL records for class_name, ttl_record in ttls_with_inline: response = test_client.post( f'/collection_dlflatsocial-{i}/record/{class_name}?format=ttl', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json=ttl_record, ) assert response.status_code == HTTP_200_OK @@ -275,7 +275,7 @@ def test_inline_ttl_processing(fastapi_client_simple): for record_pid in (entry[0] for entry in ttl_tree): response = test_client.get( f'/collection_dlflatsocial-{i}/record?pid={record_pid}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK records.append(response.json()) @@ -291,7 +291,7 @@ def test_inline_ttl_processing(fastapi_client_simple): ): records = test_client.get( f'/collection_dlflatsocial-{i}/records/{class_name}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ).json() for pid in pids: assert any(record['pid'] == pid for record in records) @@ -329,7 +329,7 @@ def test_dont_extract_empty_things_on_service(fastapi_client_simple): # Deposit JSON record response = test_client.post( f'/collection_dlflatsocial-{i}/record/Person', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json=empty_inlined_json_record, ) assert response.status_code == HTTP_200_OK diff --git a/dump_things_service/tests/test_incoming.py b/dump_things_service/tests/test_incoming.py index 
f6efb05..15b27a4 100644 --- a/dump_things_service/tests/test_incoming.py +++ b/dump_things_service/tests/test_incoming.py @@ -17,12 +17,12 @@ delete_record = { def test_incoming_labels(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 9): response = test_client.get( f'/collection_{i}/incoming/', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) existing_labels = response.json() assert len(existing_labels) >= 1 @@ -46,7 +46,7 @@ def fill_zones(test_client): (7, 'admin_common'), (8, 'admin_common'), ): - token = 'token_admin' + token = 'token_curator' result = test_client.post( f'/collection_{collection_id}/incoming/{label}/record/Person', headers={'x-dumpthings-token': token}, @@ -63,16 +63,16 @@ def fill_zones(test_client): @pytest.mark.parametrize('paginate', ('', 'p/')) @pytest.mark.parametrize('class_name', ('', 'Person')) def test_read_incoming_records( - fastapi_client_simple, - paginate: str, - class_name: str, + fastapi_client_simple, + paginate: str, + class_name: str, ): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple fill_zones(test_client) for collection_id, labels in ( - (1, ['modes', 'admin_1', 'cmo', 'in_token_1']), + (1, ['modes', 'admin_1', 'in_token_1']), (2, ['in_token-2', 'admin_2']), (3, ['admin_3']), (4, ['admin_4']), @@ -85,9 +85,9 @@ def test_read_incoming_records( for label in labels: response = test_client.get( f'/collection_{collection_id}/incoming/{label}/records/{paginate}{class_name}', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) - assert response.status_code == HTTP_200_OK + assert response.status_code == HTTP_200_OK, f'failed on collection: {collection_id}, label: {label}, class: {class_name}' # We don't know the exact number of entries in each zone, because # it depends on the tests that ran before. 
@@ -96,10 +96,10 @@ def test_read_incoming_records( expected_length = 0 if label.startswith('admin_'): expected_length = 1 - pattern = f'abc:test_incoming-collection_{collection_id}-token_admin' + pattern = f'abc:test_incoming-collection_{collection_id}-token_curator' response = test_client.get( f'/collection_{collection_id}/incoming/{label}/records/{paginate}{class_name}?matching={pattern}', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_200_OK json_object = response.json() @@ -120,7 +120,7 @@ pytest.mark.parametrize( ('abc:mode_test', 'abc:some_timee@x.com', 'abc:curated'), ) def test_read_incoming_records_by_pid(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( '/no_such_collection/curated/records/', @@ -130,74 +130,74 @@ def test_read_incoming_records_by_pid(fastapi_client_simple): def test_incoming_unknown_collection(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( '/no_such_collection/incoming/no_such_label/records/', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_404_NOT_FOUND def test_incoming_unknown_label(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.get( '/collection_1/incoming/no_such_label/records/', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_404_NOT_FOUND def test_incoming_delete(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple response = test_client.post( '/collection_7/incoming/admin_common/record/Person', - headers={'x-dumpthings-token': 'token_admin'}, + 
headers={'x-dumpthings-token': 'token_curator'}, json=delete_record, ) assert response.status_code == HTTP_200_OK response = test_client.get( '/collection_7/incoming/admin_common/record?pid=abc:delete-me', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_200_OK assert response.json()['pid'] == 'abc:delete-me' response = test_client.delete( '/collection_7/incoming/admin_common/record?pid=abc:delete-me', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_200_OK assert response.json() is True response = test_client.get( '/collection_7/incoming/admin_common/record?pid=abc:delete-me', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_200_OK assert response.json() is None response = test_client.delete( '/collection_7/incoming/admin_common/record?pid=abc:delete-me', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_404_NOT_FOUND def test_incoming_on_disk_only(fastapi_client_simple): - test_client, data_root = fastapi_client_simple + test_client, data_root, _ = fastapi_client_simple # add a random directory to the incoming area of collection_1 random_part = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=6)) dir_name = f'random_{random_part}' - (data_root / 'incoming' / dir_name).mkdir() + (data_root / 'incoming' / 'collection_1' / dir_name).mkdir() response = test_client.get( '/collection_1/incoming/', - headers={'x-dumpthings-token': 'token_admin'}, + headers={'x-dumpthings-token': 'token_curator'}, ) assert response.status_code == HTTP_200_OK assert dir_name in response.json() diff --git a/dump_things_service/tests/test_mapping.py b/dump_things_service/tests/test_mapping.py index dae22d9..417d3a3 100644 --- 
a/dump_things_service/tests/test_mapping.py +++ b/dump_things_service/tests/test_mapping.py @@ -19,7 +19,7 @@ record_b = { def test_mapping_functions_ignore_data(fastapi_client_simple): - test_client, store_path = fastapi_client_simple + test_client, store_path, _ = fastapi_client_simple for i, token in basic_write_locations: response = test_client.post( diff --git a/dump_things_service/tests/test_modes.py b/dump_things_service/tests/test_modes.py index 22805df..2b22206 100644 --- a/dump_things_service/tests/test_modes.py +++ b/dump_things_service/tests/test_modes.py @@ -50,7 +50,7 @@ def verify_modes( def test_token_modes(fastapi_client_simple): - test_client, store_dir = fastapi_client_simple + test_client, store_dir, _ = fastapi_client_simple # Post a record to incoming of collections `collection_1`. We use it to # validate read/write permissions on class-base diff --git a/dump_things_service/tests/test_pid_resolution.py b/dump_things_service/tests/test_pid_resolution.py index 4fe2a69..d9caa0d 100644 --- a/dump_things_service/tests/test_pid_resolution.py +++ b/dump_things_service/tests/test_pid_resolution.py @@ -6,7 +6,7 @@ from .. 
import HTTP_422_UNPROCESSABLE_CONTENT @pytest.mark.parametrize('pid', ['unknown_prefix:test_pid', 'abc:test_öö_pid']) @pytest.mark.parametrize('url_part', ['', 'curated/', 'incoming/in_token_1/']) def test_store_record_validation(fastapi_client_simple, pid, url_part): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple # Store a record in two collections response = test_client.post( @@ -15,41 +15,3 @@ def test_store_record_validation(fastapi_client_simple, pid, url_part): json={'pid': pid}, ) assert response.status_code == HTTP_422_UNPROCESSABLE_CONTENT - - -x = """ -def test_store_record_curated_with_unresolvable_pid(fastapi_client_simple): - test_client, _ = fastapi_client_simple - - # Store a record in two collections - response = test_client.post( - f'/collection_1/curated/record/Person', - headers={'x-dumpthings-token': 'token_admin'}, - json={'pid': 'unknown_prefix:test_pid'}, - ) - assert response.status_code == HTTP_422_UNPROCESSABLE_CONTENT - - -def test_store_record_incoming_with_unresolvable_pid(fastapi_client_simple): - test_client, _ = fastapi_client_simple - - # Store a record in two collections - response = test_client.post( - f'/collection_1/incoming/in_token_1/record/Person', - headers={'x-dumpthings-token': 'token_admin'}, - json={'pid': 'unknown_prefix:test_pid'}, - ) - assert response.status_code == HTTP_422_UNPROCESSABLE_CONTENT - - -def test_store_record_with_non_ascii_pid(fastapi_client_simple): - test_client, _ = fastapi_client_simple - - # Store a record in two collections - response = test_client.post( - f'/collection_1/record/Person', - headers={'x-dumpthings-token': 'token-1'}, - json={'pid': 'abc:test_pid'}, - ) - assert response.status_code == HTTP_422_UNPROCESSABLE_CONTENT -""" diff --git a/dump_things_service/tests/test_roundtrip.py b/dump_things_service/tests/test_roundtrip.py index f324fd4..175d154 100644 --- a/dump_things_service/tests/test_roundtrip.py +++ 
b/dump_things_service/tests/test_roundtrip.py @@ -26,7 +26,7 @@ ttl_result_record_a = """@prefix abc: . xyz:HenryAdams a abc:Person ; abc:annotations [ a abc:Annotation ; - abc:annotation_tag ; + abc:annotation_tag ; abc:annotation_value "1970-01-01T00:00:00" ], [ a abc:Annotation ; abc:annotation_tag oxo:NCIT_C54269 ; @@ -44,7 +44,7 @@ xyz:HenryAdams a abc:Person ; abc:annotation_tag oxo:NCIT_C54269 ; abc:annotation_value "test_user_1" ], [ a abc:Annotation ; - abc:annotation_tag ; + abc:annotation_tag ; abc:annotation_value "1970-01-01T00:00:00" ] ; abc:given_name "Henryöäß" ; abc:schema_type "abc:Person" . @@ -54,7 +54,7 @@ new_json_pid = 'xyz:HenryBaites' def test_json_ttl_json(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple # Deposit JSON records response = test_client.post( @@ -96,7 +96,7 @@ def test_json_ttl_json(fastapi_client_simple): @freezegun.freeze_time('1970-01-01') def test_ttl_json_ttl(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple # Deposit a ttl record response = test_client.post( diff --git a/dump_things_service/tests/test_roundtrip_flatsocial.py b/dump_things_service/tests/test_roundtrip_flatsocial.py index 46239cd..8c7f8d9 100644 --- a/dump_things_service/tests/test_roundtrip_flatsocial.py +++ b/dump_things_service/tests/test_roundtrip_flatsocial.py @@ -48,7 +48,7 @@ dlflatsocial:another_john_ttl a dlflatsocial:Person ; dlthings:annotation_value "1970-01-01T00:00:00" ], [ a dlflat:FlatAnnotation ; dlthings:annotation_tag obo:NCIT_C54269 ; - dlthings:annotation_value "test_user_1" ] . + dlthings:annotation_value "user_all" ] . 
""" @@ -62,7 +62,7 @@ dlflatsocial:another_john_ttl a dlflatsocial:Person ; dlsocialmx:given_name "Johnöüß" ; dlthings:annotations [ a dlflat:FlatAnnotation ; dlthings:annotation_tag obo:NCIT_C54269 ; - dlthings:annotation_value "test_user_1" ], + dlthings:annotation_value "user_all" ], [ a dlflat:FlatAnnotation ; dlthings:annotation_tag ; dlthings:annotation_value "1970-01-01T00:00:00" ] . @@ -73,13 +73,13 @@ new_json_pid = 'dlflatsocial:another_john_ttl' def test_json_ttl_json_dlflatsocial(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 3): # Deposit JSON records response = test_client.post( f'/collection_dlflatsocial-{i}/record/Person', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json=json_record, ) assert response.status_code == HTTP_200_OK @@ -87,7 +87,7 @@ def test_json_ttl_json_dlflatsocial(fastapi_client_simple): # Retrieve TTL records response = test_client.get( f'/collection_dlflatsocial-{i}/record?pid={json_record["pid"]}&format=ttl', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK ttl = response.text @@ -97,7 +97,7 @@ def test_json_ttl_json_dlflatsocial(fastapi_client_simple): response = test_client.post( f'/collection_dlflatsocial-{i}/record/Person?format=ttl', - headers={'content-type': 'text/turtle', 'x-dumpthings-token': 'token-1'}, + headers={'content-type': 'text/turtle', 'x-dumpthings-token': 'token-all'}, data=ttl, ) assert response.status_code == HTTP_200_OK @@ -105,7 +105,7 @@ def test_json_ttl_json_dlflatsocial(fastapi_client_simple): # Retrieve JSON record response = test_client.get( f'/collection_dlflatsocial-{i}/record?pid={new_ttl_pid}&format=json', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK json_object = cleaned_json(response.json(), 
remove_keys=('annotations',)) @@ -116,14 +116,14 @@ def test_json_ttl_json_dlflatsocial(fastapi_client_simple): @freezegun.freeze_time('1970-01-01') def test_ttl_json_ttl_dlflatsocial(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for i in range(1, 3): # Deposit a ttl record response = test_client.post( f'/collection_dlflatsocial-{i}/record/Person?format=ttl', headers={ - 'x-dumpthings-token': 'token-1', + 'x-dumpthings-token': 'token-all', 'content-type': 'text/turtle', }, data=ttl_input_record, @@ -133,7 +133,7 @@ def test_ttl_json_ttl_dlflatsocial(fastapi_client_simple): # Retrieve JSON records response = test_client.get( f'/collection_dlflatsocial-{i}/record?pid=dlflatsocial:test_john_ttl&format=json', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK json_object = response.json() @@ -143,7 +143,7 @@ def test_ttl_json_ttl_dlflatsocial(fastapi_client_simple): response = test_client.post( f'/collection_dlflatsocial-{i}/record/Person?format=json', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json=json_object, ) assert response.status_code == HTTP_200_OK @@ -151,7 +151,7 @@ def test_ttl_json_ttl_dlflatsocial(fastapi_client_simple): # Retrieve ttl record response = test_client.get( f'/collection_dlflatsocial-{i}/record?pid={new_json_pid}&format=ttl', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert response.status_code == HTTP_200_OK assert ( diff --git a/dump_things_service/tests/test_token_endpoints.py b/dump_things_service/tests/test_token_endpoints.py new file mode 100644 index 0000000..0209eb3 --- /dev/null +++ b/dump_things_service/tests/test_token_endpoints.py @@ -0,0 +1,45 @@ +from starlette.status import HTTP_409_CONFLICT + +from dump_things_service import HTTP_201_CREATED + + +def 
test_token_creation(fastapi_client_simple): + test_client, _, admin_token = fastapi_client_simple + + json_record = { + 'name': 'a', + 'user_id': 'u_a', + 'representation': '8bb6805ff10bcb1c2ca49dcd4bfef94d', + 'collections': { + 'collection_1': { + 'mode': 'WRITE_COLLECTION', + 'incoming_label': 'i_a' + } + } + } + + # Create a token with name 'a' + response = test_client.post( + '/tokens', + headers={'x-dumpthings-token': admin_token}, + json=json_record, + ) + assert response.status_code == HTTP_201_CREATED + + # Try to create another token with name 'a', should result in a 409-error + response = test_client.post( + '/tokens', + headers={'x-dumpthings-token': admin_token}, + json=json_record, + ) + assert response.status_code == HTTP_409_CONFLICT + + # Try to create another token with name 'b' and the same representation + # as 'a', should result in a 409-error + json_record['name'] = 'b' + response = test_client.post( + '/tokens', + headers={'x-dumpthings-token': admin_token}, + json=json_record, + ) + assert response.status_code == HTTP_409_CONFLICT diff --git a/dump_things_service/tests/test_unicode.py b/dump_things_service/tests/test_unicode.py new file mode 100644 index 0000000..15cf7cc --- /dev/null +++ b/dump_things_service/tests/test_unicode.py @@ -0,0 +1,39 @@ +from pathlib import Path + +from .. 
import HTTP_200_OK + + +# Path to a local simple test schema +schema_file = Path(__file__).parent / 'testschema.yaml' + +extra_record = { + 'schema_type': 'abc:Person', + 'pid': 'abc:aaaa', + 'given_name': 'DavidÖÄÜ', +} +delete_record = { + 'schema_type': 'abc:Person', + 'pid': 'abc:delete-me', + 'given_name': 'Detlef', +} +unicode_name = 'AlienÖÄÜ-ß👽' +unicode_bytes = unicode_name.encode('utf-8') +unicode_record = { + 'schema_type': 'abc:Person', + 'pid': 'abc:unicode-test', + 'given_name': unicode_name, +} + + +def test_unicode_iri(fastapi_client_simple): + test_client, _, _ = fastapi_client_simple + + response = test_client.post( + '/collection_1/record/Person', + headers={'x-dumpthings-token': 'token-1'}, + json = { + 'pid': 'https://en.wikipedia.org/wiki/Universita_degli_Studi_eCampus', + 'given_name': 'Università degli Studi eCampus (Italy)', + } + ) + assert response.status_code == HTTP_200_OK diff --git a/dump_things_service/tests/test_utils.py b/dump_things_service/tests/test_utils.py index bfd4294..294a7d4 100644 --- a/dump_things_service/tests/test_utils.py +++ b/dump_things_service/tests/test_utils.py @@ -1,7 +1,7 @@ from dump_things_service.utils import cleaned_json basic_write_locations = tuple((x, f'token-{x}') for x in range(1, 3)) -unauthorized_write_locations = tuple((x, 'token-1') for x in range(3, 6)) +unauthorized_write_locations = tuple((x, 'token-1') for x in range(2, 9)) def test_cleaned_json(): diff --git a/dump_things_service/tests/test_validate.py b/dump_things_service/tests/test_validate.py index a5da6aa..800b17c 100644 --- a/dump_things_service/tests/test_validate.py +++ b/dump_things_service/tests/test_validate.py @@ -44,7 +44,7 @@ xyz:henry a abc:Person ; def test_validate_record(fastapi_client_simple): - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple for record, expected_status in json_records: response = test_client.post( diff --git a/dump_things_service/tests/test_web_interface.py 
b/dump_things_service/tests/test_web_interface.py index 61bdf44..8096d5a 100644 --- a/dump_things_service/tests/test_web_interface.py +++ b/dump_things_service/tests/test_web_interface.py @@ -18,17 +18,17 @@ pids = ('', '--------', '&&&&&', 'abc', 'abc&', 'abc&format=ttl') tuple(product(*(collection_names, class_names, queries, format_names))), ) def test_web_interface_post_errors( - fastapi_client_simple, - collection_name, - class_name, - query, - format_name, + fastapi_client_simple, + collection_name, + class_name, + query, + format_name, ): """Check that no internal server error occurs with weird input""" - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple result = test_client.post( f'/{collection_name}/record/{class_name}?{query}={format_name}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, json={'pid': 'xyz:web_interface_test_pid0x123123'}, ) assert result.status_code < HTTP_500_INTERNAL_SERVER_ERROR @@ -39,14 +39,14 @@ def test_web_interface_post_errors( tuple(product(*(collection_names, class_names, queries, format_names))), ) def test_web_interface_get_class_errors( - fastapi_client_simple, - collection_name, - class_name, - query, - format_name, + fastapi_client_simple, + collection_name, + class_name, + query, + format_name, ): """Check that no internal server error occurs with weird input""" - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple result = test_client.get( f'/{collection_name}/records/{class_name}?{query}={format_name}', ) @@ -54,7 +54,7 @@ def test_web_interface_get_class_errors( result = test_client.get( f'/{collection_name}/record/{class_name}?{query}={format_name}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert result.status_code < HTTP_500_INTERNAL_SERVER_ERROR @@ -64,14 +64,14 @@ def test_web_interface_get_class_errors( tuple(product(*(collection_names, pids, queries, 
format_names))), ) def test_web_interface_get_pid_errors( - fastapi_client_simple, - collection_name, - pid, - query, - format_name, + fastapi_client_simple, + collection_name, + pid, + query, + format_name, ): """Check that no internal server error occurs with weird input""" - test_client, _ = fastapi_client_simple + test_client, _, _ = fastapi_client_simple result = test_client.get( f'/{collection_name}/records?{pid}&{query}={format_name}', ) @@ -79,6 +79,6 @@ def test_web_interface_get_pid_errors( result = test_client.get( f'/{collection_name}/records?{pid}&{query}={format_name}', - headers={'x-dumpthings-token': 'token-1'}, + headers={'x-dumpthings-token': 'token-all'}, ) assert result.status_code < HTTP_500_INTERNAL_SERVER_ERROR diff --git a/dump_things_service/token.py b/dump_things_service/token.py deleted file mode 100644 index 15cb8ba..0000000 --- a/dump_things_service/token.py +++ /dev/null @@ -1,26 +0,0 @@ -import hashlib - -from pydantic import BaseModel - - -class TokenPermission(BaseModel): - curated_read: bool = False - incoming_read: bool = False - incoming_write: bool = False - curated_write: bool = False - zones_access: bool = False - - -def get_token_parts(token: str) -> list[str]: - parts = token.split('-', 1) - if len(parts) != 2: - msg = 'Invalid token format' - raise ValueError(msg) - return parts - - -def hash_token(token: str) -> str: - parts = get_token_parts(token) - hasher = hashlib.sha256() - hasher.update(parts[1].encode()) - return f'{parts[0]}-{hasher.hexdigest()}' diff --git a/dump_things_service/token_endpoints.py b/dump_things_service/token_endpoints.py new file mode 100644 index 0000000..85f308b --- /dev/null +++ b/dump_things_service/token_endpoints.py @@ -0,0 +1,345 @@ +import logging +import random +import re +from urllib.parse import quote + +from fastapi import ( + APIRouter, + Depends, + HTTPException, + Response, +) +from starlette.status import HTTP_406_NOT_ACCEPTABLE + +from dump_things_service import ( + 
HTTP_201_CREATED, + HTTP_404_NOT_FOUND, + HTTP_409_CONFLICT, +) +from dump_things_service.abstract_config import ( + AdminTokenConfig, + StrictModel, + TokenCollectionConfig, + TokenConfig, + get_config, + get_token_info_by_representation, + get_token_permissions, + hash_token_representation, + read_config, + store_config, +) +from dump_things_service.admin import authenticate_admin +from dump_things_service.api_key import api_key_header_scheme +from dump_things_service.instance_state import get_instance_state +from dump_things_service.exceptions import ConfigError +from dump_things_service.manifest import manifest_configuration +from dump_things_service.utils import wrap_http_exception + + +logger = logging.getLogger('dump_things_service') +router = APIRouter() + +hash_matcher = re.compile(r'^[a-f0-9A-F]{64}$') + + +class TokenRequest(TokenConfig): + name: str + + +class TokenResponse(StrictModel): + name: str + user_id: str + collections: dict[str, TokenCollectionConfig] + + +class AdminTokenRequest(AdminTokenConfig): + name: str + + +def get_token_parts(token: str) -> list[str]: + parts = token.split('-', 1) + if len(parts) != 2: + msg = 'Invalid token format' + raise ValueError(msg) + return parts + + +@router.post( + '/tokens', + tags=['Administration interface'], + name='Create a new token', + status_code=HTTP_201_CREATED, +) +async def create_token( + response: Response, + body: TokenRequest, + api_key: str = Depends(api_key_header_scheme), +) -> TokenRequest: + + instance_state = get_instance_state() + abstract_config = read_config(store_path=instance_state.store_path) + + authenticate_admin(instance_state, abstract_config, api_key) + + # Check for existing token-name + if body.name in abstract_config.tokens: + raise HTTPException( + status_code=HTTP_409_CONFLICT, + detail=f"Token with name '{body.name}' already exists.", + ) + + # Ensure that all specified collections and modes exist + for collection_name, token_collection_info in body.collections.items(): 
+ if collection_name not in abstract_config.collections: + detail = f"No such collection: '{collection_name}'." + raise HTTPException(status_code=HTTP_404_NOT_FOUND, detail=detail) + + # Check that incoming areas are defined if the token allows writing. + token_permissions = get_token_permissions(token_collection_info.mode) + if token_permissions.incoming_write or token_permissions.zones_access: + + # Check for incoming definition in collection config + collection_info = abstract_config.collections[collection_name] + if not collection_info.incoming: + detail = ( + f"Cannot add token with write access to collection " + f"'{collection_name}' without `incoming`." + ) + raise HTTPException( + status_code=HTTP_406_NOT_ACCEPTABLE, + detail=detail, + ) + + # Check for incoming label in token definition for the collection + if not token_collection_info.incoming_label: + detail = f"Incoming label missing for collection '{collection_name}'" + raise HTTPException( + status_code=HTTP_406_NOT_ACCEPTABLE, + detail=detail, + ) + + if body.representation: + # We have a specific representation, check that it is not already used + existing_token_info = get_token_info_by_representation( + abstract_config=abstract_config, + token_representation=body.representation, + ) + if existing_token_info: + detail= f"Token with identical representation already exists." + raise HTTPException(status_code=HTTP_409_CONFLICT, detail=detail) + else: + # Generate a random representation that does not yet exist. 
+ collision = True + while collision: + body.representation = random.randbytes(24).hex() + existing_token_info = get_token_info_by_representation( + abstract_config=abstract_config, + token_representation=body.representation, + ) + collision = existing_token_info is not None + + # Store the new token in the configuration + abstract_config.tokens[body.name] = TokenConfig( + user_id=body.user_id, + collections=body.collections, + representation=( + hash_token_representation(body.representation) + if body.hashed + else body.representation + ), + ) + + # Manifest the new configuration + with wrap_http_exception(ConfigError): + manifest_configuration(abstract_config, instance_state) + + # Persist the configuration + store_config( + store_path=instance_state.store_path, + config=abstract_config, + ) + + response.headers['Location'] = f'/tokens/{quote(body.name)}' + return TokenRequest( + name=body.name, + user_id=body.user_id, + collections=body.collections, + representation=body.representation, + hashed=body.hashed, + ) + + +@router.get( + '/tokens', + tags=['Administration interface'], + name='Get existing tokens', +) +async def get_tokens( + api_key: str = Depends(api_key_header_scheme), +) -> list[TokenResponse]: + + instance_state = get_instance_state() + abstract_config = read_config(store_path=instance_state.store_path) + + authenticate_admin(instance_state, abstract_config, api_key) + + return [ + TokenResponse( + name=n, + user_id=t.user_id, + collections=t.collections, + ) + for n, t in abstract_config.tokens.items() + ] + + +@router.get( + '/tokens/{token_name}', + tags=['Administration interface'], + name='Get token by name', +) +async def get_token_with_name( + token_name: str, + api_key: str = Depends(api_key_header_scheme), +) -> TokenResponse: + + instance_state = get_instance_state() + abstract_config = get_config() + + authenticate_admin(instance_state, abstract_config, api_key) + + abstract_config = read_config(store_path=instance_state.store_path) + 
if token_name not in abstract_config.tokens: + detail = f"token with name '{token_name}' does not exist." + raise HTTPException(status_code=HTTP_404_NOT_FOUND, detail=detail) + + t = abstract_config.tokens[token_name] + return TokenResponse( + name=token_name, + user_id=t.user_id, + collections=t.collections, + ) + + +@router.delete( + '/tokens/{token_name}', + tags=['Administration interface'], + name='Delete token with name', +) +async def delete_token_with_name( + token_name: str, + api_key: str = Depends(api_key_header_scheme), +): + + instance_state = get_instance_state() + abstract_config = get_config() + + authenticate_admin(instance_state, abstract_config, api_key) + + abstract_config = read_config(store_path=instance_state.store_path) + if token_name not in abstract_config.tokens: + detail = f"token with name '{token_name}' does not exist." + raise HTTPException(status_code=HTTP_404_NOT_FOUND, detail=detail) + + # Remove the token from the configuration + del abstract_config.tokens[token_name] + + # Manifest the new configuration + with wrap_http_exception(ConfigError): + manifest_configuration(abstract_config, instance_state) + + +@router.post( + '/admin_tokens', + tags=['Administration interface'], + name='Add a new admin token', + status_code=HTTP_201_CREATED, +) +async def create_admin_token( + body: AdminTokenRequest, + api_key: str = Depends(api_key_header_scheme), +): + + instance_state = get_instance_state() + abstract_config = read_config(store_path=instance_state.store_path) + + authenticate_admin(instance_state, abstract_config, api_key) + + # Check for token content + if not body.representation: + detail='Empty administrator token is not allowed' + raise HTTPException(status_code=HTTP_406_NOT_ACCEPTABLE, detail=detail) + + if not hash_matcher.match(body.representation.strip()): + detail='Hashed token is not a 64-digits hex-number' + raise HTTPException(status_code=HTTP_406_NOT_ACCEPTABLE, detail=detail) + + # Check for existing token-name + if 
body.name in abstract_config.admin_tokens: + raise HTTPException( + status_code=HTTP_409_CONFLICT, + detail=f"Admin token with name '{body.name}' already exists.", + ) + + # It is sufficient to add the new admin token to the admin_token dictionary + # in order to manifest the new configuration. + abstract_config.admin_tokens[body.name] = AdminTokenConfig( + representation=body.representation, + ) + + # Persist the configuration. + store_config( + store_path=instance_state.store_path, + config=abstract_config, + ) + + +@router.get( + '/admin_tokens', + tags=['Administration interface'], + name='Get admin token names', +) +async def get_admin_token( + api_key: str = Depends(api_key_header_scheme), +) -> list[str]: + instance_state = get_instance_state() + abstract_config = read_config(store_path=instance_state.store_path) + + authenticate_admin(instance_state, abstract_config, api_key) + + return list(abstract_config.admin_tokens) + ( + [] + if instance_state.bootstrap_token is None + else ['__bootstrap__'] + ) + + +@router.delete( + '/admin_tokens/{token_name}', + tags=['Administration interface'], + name='Delete admin token with name', +) +async def delete_admin_token( + token_name: str, + api_key: str = Depends(api_key_header_scheme), +): + + instance_state = get_instance_state() + abstract_config = read_config(store_path=instance_state.store_path) + + authenticate_admin(instance_state, abstract_config, api_key) + + # Check for token existence + if token_name not in abstract_config.admin_tokens: + raise HTTPException( + status_code=HTTP_404_NOT_FOUND, + detail=f"Admin token with name '{token_name}' does not exist.", + ) + + del abstract_config.admin_tokens[token_name] + + # Persist the configuration. 
+ store_config( + store_path=instance_state.store_path, + config=abstract_config, + ) diff --git a/dump_things_service/utils.py b/dump_things_service/utils.py index 064dcf3..328e3e9 100644 --- a/dump_things_service/utils.py +++ b/dump_things_service/utils.py @@ -1,3 +1,11 @@ +""" + + +To speed up processing, multiple indices could be introduced, e.g.: + +- token representation -> token name + +""" from __future__ import annotations import logging @@ -12,33 +20,38 @@ from typing import ( import fsspec from fastapi import HTTPException from rdflib import Graph -from starlette.status import HTTP_500_INTERNAL_SERVER_ERROR from dump_things_service import ( HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, - HTTP_404_NOT_FOUND, HTTP_413_CONTENT_TOO_LARGE, HTTP_503_SERVICE_UNAVAILABLE, ) +from dump_things_service.abstract_config import ( + Configuration, + RecordDirBackendConfig, + TokenModes, + TokenPermission, + mode_mapping, + check_collection, + get_default_token_config, + get_token_config_for_representation_and_collection, + get_mapping_function_by_name, +) from dump_things_service.auth import ( AuthenticationError, AuthenticationInfo, ) -from dump_things_service.token import ( - TokenPermission, - get_token_parts, -) if TYPE_CHECKING: from pathlib import Path from dump_things_service import JSON - from dump_things_service.backends.record_dir import RecordDirStore - from dump_things_service.backends.sqlite import SQLiteBackend - from dump_things_service.config import InstanceConfig - from dump_things_service.store.model_store import ModelStore + from dump_things_service.backends.record_dir import _RecordDirStore + from dump_things_service.backends.sqlite import _SQLiteBackend + from dump_things_service.instance_state import InstanceState + from dump_things_service.store.model_store import _ModelStore logger = logging.getLogger('dump_things_service') @@ -57,7 +70,7 @@ def sys_path(paths: list[str | Path]): def read_url(url: str) -> str: """ - Read the 
content of an URL into memory. + Read the content of a URL into memory. """ open_file = fsspec.open(url, 'rt') with open_file as f: @@ -81,17 +94,6 @@ def combine_ttl(documents: list[str]) -> str: return reduce(lambda g1, g2: g1 + g2, graphs).serialize(format='ttl') -def get_schema_type_curie( - instance_config: InstanceConfig, - collection: str, - class_name: str, -) -> str: - schema_url = instance_config.schemas[collection] - schema_module = instance_config.conversion_objects[schema_url]['schema_module'] - class_object = getattr(schema_module, class_name) - return class_object.class_class_curie - - @contextmanager def wrap_http_exception( exception_class: type[BaseException] = ValueError, @@ -109,13 +111,32 @@ def wrap_http_exception( def join_default_token_permissions( - instance_config: InstanceConfig, + abstract_configuration: Configuration, + instance_state: InstanceState, permissions: TokenPermission, collection: str, ) -> TokenPermission: - default_token_name = instance_config.collections[collection].default_token - default_token_permissions = instance_config.tokens[collection][default_token_name]['permissions'] - result = TokenPermission() + + result = permissions.model_copy() + + # Get the default token name. If a default token is not defined, return + # a copy of the given permissions unchanged. A collection might define a default + # token that does not yet exist. We allow this inconsistency to decouple + # token and collection creation, i.e., to allow creating a collection first + # and a token later. + default_token_name = abstract_configuration.collections[collection].default_token + if default_token_name not in abstract_configuration.tokens: + return result + + # We allow inconsistencies in token/collection configuration space. This + # allows an administrator to create tokens and collections in two separate + # steps. Therefore, we have to check whether the referred default token + # is actually defined for the collection. 
+ if collection not in abstract_configuration.tokens[default_token_name].collections: + return result + + default_token_mode = abstract_configuration.tokens[default_token_name].collections[collection].mode + default_token_permissions = mode_mapping[TokenModes(default_token_mode)] result.curated_read = ( permissions.curated_read | default_token_permissions.curated_read ) @@ -128,54 +149,15 @@ def join_default_token_permissions( return result -def check_collection( - instance_config: InstanceConfig, - collection: str, -): - if collection not in instance_config.collections: - raise HTTPException( - status_code=HTTP_404_NOT_FOUND, - detail=f"No such collection: '{collection}'.", - ) - - -def check_label( - instance_config: InstanceConfig, - collection: str, - label: str, -): - # Get the on-disk labels for the collection - if ( - label not in get_config_labels(instance_config, collection) - and label not in get_on_disk_labels(instance_config, collection) - ): - raise HTTPException( - status_code=HTTP_404_NOT_FOUND, - detail=f"No incoming label: '{label}' in collection: '{collection}'.", - ) - - -def get_config_labels( - instance_config: InstanceConfig, - collection: str, -) -> set[str]: - check_collection(instance_config, collection) - return { - token['incoming_label'] - for token in instance_config.tokens[collection].values() - if token['incoming_label'] != '' - } - - def get_on_disk_labels( - instance_config: InstanceConfig, - collection: str, + store_path: Path, + abstract_config: Configuration, + collection: str, ) -> set[str]: - check_collection(instance_config, collection) + check_collection(abstract_config, collection) incoming_path = ( - instance_config.store_path - / instance_config.collections[collection].incoming + store_path / abstract_config.collections[collection].incoming ) if not incoming_path or not incoming_path.exists(): return set() @@ -187,94 +169,30 @@ def get_on_disk_labels( } -def get_default_token_name( - instance_config: InstanceConfig, - 
collection: str -) -> str: - check_collection(instance_config, collection) - return instance_config.collections[collection].default_token - - -async def process_token( - instance_config: InstanceConfig, - api_key: str, - collection: str, -) -> tuple[TokenPermission, ModelStore]: - token = ( - get_default_token_name(instance_config, collection) - if api_key is None - else api_key - ) - - token_store, token, token_permissions, _ = get_token_store( - instance_config, - collection, - token, - ) - final_permissions = join_default_token_permissions( - instance_config, token_permissions, collection - ) - - # Check for maintenance mode - if collection in instance_config.maintenance_mode: - if not ( - final_permissions.curated_read - and final_permissions.curated_write - and final_permissions.zones_access - ): - raise HTTPException( - status_code=HTTP_503_SERVICE_UNAVAILABLE, - detail=f"Collection '{collection}' is in maintenance mode", - ) - - if not final_permissions.incoming_read and not final_permissions.curated_read: - raise HTTPException( - status_code=HTTP_403_FORBIDDEN, - detail=f"No read access to curated or incoming data in collection '{collection}'.", - ) - return final_permissions, token_store - - -def resolve_hashed_token( - instance_config: InstanceConfig, - collection_name: str, - token: str, -) -> str: - - # Check for hashed token and return the hashed token value instead - # of the plain text token value if the token is hashed. - if '-' in token: - return instance_config.hashed_tokens[collection_name].get( - get_token_parts(token)[0], - token, - ) - return token - - def authenticate_token( - instance_config: InstanceConfig, + instance_state: InstanceState, collection_name: str, - plain_token: str, + token_representation: str, ) -> AuthenticationInfo: # Try to authenticate the token with the authentication providers that # are associated with the collection. 
auth_info = None messages = [] - for auth_provider in instance_config.auth_providers[collection_name]: + for auth_source in instance_state.auth_sources[collection_name]: try: - logger.debug('trying to authenticate with %s', auth_provider) - auth_info = auth_provider.authenticate(plain_token) + logger.debug('trying to authenticate with %s', auth_source) + auth_info = auth_source.authenticate(token_representation) break except AuthenticationError as ae: logger.debug( 'Authentication provider %s could not ' 'authenticate token for collection %s: %s', - auth_provider, + auth_source, collection_name, str(ae), ) - messages.append(f'{auth_provider.__class__.__name__} failed with: {ae}') + messages.append(f'{auth_source.__class__.__name__} failed with: {ae}') continue if not auth_info: @@ -289,38 +207,33 @@ def authenticate_token( def get_token_store( - instance_config: InstanceConfig, + abstract_config: Configuration, + instance_state: InstanceState, collection_name: str, - plain_token: str -) -> tuple[ModelStore, str, TokenPermission, str] | tuple[None, None, None, None]: - check_collection(instance_config, collection_name) + token_representation: str, +) -> tuple[_ModelStore, TokenPermission, str] | tuple[None, None, None, None]: # Try to authenticate the token with the authentication providers that # are associated with the collection. - auth_info = authenticate_token(instance_config, collection_name, plain_token) - permissions = auth_info.token_permission - - # If the token is hashed, get the hashed value. This is required because - # we associate token info with the hashed version of the token. - hashed_token = resolve_hashed_token( - instance_config, + auth_info = authenticate_token( + instance_state, collection_name, - plain_token, + token_representation, ) + permissions = auth_info.token_permission # If the token has no incoming-read or incoming-write permissions, we do not # need to create a store. 
if not permissions.incoming_read and not permissions.incoming_write: - instance_config.token_stores[collection_name][plain_token] = ( + instance_state.incoming_stores[collection_name][token_representation] = ( None, - hashed_token, permissions, auth_info.user_id, ) - return instance_config.token_stores[collection_name][plain_token] + return instance_state.incoming_stores[collection_name][token_representation] # Check whether the collection has an incoming definition - incoming = instance_config.incoming.get(collection_name) + incoming = abstract_config.collections[collection_name].incoming if not incoming: raise HTTPException( status_code=HTTP_401_UNAUTHORIZED, @@ -328,117 +241,131 @@ def get_token_store( ) # Check whether a store for this collection and token does already exist. - store_info = instance_config.token_stores[collection_name].get(plain_token) + store_info = instance_state.incoming_stores[collection_name].get(token_representation) if store_info: return store_info - store_dir = instance_config.store_path / incoming / auth_info.incoming_label + store_dir = instance_state.store_path / incoming / auth_info.incoming_label token_store = create_token_store( - instance_config=instance_config, + abstract_configuration=abstract_config, + instance_state=instance_state, collection_name=collection_name, store_dir=store_dir, ) - instance_config.token_stores[collection_name][plain_token] = ( + instance_state.incoming_stores[collection_name][token_representation] = ( token_store, - hashed_token, permissions, auth_info.user_id, ) - return instance_config.token_stores[collection_name][plain_token] + return instance_state.incoming_stores[collection_name][token_representation] + + +def create_store( + abstract_configuration: Configuration, + instance_state: InstanceState, + collection_name: str, +) -> _ModelStore: + collection_curated_path = abstract_configuration.collections[collection_name].curated + return create_token_store( + 
abstract_configuration=abstract_configuration, + instance_state=instance_state, + collection_name=collection_name, + store_dir=instance_state.store_path / collection_curated_path, + ) def create_token_store( - instance_config: InstanceConfig, + abstract_configuration: Configuration, + instance_state: InstanceState, collection_name: str, store_dir: Path, -) -> ModelStore: +) -> _ModelStore: from dump_things_service.backends.schema_type_layer import SchemaTypeLayer - from dump_things_service.config import ( - ConfigError, - get_backend_and_extension, - ) + from dump_things_service.abstract_config import get_backend_and_extension + from dump_things_service.exceptions import ConfigError from dump_things_service.store.model_store import ModelStore - # Check if the store was already created and if it was created for the - # same schema. - if store_dir in instance_config.all_stores: - existing_collection_name, existing_model_store = instance_config.all_stores[store_dir] - if ( - existing_collection_name != collection_name - and instance_config.schemas[existing_collection_name] != instance_config.schemas[collection_name] - ): - msg = ( - f"collections '{existing_collection_name}' and " - f"'{collection_name}' with different schemas map onto the same" - f" storage directory: '/{store_dir.name}'" - ) - raise HTTPException( - status_code=HTTP_500_INTERNAL_SERVER_ERROR, - detail=msg, - ) - return existing_model_store + # One early requirement for the service was to be able to specify + # arbitrary directories for curated stores and incoming stores. This + # explicitly included the use case where an incoming store and a + # curated store are identical. This has the following consequences: + # + # 1. Any collection might have multiple incoming stores that use the same + # directory as the curated store. + # + # 2. Multiple collections might share curated or incoming directories with + # other stores. + # + # From 1. 
follows that, for efficiency and consistency reasons, existing + # backends for a directory should be reused. With 2. one has to check that + # the collections that specify the backend have matching schemas. Schemas + # must match if the same backend, i.e., the same directory and basic backend + # type (basic backend types are `record_dir` or `sqlite`) are used. + # If different backend types are used (which is possible in the same + # directory), the schemas could in principle be different. store_dir.mkdir(parents=True, exist_ok=True) + schema_uri = abstract_configuration.collections[collection_name].schema_location - schema_uri = instance_config.schemas[collection_name] - - # We get the backend information from the curated store - backend_type = instance_config.backend[collection_name].type - backend_name, extension = get_backend_and_extension(backend_type) - - backend = instance_config.curated_stores[collection_name].backend + # We get the backend information from the abstract configuration + backend_config = abstract_configuration.collections[collection_name].backend + backend_name, extension = get_backend_and_extension(backend_config.type) if backend_name == 'record_dir': - # The configuration routines have read the backend configuration of the - # curated store from disk and stored it in `instance_config`. We fetch - # it from there. 
- if extension == 'stl': - backend = backend.backend - token_store = create_record_dir_token_store( + backend = create_record_dir_token_store_backend( store_dir=store_dir, - order_by=backend.order_by, - schema_uri=instance_config.schemas[collection_name], - mapping_function=backend.pid_mapping_function, - suffix=backend.suffix, + order_by=instance_state.order_by, + schema_uri=schema_uri, + mapping_function=backend_config.mapping_method, + suffix='yaml', ) elif backend_name == 'sqlite': - token_store = create_sqlite_token_store( + backend = create_sqlite_token_store_backend( store_dir=store_dir, - order_by=backend.order_by, + order_by=instance_state.order_by, ) else: # This should not happen because we base our decision on already # existing backends. - msg = f'Unsupported backend type: `{backend_type}`.' + msg = f'Unsupported backend type: `{backend_name}`.' raise ConfigError(msg) if extension == 'stl': - token_store = SchemaTypeLayer(backend=token_store, schema=schema_uri) + backend = SchemaTypeLayer(backend=backend, schema=schema_uri) - submission_tags = instance_config.collections[collection_name].submission_tags - tags = { - 'id': submission_tags.submitter_id_tag, - 'time': submission_tags.submission_time_tag, - } - model_store = ModelStore(backend=token_store, schema=schema_uri, tags=tags) - instance_config.all_stores[store_dir] = (collection_name, model_store) - - return model_store + submission_tags = abstract_configuration.collections[collection_name].submission_tags + return ModelStore( + schema=schema_uri, + backend=backend, + tags={ + 'id': submission_tags.submitter_id_tag, + 'time': submission_tags.submission_time_tag, + }, + ) -def create_record_dir_token_store( +def create_record_dir_token_store_backend( store_dir: Path, order_by: list[str], schema_uri: str, - mapping_function: Callable, + mapping_function: str, suffix: str, -) -> RecordDirStore: +) -> _RecordDirStore: + from dump_things_service.instance_state import record_dir_config_file_name from 
dump_things_service.backends.record_dir import RecordDirStore + # Write the configuration to the store, if it does not yet exist. + if not (store_dir / record_dir_config_file_name).exists(): + write_record_dir_config( + path=store_dir, + mapping_function=mapping_function, + schema=schema_uri, + ) + store_backend = RecordDirStore( root=store_dir, - pid_mapping_function=mapping_function, + pid_mapping_function=get_mapping_function_by_name(mapping_function), suffix=suffix, order_by=order_by, ) @@ -446,10 +373,29 @@ def create_record_dir_token_store( return store_backend -def create_sqlite_token_store( +def write_record_dir_config( + path: Path, + mapping_function: str, + schema: str, +): + from dump_things_service.instance_state import record_dir_config_file_name + + record_dir_config_file_path = path / record_dir_config_file_name + if not record_dir_config_file_path.exists(): + record_dir_config_file_path.write_text(f"""# RecordDir Config +type: records +version: 1 +schema: {schema} +format: yaml +idfx: {mapping_function} +""", + ) + + +def create_sqlite_token_store_backend( store_dir: Path, order_by: list[str], -) -> SQLiteBackend: +) -> _SQLiteBackend: from dump_things_service.backends.sqlite import SQLiteBackend from dump_things_service.backends.sqlite import ( record_file_name as sqlite_record_file_name, @@ -473,3 +419,89 @@ def check_bounds( detail=f"Too many records found in collection '{collection}'. 
" f'Please use pagination (/{collection}{alternative_url}).', ) + + +async def process_token( + abstract_config: Configuration, + instance_state: InstanceState, + api_key: str | None, + collection: str, +) -> tuple[TokenPermission, _ModelStore]: + + if api_key is None: + token_config = get_default_token_config(abstract_config, collection) + else: + token_elements = get_token_config_for_representation_and_collection( + abstract_config, + collection_name=collection, + token_representation=api_key, + ) + token_config = token_elements[1] if token_elements else None + + if not token_config: + detail = f"invalid token for collection '{collection}'" + raise HTTPException( + status_code=HTTP_401_UNAUTHORIZED, + detail=detail, + ) + + token_store, token_permissions, user_id = get_token_store( + abstract_config, + instance_state, + collection, + token_config.representation, + ) + final_permissions = join_default_token_permissions( + abstract_config, instance_state, token_permissions, collection + ) + + # Check for maintenance mode + if collection in instance_state.maintenance_mode: + if not ( + final_permissions.curated_read + and final_permissions.curated_write + and final_permissions.zones_access + ): + raise HTTPException( + status_code=HTTP_503_SERVICE_UNAVAILABLE, + detail=f"Collection '{collection}' is in maintenance mode", + ) + + if not final_permissions.incoming_read and not final_permissions.curated_read: + raise HTTPException( + status_code=HTTP_403_FORBIDDEN, + detail=f"No read access to curated or incoming data in collection '{collection}'.", + ) + return final_permissions, token_store + + +def get_required_incoming_labels( + abstract_config: Configuration, + collection_name: str, +) -> set[str]: + return set( + map( + lambda x: x[1], + get_required_incoming_info(abstract_config, collection_name), + ) + ) + + +def get_required_incoming_info( + abstract_config: Configuration, + collection_name: str, +) -> set[tuple[str, str]]: + return { + (token_name, 
this_collection_info.incoming_label) + for token_name, token_info in abstract_config.tokens.items() + for this_collection_name, this_collection_info in token_info.collections.items() + if this_collection_name == collection_name and mode_mapping[ + TokenModes(this_collection_info.mode) + ].incoming_write is True + } + + +def var_escape( + name: str, +) -> str: + return name.replace('_', '___').replace('-', '_0_') diff --git a/dump_things_service/validate.py b/dump_things_service/validate.py new file mode 100644 index 0000000..4dd6f6f --- /dev/null +++ b/dump_things_service/validate.py @@ -0,0 +1,98 @@ +from typing import Any + +from fastapi import ( + Depends, + HTTPException, +) +from pydantic import ( + BaseModel, + TypeAdapter, + ValidationError, +) +from starlette.responses import JSONResponse + +from dump_things_service import ( + HTTP_400_BAD_REQUEST, + HTTP_403_FORBIDDEN, + HTTP_422_UNPROCESSABLE_CONTENT, + Format, +) +from dump_things_service.abstract_config import ( + check_collection, + get_config, + get_default_token_name, +) +from dump_things_service.api_key import api_key_header_scheme +from dump_things_service.converter import FormatConverter +from dump_things_service.instance_state import get_instance_state +from dump_things_service.utils import ( + get_token_store, + join_default_token_permissions, + wrap_http_exception, +) + + +def validate_record( + collection: str, + data: BaseModel | str, + class_name: str, + model: Any, + input_format: Format, + api_key: str | None = Depends(api_key_header_scheme), +) -> JSONResponse: + + instance_state = get_instance_state() + abstract_config = get_config() + + if input_format == Format.json and isinstance(data, str): + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail='Invalid JSON data provided.' + ) + + if input_format == Format.ttl and not isinstance(data, str): + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail='Invalid ttl data provided.' 
+ ) + + check_collection(abstract_config, collection) + + token = ( + get_default_token_name(abstract_config, collection) + if api_key is None + else api_key + ) + + store, token_permissions, user_id = get_token_store( + abstract_config, + instance_state, + collection, + token, + ) + final_permissions = join_default_token_permissions( + abstract_config, + instance_state, + token_permissions, + collection, + ) + if not final_permissions.incoming_write: + raise HTTPException( + status_code=HTTP_403_FORBIDDEN, + detail=f"Not authorized to validate records for collection '{collection}'.", + ) + + if input_format == Format.ttl: + with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Conversion error'): + json_object = FormatConverter( + abstract_config.collections[collection].schema_location, + input_format=Format.ttl, + output_format=Format.json, + ).convert(data, class_name) + with wrap_http_exception(ValidationError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): + TypeAdapter(getattr(model, class_name)).validate_python(json_object) + else: + # Try to convert it into TTL to detect potential errors before storing + # the record + with wrap_http_exception(ValueError, status_code=HTTP_422_UNPROCESSABLE_CONTENT, header='Validation error'): + instance_state.validators[collection].validate(data) + + return JSONResponse(True) diff --git a/pyproject.toml b/pyproject.toml index a3dc3fe..99abeed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ authors = [ { name = "Christian Mönch", email = "christian.moench@web.de" }, ] classifiers = [ + "License :: OSI Approved :: MIT License", "Development Status :: 4 - Beta", "Programming Language :: Python", "Programming Language :: Python :: 3.8", @@ -53,6 +54,8 @@ dump-things-pid-check = "dump_things_service.commands.check_pids:main" dump-things-create-merged-schema = "dump_things_service.commands.create_merged_schema:main" dump-things-gitaudit-report = 
"dump_things_service.commands.gitaudit_report:main" dump-things-gitaudit-rebuild-index = "dump_things_service.commands.gitaudit_rebuild_index:main" +dump-things-load-config = "dump_things_service.commands.load_config:main" +dump-things-hash-token = "dump_things_service.commands.hash_token:main" [tool.hatch.build.targets.wheel] exclude = [