From f0e3ad6eda04a2965247bf9d71092a186f1d98f3 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 27 Mar 2026 09:55:38 +0100 Subject: [PATCH 1/7] add RecordComparer-class This commit adds the class RecordComparer. Instances of this class can compare dictionaries, while ignoring certain elements. This can be useful, for example, to determine if two records are equal, except for the content of the key 'annotations'. --- .../dtc_plugins/common/record_comparer.py | 113 ++++++++++++++++++ .../dtc_plugins/common/tests/__init__.py | 0 .../common/tests/test_record_comparer.py | 34 ++++++ 3 files changed, 147 insertions(+) create mode 100644 dump_things_pyclient/commands/dtc_plugins/common/record_comparer.py create mode 100644 dump_things_pyclient/commands/dtc_plugins/common/tests/__init__.py create mode 100644 dump_things_pyclient/commands/dtc_plugins/common/tests/test_record_comparer.py diff --git a/dump_things_pyclient/commands/dtc_plugins/common/record_comparer.py b/dump_things_pyclient/commands/dtc_plugins/common/record_comparer.py new file mode 100644 index 0000000..b1b8601 --- /dev/null +++ b/dump_things_pyclient/commands/dtc_plugins/common/record_comparer.py @@ -0,0 +1,113 @@ +from __future__ import annotations + + +JSON = dict[str, 'JSON'] | list['JSON'] | int | str | float | bool | None + + +class RecordComparer: + """Compare dictionaries, ignoring specified elements + + `RecordComparer` compares two dictionaries, ignoring the elements that + are specified by `ignore_spec`. + + The specification defines a path from the + "root" of the dictionary, by specifying dictionary keys or list indices. + Keys and indices are separated by a dot, i.e., by `.` Indices can be + integers or `*`. `*` matches any index. For example: + + Given the dictionary `r = {'key_1': {'key_1_1': 11', 'key_1_2': 22}}`, the + specification `'key_1.key_1_2'` matches `r['key_1']['key_2']`. This + entry will be ignored during comparison. 
Therefore, the comparison of + the two records: + + `r1 = {'key_1': {'key_1_1': 11, 'key_1_2': 22}}` + `r2 = {'key_1': {'key_1_1': 11, 'key_1_2': 44}}` + + with ignore_spec: `key_1.key_1_2`, `r1` and `r2` are considered to + be equal. For example, to ignore `annotations` when comparing two + dictionaries, `ignore_spec` should be `'annotations'`. + + Indices are specified by `'[]'`, where `` is either an + integer or `*`. For example, the comparison of the two records: + + `r3 = {'a': [{'b': 1, 'x': 2}, {'b': 3, 'y': 4}]}` + `r4 = {'a': [{'b': 1, 'x': 2}, {'y': 4}]}` + + with ignore_spec: `a.[1].b` would yield `True`, i.e., the two records are + considered as equal. + + With `ignore_spec`: `a.[*].b`, `r3` would be considered to be equal to: + + `{'a': [{'x': 2}, {'y': 4}]}` + + because all `b`-keys in elements of `r3['a']` would be deleted. + + Note: ignore_specs only work reliably, if the dictionary keys do not + contain `.` or `[`. + """ + def __init__( + self, + *, + ignore_spec: str | None = None, + ): + self.key_list = None if not ignore_spec else ignore_spec.split('.') + + def is_equal( + self, + record_a: JSON, + record_b: JSON, + ) -> bool: + if self.key_list: + return self.clean(record_a, []) == self.clean(record_b, []) + return record_a == record_b + + def clean( + self, + record: JSON, + path: list, + ) -> JSON: + if isinstance(record, dict): + return { + k: self.clean(v, path + [k]) + for k, v in record.items() + if not self.matches(path + [k]) + } + elif isinstance(record, list): + return [ + self.clean(e, path + [f'[{i}]']) + for i, e in enumerate(record) + if not self.matches(path + [f'[{i}]']) + ] + return record + + @staticmethod + def remove_empty_structures( + record: JSON, + ) -> JSON: + if isinstance(record, dict): + return { + k: v for k, v in record.items() if v not in ({}, []) + } + elif isinstance(record, list): + return [ + e for e in record if e not in ({}, []) + ] + return record + + def matches( + self, + path: list[str], + ) -> bool: + 
"""Match path to self.key_list. All elements must match""" + if len(path) < len(self.key_list): + return False + + for element, key in zip(path, self.key_list): + if element[0] == '[' and key[0] == '[': + if key[1] == '*': + continue + elif key[1:-1] != element[1:-1]: + return False + elif element != key: + return False + return True diff --git a/dump_things_pyclient/commands/dtc_plugins/common/tests/__init__.py b/dump_things_pyclient/commands/dtc_plugins/common/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dump_things_pyclient/commands/dtc_plugins/common/tests/test_record_comparer.py b/dump_things_pyclient/commands/dtc_plugins/common/tests/test_record_comparer.py new file mode 100644 index 0000000..d8931f2 --- /dev/null +++ b/dump_things_pyclient/commands/dtc_plugins/common/tests/test_record_comparer.py @@ -0,0 +1,34 @@ + +from ..record_comparer import RecordComparer + + + +def test_example(): + ignore_spec = 'key_1.key_1_2' + comparer = RecordComparer(ignore_spec=ignore_spec) + r1 = {'key_1': {'key_1_1': 11, 'key_1_2': 22}} + r2 = {'key_1': {'key_1_1': 11, 'key_1_2': 44}} + assert comparer.is_equal(r1, r2) is True, f'Unexpected non-equal result for {r1!r} =({ignore_spec})= {r2!r} wi' + + +def test_remove_empty_structures(): + comparer = RecordComparer() + assert comparer.remove_empty_structures({'b': 1, 'a': {}}) == {'b': 1} + + +def test_index(): + ignore_spec = 'a.[1].b' + comparer = RecordComparer(ignore_spec=ignore_spec) + assert comparer.is_equal( + {'a': [{'b': 1, 'x': 2}, {'b': 3, 'y': 4}]}, + {'a': [{'b': 1, 'x': 2}, {'y': 4}]} + ) + + +def test_wildcard_index(): + ignore_spec = 'a.[*].b' + comparer = RecordComparer(ignore_spec=ignore_spec) + assert comparer.is_equal( + {'a': [{'b': 1, 'x': 2}, {'b': 3, 'y': 4}]}, + {'a': [{'x': 2}, {'y': 4}]} + ) -- 2.52.0 From bd231d21de696ff1473adf7fa77bb6b72ff69a05 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 27 Mar 2026 13:19:13 +0100 Subject: [PATCH 2/7] add 
--only-if-modifying option to post-records This commit adds the flag --only-if-modifying to the dtc subcommand `post-records`. If `--curated` is provided, it will check whether an identical record already exists in the curated area of the collection, and ---if the record already exists--- refuse to overwrite it. Record matching can be "relaxed" by excluding elements from the matching process. This can be done using the option `--ignore-spec`. --- .../commands/dtc_plugins/post_records.py | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/dump_things_pyclient/commands/dtc_plugins/post_records.py b/dump_things_pyclient/commands/dtc_plugins/post_records.py index acc8132..4092b38 100644 --- a/dump_things_pyclient/commands/dtc_plugins/post_records.py +++ b/dump_things_pyclient/commands/dtc_plugins/post_records.py @@ -7,11 +7,12 @@ import rich_click as click from rich.console import Console from rich.progress import track +from .common.record_comparer import RecordComparer from ...communicate import ( HTTPError, curated_write_record, collection_write_record, - get_session, + get_session, curated_read_record_with_pid, incoming_read_record_with_pid, ) from .common.prefix import de_prefix @@ -55,6 +56,26 @@ console = Console(file=sys.stderr) is_flag=True, help='ignore errors when posting a record and continue with remaining records', ) +@click.option( + '--only-if-modifying', + help='if provided, and if `--curated` is provided, a record will only be ' + 'posted, if it does not yet exist ' + 'in the destination, or if the posted record modifies the existing ' + 'record (to ignore certain record entries, for example, `annotations`, ' + 'when comparing records, use the option --ignore-spec.', + default=False, + is_flag=True, +) +@click.option( + '--ignore-spec', + metavar='IGNORE_SPEC', + help='define record-attributes, for example `annotations`, that should be ' + 'ignored when checking for existing records (for a description of the ' +
'specification syntax check the docstring of ' + '`dump_things_pyclient.commands.dtc_plugins.common.record_comparer.RecordComparer`).', + default=None, + is_flag=False, +) @click.option( '--dry-run', '-d', help='if provided, do not alter any data, instead print what would be done', @@ -69,6 +90,8 @@ def cli( curated, author_id, ignore_errors, + only_if_modifying, + ignore_spec, dry_run, ): """Read records of class CLASS from standard input and store them in @@ -104,6 +127,8 @@ def cli( curated, author_id, ignore_errors, + only_if_modifying, + ignore_spec, dry_run, ) ) @@ -117,6 +142,8 @@ def post_records( curated, author_id, ignore_errors, + only_if_modifying, + ignore_spec, dry_run, ) -> int: token = obj @@ -131,6 +158,16 @@ def post_records( write_record = collection_write_record keyword_args = {} + record_comparer = None + if only_if_modifying: + if not curated: + console.print('[yellow]Warning[/yellow]: ignoring --only-if-modifying because --curated was not provided') + else: + record_comparer = RecordComparer(ignore_spec=ignore_spec) + else: + if ignore_spec: + console.print('[yellow]Warning[/yellow]: ignoring --ignore-spec because --only-if-modifying was not provided') + failed = [] session = get_session() for index, line in zip(count(), track(sys.stdin, console=console)): @@ -153,6 +190,19 @@ def post_records( else: class_name = cls + if record_comparer: + existing_record = curated_read_record_with_pid( + service_url=service_url, + collection=collection, + pid=record['pid'], + token=token, + session=session, + ) + if existing_record: + if record_comparer.is_equal(existing_record, record): + console.print(f'skipping writing of record [green]{record["pid"]}[/green] because a matching record already exists') + continue + if dry_run: if curated: console.print(f'[DRY_RUN]:WRITE record [green]"{record["pid"]}"[/green] of class "{class_name}" to curated area of collection "{collection}" on "{service_url}"') -- 2.52.0 From 469da8353407a9d2d0bdd64a5ee45d5d56edf1cc 
Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Fri, 27 Mar 2026 15:49:00 +0100 Subject: [PATCH 3/7] fix help text of `--only-if-modifying` in `post-records` --- dump_things_pyclient/commands/dtc_plugins/post_records.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dump_things_pyclient/commands/dtc_plugins/post_records.py b/dump_things_pyclient/commands/dtc_plugins/post_records.py index 4092b38..2a57502 100644 --- a/dump_things_pyclient/commands/dtc_plugins/post_records.py +++ b/dump_things_pyclient/commands/dtc_plugins/post_records.py @@ -62,7 +62,7 @@ console = Console(file=sys.stderr) 'posted, if it does not yet exist ' 'in the destination, or if the posted record modifies the existing ' 'record (to ignore certain record entries, for example, `annotations`, ' - 'when comparing records, use the option --ignore-spec.', + 'when comparing records, use the option --ignore-spec).', default=False, is_flag=True, ) -- 2.52.0 From 4512c19d30804be6729cea3629d923f2230f9071 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Sat, 28 Mar 2026 09:07:38 +0100 Subject: [PATCH 4/7] add `--only-if-modifying` to `auto-curate` --- .../commands/dtc_plugins/auto_curate.py | 82 ++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/dump_things_pyclient/commands/dtc_plugins/auto_curate.py b/dump_things_pyclient/commands/dtc_plugins/auto_curate.py index ba833b1..5607669 100644 --- a/dump_things_pyclient/commands/dtc_plugins/auto_curate.py +++ b/dump_things_pyclient/commands/dtc_plugins/auto_curate.py @@ -24,6 +24,7 @@ from ...communicate import ( incoming_read_labels, incoming_read_records, ) +from .common.record_comparer import RecordComparer subcommand_name = 'auto-curate' @@ -80,7 +81,9 @@ stl_info = False help='read the change set at CHANGE_SET_DIR and post it to the curated area of the given destination server and collection. 
' 'If `--add-annotations` is provided in addition, the annotations that were recorded in the records in the incoming areas ' 'are added to the respective records before posting them to the destination (--post-change-set and ' - '--create-change-set are mutually exclusive)', + '--create-change-set are mutually exclusive). If `--only-if-modifying` is provided, a record from the change ' + 'will only be posted, if an "identical" record is not already in the destination. Here "identical" means that the ' + 'records are identical if the attributed defined in `--ignore-spec` are ignored.', type=click.Path( exists=True, file_okay=False, @@ -138,6 +141,27 @@ stl_info = False default=False, is_flag=True, ) +@click.option( + '--only-if-modifying', + help='if provided a record will only be posted, if it does not yet exist ' + 'in the destination, or if the posted record modifies the existing ' + 'record (to ignore certain record entries, for example, `annotations`, ' + 'when comparing records, use the option --ignore-spec). 
NOTE: even if ' + 'a record is no posted, it will be removed from its inbox (unless ' + '`--keep-inboxes` is provided).', + default=False, + is_flag=True, +) +@click.option( + '--ignore-spec', + metavar='IGNORE_SPEC', + help='define record-attributes, for example `annotations`, that should be ' + 'ignored when checking for existing records (for a description of the ' + 'specification syntax check the docstring of ' + '`dump_things_pyclient.commands.dtc_plugins.common.record_comparer.RecordComparer`).', + default=None, + is_flag=False, +) @click.option( '--dry-run', '-d', help='if provided, do not alter any data, instead print what would be done', @@ -161,6 +185,8 @@ def cli( include, list_labels, list_records, + only_if_modifying, + ignore_spec, dry_run, ): """Automatically move records from the incoming areas of the collection @@ -197,6 +223,8 @@ def cli( include, list_labels, list_records, + only_if_modifying, + ignore_spec, dry_run, ) ) @@ -224,6 +252,8 @@ def auto_curate( include, list_labels, list_records, + only_if_modifying, + ignore_spec, dry_run, ): curator_token = obj @@ -384,6 +414,8 @@ def auto_curate( author_id=author_id, pid=pid, add_annotations=add_annotations, + only_if_modifying=only_if_modifying, + ignore_spec=ignore_spec, dry_run=dry_run, session=session, ) @@ -403,6 +435,8 @@ def auto_curate( author_id=author_id, pid=pid, keep_inboxes=keep_inboxes, + only_if_modifying=only_if_modifying, + ignore_spec=ignore_spec, dry_run=dry_run, session=session, ) @@ -429,9 +463,19 @@ def _curate_records( author_id: str | None, pid: str | None, keep_inboxes: bool, + only_if_modifying: bool, + ignore_spec: str | None, dry_run: bool, session: Session, ) -> int: + + record_comparer = None + if only_if_modifying: + record_comparer = RecordComparer(ignore_spec=ignore_spec) + else: + if ignore_spec: + console.print('[yellow]Warning[/yellow]: ignoring --ignore-spec because --only-if-modifying was not provided') + for record, _, _, _, _ in source: if pid: if 
record['pid'] not in pid: @@ -445,6 +489,19 @@ def _curate_records( console.print(f'[yellow]Warning[/yellow]: could not determine class in record [yellow]{record["pid"]}[/yellow], ignoring it.') continue + if record_comparer: + existing_record = curated_read_record_with_pid( + service_url=service_url, + collection=collection, + pid=record['pid'], + token=curator_token, + session=session, + ) + if existing_record: + if record_comparer.is_equal(existing_record, record): + console.print(f'skipping writing of record [green]{record["pid"]}[/green] because a matching record already exists') + continue + if dry_run: console.print(f'[DRY_RUN]:WRITE record [green]"{record["pid"]}"[/green] of class "{class_name}" to collection "{destination_collection}" on "{destination_service_url}"') else: @@ -654,10 +711,20 @@ def _post_change_set( author_id: str | None, pid: str | None, add_annotations: bool, + only_if_modifying: bool, + ignore_spec: str | None, dry_run: bool, session: Session, ): pid_file_name = _pid_to_filename(pid) if pid else None + + record_comparer = None + if only_if_modifying: + record_comparer = RecordComparer(ignore_spec=ignore_spec) + else: + if ignore_spec: + console.print('[yellow]Warning[/yellow]: ignoring --ignore-spec because --only-if-modifying was not provided') + for file_name in source: if pid_file_name and pid_file_name != file_name: @@ -685,6 +752,19 @@ def _post_change_set( console.print(f'[yellow]Warning[/yellow]: could not determine class of record [yellow]{record["pid"]}[/yellow], ignoring it.') continue + if record_comparer: + existing_record = curated_read_record_with_pid( + service_url=destination_service_url, + collection=destination_collection, + pid=record['pid'], + token=destination_token, + session=session, + ) + if existing_record: + if record_comparer.is_equal(existing_record, record): + console.print(f'skipping writing of record [green]{record["pid"]}[/green] because a matching record already exists') + continue + if dry_run: 
console.print(f'[DRY_RUN]:WRITE record [green]"{record["pid"]}"[/green] of class "{class_name}" to collection "{destination_collection}" on "{destination_service_url}"') continue -- 2.52.0 From ccf810aec75485c9de75ae4b5f9d3f1c2ee9a16f Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Mon, 30 Mar 2026 22:00:51 +0200 Subject: [PATCH 5/7] replace `--ignore-spec` with `--jsonpath-spec` Use `jsonpath_ng` to implement comparison with ignored record elements. --- .../commands/dtc_plugins/auto_curate.py | 42 +++---- .../dtc_plugins/common/record_comparer.py | 113 ++++-------------- .../common/tests/test_record_comparer.py | 20 ++-- .../commands/dtc_plugins/post_records.py | 28 +++-- pyproject.toml | 1 + uv.lock | 11 ++ 6 files changed, 82 insertions(+), 133 deletions(-) diff --git a/dump_things_pyclient/commands/dtc_plugins/auto_curate.py b/dump_things_pyclient/commands/dtc_plugins/auto_curate.py index 5607669..93fd812 100644 --- a/dump_things_pyclient/commands/dtc_plugins/auto_curate.py +++ b/dump_things_pyclient/commands/dtc_plugins/auto_curate.py @@ -83,7 +83,7 @@ stl_info = False 'are added to the respective records before posting them to the destination (--post-change-set and ' '--create-change-set are mutually exclusive). If `--only-if-modifying` is provided, a record from the change ' 'will only be posted, if an "identical" record is not already in the destination. Here "identical" means that the ' - 'records are identical if the attributed defined in `--ignore-spec` are ignored.', + 'records are identical if the attributed defined in `--jsonpath-spec` are ignored.', type=click.Path( exists=True, file_okay=False, @@ -146,19 +146,21 @@ stl_info = False help='if provided a record will only be posted, if it does not yet exist ' 'in the destination, or if the posted record modifies the existing ' 'record (to ignore certain record entries, for example, `annotations`, ' - 'when comparing records, use the option --ignore-spec).
NOTE: even if ' + 'when comparing records, use the option --jsonpath-spec). NOTE: even if ' 'a record is no posted, it will be removed from its inbox (unless ' '`--keep-inboxes` is provided).', default=False, is_flag=True, ) @click.option( - '--ignore-spec', + '--jsonpath-spec', metavar='IGNORE_SPEC', - help='define record-attributes, for example `annotations`, that should be ' - 'ignored when checking for existing records (for a description of the ' - 'specification syntax check the docstring of ' - '`dump_things_pyclient.commands.dtc_plugins.common.record_comparer.RecordComparer`).', + help='a jsonpath-expression that defines record-attributes, that should be ' + 'ignored when checking for existing records. Every element that matches ' + 'the expression will be ignored when comparing records (for a ' + 'description of the specification syntax check the documentation of ' + 'https://pypi.org/project/jsonpath-ng/). For example, to ignore ' + '`annotations` use `--jsonpath_spec "annotations"`.', default=None, is_flag=False, ) @@ -186,7 +188,7 @@ def cli( list_labels, list_records, only_if_modifying, - ignore_spec, + jsonpath_spec, dry_run, ): """Automatically move records from the incoming areas of the collection @@ -224,7 +226,7 @@ def cli( list_labels, list_records, only_if_modifying, - ignore_spec, + jsonpath_spec, dry_run, ) ) @@ -253,7 +255,7 @@ def auto_curate( list_labels, list_records, only_if_modifying, - ignore_spec, + jsonpath_spec, dry_run, ): curator_token = obj @@ -415,7 +417,7 @@ def auto_curate( pid=pid, add_annotations=add_annotations, only_if_modifying=only_if_modifying, - ignore_spec=ignore_spec, + jsonpath_spec=jsonpath_spec, dry_run=dry_run, session=session, ) @@ -436,7 +438,7 @@ def auto_curate( pid=pid, keep_inboxes=keep_inboxes, only_if_modifying=only_if_modifying, - ignore_spec=ignore_spec, + jsonpath_spec=jsonpath_spec, dry_run=dry_run, session=session, ) @@ -464,17 +466,17 @@ def _curate_records( pid: str | None, keep_inboxes: bool, 
only_if_modifying: bool, - ignore_spec: str | None, + jsonpath_spec: str | None, dry_run: bool, session: Session, ) -> int: record_comparer = None if only_if_modifying: - record_comparer = RecordComparer(ignore_spec=ignore_spec) + record_comparer = RecordComparer(jsonpath_spec=jsonpath_spec) else: - if ignore_spec: - console.print('[yellow]Warning[/yellow]: ignoring --ignore-spec because --only-if-modifying was not provided') + if jsonpath_spec: + console.print('[yellow]Warning[/yellow]: ignoring --jsonpath-spec because --only-if-modifying was not provided') for record, _, _, _, _ in source: if pid: @@ -712,7 +714,7 @@ def _post_change_set( pid: str | None, add_annotations: bool, only_if_modifying: bool, - ignore_spec: str | None, + jsonpath_spec: str | None, dry_run: bool, session: Session, ): @@ -720,10 +722,10 @@ def _post_change_set( record_comparer = None if only_if_modifying: - record_comparer = RecordComparer(ignore_spec=ignore_spec) + record_comparer = RecordComparer(jsonpath_spec=jsonpath_spec) else: - if ignore_spec: - console.print('[yellow]Warning[/yellow]: ignoring --ignore-spec because --only-if-modifying was not provided') + if jsonpath_spec: + console.print('[yellow]Warning[/yellow]: ignoring --jsonpath-spec because --only-if-modifying was not provided') for file_name in source: diff --git a/dump_things_pyclient/commands/dtc_plugins/common/record_comparer.py b/dump_things_pyclient/commands/dtc_plugins/common/record_comparer.py index b1b8601..75e01ab 100644 --- a/dump_things_pyclient/commands/dtc_plugins/common/record_comparer.py +++ b/dump_things_pyclient/commands/dtc_plugins/common/record_comparer.py @@ -1,5 +1,10 @@ from __future__ import annotations +from jsonpath_ng import ( + parse, + jsonpath, +) + JSON = dict[str, 'JSON'] | list['JSON'] | int | str | float | bool | None @@ -8,106 +13,40 @@ class RecordComparer: """Compare dictionaries, ignoring specified elements `RecordComparer` compares two dictionaries, ignoring the elements that - are 
specified by `ignore_spec`. + are specified by `jsonpath_spec`. - The specification defines a path from the - "root" of the dictionary, by specifying dictionary keys or list indices. - Keys and indices are separated by a dot, i.e., by `.` Indices can be - integers or `*`. `*` matches any index. For example: + The comparator uses `jsonpath-ng` (`https://pypi.org/project/jsonpath-ng/`). + All elements that match the expression in `jsonpath_spec` will be removed + before comparing two records (the specification language is defined in the + jsonpath-ng documentation). - Given the dictionary `r = {'key_1': {'key_1_1': 11', 'key_1_2': 22}}`, the - specification `'key_1.key_1_2'` matches `r['key_1']['key_2']`. This - entry will be ignored during comparison. Therefore, the comparison of - the two records: + Given the dictionary `r = {'key_1': {'key_1_1': 11, 'key_1_2': 22}}`, the + specification `'key_1.key_1_2'` matches `r['key_1']['key_1_2']`. This + entry will be ignored during comparison. Therefore, the two records: `r1 = {'key_1': {'key_1_1': 11, 'key_1_2': 22}}` `r2 = {'key_1': {'key_1_1': 11, 'key_1_2': 44}}` - with ignore_spec: `key_1.key_1_2`, `r1` and `r2` are considered to + with `jsonpath_spec`: `key_1.key_1_2`, `r1` and `r2` are considered to be equal. For example, to ignore `annotations` when comparing two - dictionaries, `ignore_spec` should be `'annotations'`. - - Indices are specified by `'[]'`, where `` is either an - integer or `*`. For example, the comparison of the two records: - - `r3 = {'a': [{'b': 1, 'x': 2}, {'b': 3, 'y': 4}]}` - `r4 = {'a': [{'b': 1, 'x': 2}, {'y': 4}]}` - - with ignore_spec: `a.[1].b` would yield `True`, i.e., the two records are - considered as equal. - - With `ignore_spec`: `a.[*].b`, `r3` would be considered to be equal to: - - `{'a': [{'x': 2}, {'y': 4}]}` - - because all `b`-keys in elements of `r3['a']` would be deleted. - - Note: ignore_specs only work reliably, if the dictionary keys do not - contain `.` or `[`. 
+ dictionaries, `jsonpath_spec` should be `'annotations'`. """ def __init__( self, *, - ignore_spec: str | None = None, + jsonpath_spec: str | None = None, ): - self.key_list = None if not ignore_spec else ignore_spec.split('.') + self.expr = parse(jsonpath_spec) def is_equal( self, - record_a: JSON, - record_b: JSON, + record_a: dict, + record_b: dict, ) -> bool: - if self.key_list: - return self.clean(record_a, []) == self.clean(record_b, []) - return record_a == record_b - - def clean( - self, - record: JSON, - path: list, - ) -> JSON: - if isinstance(record, dict): - return { - k: self.clean(v, path + [k]) - for k, v in record.items() - if not self.matches(path + [k]) - } - elif isinstance(record, list): - return [ - self.clean(e, path + [f'[{i}]']) - for i, e in enumerate(record) - if not self.matches(path + [f'[{i}]']) - ] - return record - - @staticmethod - def remove_empty_structures( - record: JSON, - ) -> JSON: - if isinstance(record, dict): - return { - k: v for k, v in record.items() if v not in ({}, []) - } - elif isinstance(record, list): - return [ - e for e in record if e not in ({}, []) - ] - return record - - def matches( - self, - path: list[str], - ) -> bool: - """Match path to self.key_list. 
All elements must match""" - if len(path) < len(self.key_list): - return False - - for element, key in zip(path, self.key_list): - if element[0] == '[' and key[0] == '[': - if key[1] == '*': - continue - elif key[1:-1] != element[1:-1]: - return False - elif element != key: - return False - return True + return self.expr.filter( + lambda _: True, + record_a, + ) == self.expr.filter( + lambda _: True, + record_b, + ) diff --git a/dump_things_pyclient/commands/dtc_plugins/common/tests/test_record_comparer.py b/dump_things_pyclient/commands/dtc_plugins/common/tests/test_record_comparer.py index d8931f2..752ad36 100644 --- a/dump_things_pyclient/commands/dtc_plugins/common/tests/test_record_comparer.py +++ b/dump_things_pyclient/commands/dtc_plugins/common/tests/test_record_comparer.py @@ -2,23 +2,17 @@ from ..record_comparer import RecordComparer - def test_example(): - ignore_spec = 'key_1.key_1_2' - comparer = RecordComparer(ignore_spec=ignore_spec) + jsonpath_spec = 'key_1.key_1_2' + comparer = RecordComparer(jsonpath_spec=jsonpath_spec) r1 = {'key_1': {'key_1_1': 11, 'key_1_2': 22}} r2 = {'key_1': {'key_1_1': 11, 'key_1_2': 44}} - assert comparer.is_equal(r1, r2) is True, f'Unexpected non-equal result for {r1!r} =({ignore_spec})= {r2!r} wi' - - -def test_remove_empty_structures(): - comparer = RecordComparer() - assert comparer.remove_empty_structures({'b': 1, 'a': {}}) == {'b': 1} + assert comparer.is_equal(r1, r2) is True, f'Unexpected non-equal result for {r1!r} =({jsonpath_spec})= {r2!r} wi' def test_index(): - ignore_spec = 'a.[1].b' - comparer = RecordComparer(ignore_spec=ignore_spec) + jsonpath_spec = 'a.[1].b' + comparer = RecordComparer(jsonpath_spec=jsonpath_spec) assert comparer.is_equal( {'a': [{'b': 1, 'x': 2}, {'b': 3, 'y': 4}]}, {'a': [{'b': 1, 'x': 2}, {'y': 4}]} @@ -26,8 +20,8 @@ def test_index(): def test_wildcard_index(): - ignore_spec = 'a.[*].b' - comparer = RecordComparer(ignore_spec=ignore_spec) + jsonpath_spec = 'a.[*].b' + comparer = 
RecordComparer(jsonpath_spec=jsonpath_spec) assert comparer.is_equal( {'a': [{'b': 1, 'x': 2}, {'b': 3, 'y': 4}]}, {'a': [{'x': 2}, {'y': 4}]} diff --git a/dump_things_pyclient/commands/dtc_plugins/post_records.py b/dump_things_pyclient/commands/dtc_plugins/post_records.py index 2a57502..22bc37a 100644 --- a/dump_things_pyclient/commands/dtc_plugins/post_records.py +++ b/dump_things_pyclient/commands/dtc_plugins/post_records.py @@ -7,7 +7,6 @@ import rich_click as click from rich.console import Console from rich.progress import track -from .common.record_comparer import RecordComparer from ...communicate import ( HTTPError, curated_write_record, @@ -15,6 +14,7 @@ from ...communicate import ( get_session, curated_read_record_with_pid, incoming_read_record_with_pid, ) from .common.prefix import de_prefix +from .common.record_comparer import RecordComparer subcommand_name = 'post-records' @@ -62,17 +62,19 @@ console = Console(file=sys.stderr) 'posted, if it does not yet exist ' 'in the destination, or if the posted record modifies the existing ' 'record (to ignore certain record entries, for example, `annotations`, ' - 'when comparing records, use the option --ignore-spec).', + 'when comparing records, use the option --jsonpath-spec).', default=False, is_flag=True, ) @click.option( - '--ignore-spec', + '--jsonpath-spec', metavar='IGNORE_SPEC', - help='define record-attributes, for example `annotations`, that should be ' - 'ignored when checking for existing records (for a description of the ' - 'specification syntax check the docstring of ' - '`dump_things_pyclient.commands.dtc_plugins.common.record_comparer.RecordComparer`).', + help='a jsonpath-expression that defines record-attributes, that should be ' + 'ignored when checking for existing records. Every element that matches ' + 'the expression will be ignored when comparing records (for a ' + 'description of the specification syntax check the documentation of ' + 'https://pypi.org/project/jsonpath-ng/). 
For example, to ignore ' + '`annotations` use `--jsonpath_spec "annotations"`.', default=None, is_flag=False, ) @@ -91,7 +93,7 @@ def cli( author_id, ignore_errors, only_if_modifying, - ignore_spec, + jsonpath_spec, dry_run, ): """Read records of class CLASS from standard input and store them in @@ -128,7 +130,7 @@ def cli( author_id, ignore_errors, only_if_modifying, - ignore_spec, + jsonpath_spec, dry_run, ) ) @@ -143,7 +145,7 @@ def post_records( author_id, ignore_errors, only_if_modifying, - ignore_spec, + jsonpath_spec, dry_run, ) -> int: token = obj @@ -163,10 +165,10 @@ def post_records( if not curated: console.print('[yellow]Warning[/yellow]: ignoring --only-if-modifying because --curated was not provided') else: - record_comparer = RecordComparer(ignore_spec=ignore_spec) + record_comparer = RecordComparer(jsonpath_spec=jsonpath_spec) else: - if ignore_spec: - console.print('[yellow]Warning[/yellow]: ignoring --ignore-spec because --only-if-modifying was not provided') + if jsonpath_spec: + console.print('[yellow]Warning[/yellow]: ignoring --jsonpath-spec because --only-if-modifying was not provided') failed = [] session = get_session() diff --git a/pyproject.toml b/pyproject.toml index 57b9c16..dd20950 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ authors = [ ] dependencies = [ "click", + "jsonpath-ng", "pyyaml", "requests", "rich-click", diff --git a/uv.lock b/uv.lock index 235c217..4e8f645 100644 --- a/uv.lock +++ b/uv.lock @@ -421,6 +421,7 @@ version = "0.2.16" source = { virtual = "." 
} dependencies = [ { name = "click" }, + { name = "jsonpath-ng" }, { name = "pyyaml" }, { name = "requests" }, { name = "rich-click" }, @@ -449,6 +450,7 @@ requires-dist = [ { name = "click" }, { name = "dump-things-service", marker = "extra == 'tests'", specifier = ">=5.6.1" }, { name = "dump-things-service", marker = "extra == 'ttl'", specifier = ">=5.6.1" }, + { name = "jsonpath-ng" }, { name = "pytest", marker = "extra == 'tests'", specifier = ">=9.0.1" }, { name = "pyyaml" }, { name = "requests" }, @@ -1058,6 +1060,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/90/0d93963711f811efe528e3cead2f2bfb78c196df74d8a24fe8d655288e50/jsonasobj2-1.0.4-py3-none-any.whl", hash = "sha256:12e86f86324d54fcf60632db94ea74488d5314e3da554c994fe1e2c6f29acb79", size = 6324, upload-time = "2021-06-02T17:43:27.126Z" }, ] +[[package]] +name = "jsonpath-ng" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/32/58/250751940d75c8019659e15482d548a4aa3b6ce122c515102a4bfdac50e3/jsonpath_ng-1.8.0.tar.gz", hash = "sha256:54252968134b5e549ea5b872f1df1168bd7defe1a52fed5a358c194e1943ddc3", size = 74513, upload-time = "2026-02-24T14:42:06.182Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/99/33c7d78a3fb70d545fd5411ac67a651c81602cc09c9cf0df383733f068c5/jsonpath_ng-1.8.0-py3-none-any.whl", hash = "sha256:b8dde192f8af58d646fc031fac9c99fe4d00326afc4148f1f043c601a8cfe138", size = 67844, upload-time = "2026-02-28T00:53:19.637Z" }, +] + [[package]] name = "jsonpointer" version = "3.0.0" -- 2.52.0 From 1ebc58f38c489e8ec0cb2dc104deefae9691d787 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 31 Mar 2026 09:57:59 +0200 Subject: [PATCH 6/7] add tests for --only-if-modifying in post_records --- .../dtc_plugins/common/record_comparer.py | 18 +++-- .../tests/test_post_records.py | 78 +++++++++++++++++++ 2 files changed, 88 insertions(+), 8 deletions(-) diff --git 
a/dump_things_pyclient/commands/dtc_plugins/common/record_comparer.py b/dump_things_pyclient/commands/dtc_plugins/common/record_comparer.py index 75e01ab..27081bc 100644 --- a/dump_things_pyclient/commands/dtc_plugins/common/record_comparer.py +++ b/dump_things_pyclient/commands/dtc_plugins/common/record_comparer.py @@ -36,17 +36,19 @@ class RecordComparer: *, jsonpath_spec: str | None = None, ): - self.expr = parse(jsonpath_spec) + self.expr = None if jsonpath_spec is None else parse(jsonpath_spec) def is_equal( self, record_a: dict, record_b: dict, ) -> bool: - return self.expr.filter( - lambda _: True, - record_a, - ) == self.expr.filter( - lambda _: True, - record_b, - ) + if self.expr: + return self.expr.filter( + lambda _: True, + record_a, + ) == self.expr.filter( + lambda _: True, + record_b, + ) + return record_a == record_b diff --git a/dump_things_pyclient/tests/test_post_records.py b/dump_things_pyclient/tests/test_post_records.py index 21451c8..634aa6d 100644 --- a/dump_things_pyclient/tests/test_post_records.py +++ b/dump_things_pyclient/tests/test_post_records.py @@ -221,3 +221,81 @@ def test_dtc_post_record_any_class(dump_things_service): ) assert all(r in cleaned_disk_record for r in document_records) + + +def test_dtc_post_records_if_changes(dump_things_service, monkeypatch): + from dump_things_pyclient.commands.dtc_plugins.post_records import console + + port, store = dump_things_service + + print_calls = [] + monkeypatch.setattr( + console, + 'print', + lambda *args: print_calls.extend(args), + ) + + existing_record = { + "pid": f"test:post_records_if_changes", + "given_name": f"klaus", + 'schema_type': 'test:Person', + 'annotations': { + 'https://submitter.example.com': 'submitter_1', + 'https://counter.example.com': '1', + }, + } + + new_record = { + k: v for k, v in existing_record.items() if k not in ('annotations',) + } + new_record['annotations'] = { + 'https://submitter.example.com': 'submitter_2', + 'https://counter.example.com': '2', + 
} + + runner = CliRunner() + result = runner.invoke( + cli, + [ + '--token=token-curator', + 'post-records', + '--curated', + f'http://127.0.0.1:{port}', 'collection_1', '*', + ], + input=json.dumps(existing_record, ensure_ascii=False) + ) + assert result.exit_code == 0 + print_calls = [] + + # Try to post the modified record with `--only-if-modifying` + result = runner.invoke( + cli, + [ + '--token=token-curator', + 'post-records', + '--curated', + '--only-if-modifying', + f'http://127.0.0.1:{port}', 'collection_1', '*', + ], + input=json.dumps(new_record, ensure_ascii=False) + ) + assert result.exit_code == 0 + assert 'skipping writing of record [green]test:post_records_if_changes[/green] because a matching record already exists' not in print_calls + print_calls = [] + + # Try to post the new record with `--only-if-modifying`, ignoring + # `annotations`. + result = runner.invoke( + cli, + [ + '--token=token-curator', + 'post-records', + '--curated', + '--only-if-modifying', + '--jsonpath-spec', 'annotations', + f'http://127.0.0.1:{port}', 'collection_1', '*', + ], + input=json.dumps(new_record, ensure_ascii=False) + ) + assert result.exit_code == 0 + assert 'skipping writing of record [green]test:post_records_if_changes[/green] because a matching record already exists' in print_calls -- 2.52.0 From 23714d4f8f6ce55574b2125dc6e7ad2099a287d2 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Tue, 31 Mar 2026 17:05:16 +0200 Subject: [PATCH 7/7] add tests for --only-if-modifying in auto-curate --- .../commands/dtc_plugins/auto_curate.py | 2 +- .../tests/test_auto_curate.py | 98 +++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) diff --git a/dump_things_pyclient/commands/dtc_plugins/auto_curate.py b/dump_things_pyclient/commands/dtc_plugins/auto_curate.py index 93fd812..620cf12 100644 --- a/dump_things_pyclient/commands/dtc_plugins/auto_curate.py +++ b/dump_things_pyclient/commands/dtc_plugins/auto_curate.py @@ -147,7 +147,7 @@ stl_info = False 'in 
the destination, or if the posted record modifies the existing ' 'record (to ignore certain record entries, for example, `annotations`, ' 'when comparing records, use the option --jsonpath-spec). NOTE: even if ' - 'a record is no posted, it will be removed from its inbox (unless ' + 'a record is not posted, it will be removed from its inbox (unless ' '`--keep-inboxes` is provided).', default=False, is_flag=True, diff --git a/dump_things_pyclient/tests/test_auto_curate.py b/dump_things_pyclient/tests/test_auto_curate.py index 6615586..86a849a 100644 --- a/dump_things_pyclient/tests/test_auto_curate.py +++ b/dump_things_pyclient/tests/test_auto_curate.py @@ -402,3 +402,101 @@ def test_keep_inboxes(dump_things_service, tmp_path_factory, create_changeset): ] for record in unique_records.values(): assert record in cleaned_incoming_records + + +def test_auto_curate_if_changes(dump_things_service, monkeypatch): + from dump_things_pyclient.commands.dtc_plugins.auto_curate import console + + port, store = dump_things_service + + print_calls = [] + monkeypatch.setattr( + console, + 'print', + lambda *args: print_calls.extend(args), + ) + + existing_record = { + "pid": f"test:auto_curate_if_changes", + "given_name": f"markus", + 'schema_type': 'test:Person', + 'annotations': { + 'https://submitter.example.com': 'submitter_1', + 'https://counter.example.com': '1', + }, + } + + new_record = { + k: v for k, v in existing_record.items() if k not in ('annotations',) + } + new_record['annotations'] = { + 'https://submitter.example.com': 'submitter_2', + 'https://counter.example.com': '2', + } + + runner = CliRunner() + + # Post the existing record directly into the curated area + result = runner.invoke( + cli, + [ + '--token=token-curator', + 'post-records', + '--curated', + f'http://127.0.0.1:{port}', 'collection_1', '*', + ], + input=json.dumps(existing_record, ensure_ascii=False) + ) + assert result.exit_code == 0 + print_calls = [] + + # Post the new record to the users inbox + 
result = runner.invoke( + cli, + [ + '--token=user_1', + 'post-records', + f'http://127.0.0.1:{port}', 'collection_1', '*', + ], + input=json.dumps(new_record, ensure_ascii=False) + ) + assert result.exit_code == 0 + print_calls = [] + + # Try to auto-curate the new record with `--only-if-modifying`, ignoring + # `annotations`. This should not post the record to the curated area, but + # emit a message that the record was not posted + result = runner.invoke( + cli, + [ + '--token=token-curator', + 'auto-curate', + '--include', 'test_user_1', + '--only-if-modifying', + '--jsonpath-spec', 'annotations', + '--keep-inboxes', + f'http://127.0.0.1:{port}', 'collection_1', + ], + input=json.dumps(new_record, ensure_ascii=False) + ) + assert result.exit_code == 0 + assert 'skipping writing of record [green]test:auto_curate_if_changes[/green] because a matching record already exists' in print_calls + print_calls = [] + + # Auto-curate the new record with `--only-if-modifying` only. This should + # post the record to curated, because its annotations differ from the + # existing record. + result = runner.invoke( + cli, + [ + '--token=token-curator', + 'auto-curate', + '--include', 'test_user_1', + '--only-if-modifying', + '--keep-inboxes', + f'http://127.0.0.1:{port}', 'collection_1', + ], + input=json.dumps(new_record, ensure_ascii=False) + ) + assert result.exit_code == 0 + assert 'skipping writing of record [green]test:auto_curate_if_changes[/green] because a matching record already exists' not in print_calls -- 2.52.0