Add enrich-via-doi #2
3 changed files with 503 additions and 0 deletions
436
.forgejo/tools/enrich-via-doi.py
Normal file
@ -0,0 +1,436 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "bidict",
#     "rich-click",
#     "lxml",
#     "requests_cache",
# ]
# ///

import json
import re
from urllib.parse import urljoin

from bidict import bidict
from lxml import html
from requests_cache import CachedSession
import rich_click as click

def consult_rules(license_uri: str, rules: list[dict]) -> str | None:
    """Match a license URI against Rule records

    This function tries to match the given license URI against PIDs
    (expanded from CURIE to URI using hardcoded prefixes) or exact
    mappings.

    """

    pmap = {
        "obo": "http://purl.obolibrary.org/obo/",
        "spdxlic": "https://spdx.org/licenses/",
    }

    for rule in rules:
        # this assumes PIDs need expansion but exact mappings do not;
        # trailing "/" is ignored in exact mappings
        identifiers = [
            expand_curie(rule["pid"], pmap),
            *[x.rstrip("/") for x in rule.get("exact_mappings", [])],
        ]
        if license_uri.rstrip("/") in identifiers:
            return rule["pid"]
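
# A doctest-style sketch of the matching (hypothetical Rule record; the
# pmap above expands "spdxlic" to https://spdx.org/licenses/):
#
#   >>> consult_rules("https://spdx.org/licenses/CC-BY-4.0/",
#   ...               [{"pid": "spdxlic:CC-BY-4.0"}])
#   'spdxlic:CC-BY-4.0'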

def csl_abstract(d: dict) -> str | None:
    """Get abstract from CSL

    Some abstracts seen in the wild are marked up with JATS tags, and
    the top level may include (a combination of) sections, titles, and
    paragraphs (usually, a section itself contains a title and one
    paragraph). We can use the paragraphs and mix in the section
    titles. Otherwise, remove all tags (return text content).

    """
    if abstract := d.get("abstract", False):
        h = html.fromstring(abstract)
        if {x.tag for x in h} <= {"jats:p", "jats:title", "jats:sec"}:
            return jats2md(h)
        else:
            return h.text_content()
    else:
        return None


def csl_license(d: dict) -> list:
    """Get license from DOI content-negotiation JSON"""
    license_urls = []
    for license in d.get("license", []):
        if license["content-version"] == "vor":
            # "version of record"
            license_urls.append(license["URL"])
    # deduplicate before returning, just in case
    return list(set(license_urls))


def csl_publish_date(d: dict, allow_incomplete: bool = True) -> str | None:
    """Get one publication date out of CSL"""
    if "issued" in d:
        date = d["issued"]["date-parts"]
    elif "published-online" in d:
        date = d["published-online"]["date-parts"]
    else:
        return None

    # partial date, a nested array of numbers
    if len(date[0]) == 1 or (len(date[0]) < 3 and not allow_incomplete):
        isodate = f"{date[0][0]}"  # yyyy (only the year is required)
    elif len(date[0]) == 2:
        isodate = f"{date[0][0]}-{date[0][1]:02}"  # yyyy-mm
    else:
        isodate = f"{date[0][0]}-{date[0][1]:02}-{date[0][2]:02}"  # yyyy-mm-dd

    return isodate
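
# For example (date-parts is a nested array; an incomplete yyyy-mm date is
# kept by default, or truncated to the year with allow_incomplete=False):
#
#   >>> csl_publish_date({"issued": {"date-parts": [[2023, 4]]}})
#   '2023-04'
#   >>> csl_publish_date({"issued": {"date-parts": [[2023, 4]]}}, allow_incomplete=False)
#   '2023'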

def discover_authors(
    publication: dict, known_people: bidict[str, str], citeproc_record: dict
) -> list[dict]:
    """Find citeproc authors with known ORCIDs that are not yet attributed

    Returns attribution records (with a role derived from the author
    sequence) for every citeproc author whose ORCID belongs to a known
    person but is not yet among the publication's contributors.

    """
    missing_attributions = []

    # check which contributors with ORCIDs are already declared
    declared_contributor_orcids = set()
    for attribution in publication.get("attributed_to", []):
        if (orcid := known_people.get(attribution.get("object"))) is not None:
            declared_contributor_orcids.add(orcid)

    # compare to contributors with ORCIDs in the citeproc record
    for author in citeproc_record.get("author", []):
        if (
            (orcid := author.get("ORCID")) is not None
            and orcid in known_people.values()
            and orcid not in declared_contributor_orcids
        ):
            if author.get("sequence") == "first":
                r = "obo:MS_1002034"  # first author
            elif author.get("sequence") == "additional":
                r = "obo:MS_1002036"  # co-author
            else:
                r = "marcrel:aut"
            missing_attributions.append(
                {"object": known_people.inverse[orcid], "roles": [r]}
            )

    return missing_attributions


def expand_curie(curie: str, pmap: dict[str, str]) -> str:
    """Expand CURIE to URI using a prefix map

    If there is no prefix, or the prefix is not defined in the prefix
    map, returns the input value. This is a simple helper; for more
    complex use cases, consider using the external curies package.

    """
    pat = re.compile(r"(?P<prefix>\w+):(?P<reference>.*)")
    if (m := re.match(pat, curie)) is not None and m["prefix"] in pmap:
        return pmap[m["prefix"]] + m["reference"]
    return curie
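
# For instance (the second call falls through because "https" is not a
# known prefix, so the value is returned unchanged):
#
#   >>> expand_curie("spdxlic:CC-BY-4.0", {"spdxlic": "https://spdx.org/licenses/"})
#   'https://spdx.org/licenses/CC-BY-4.0'
#   >>> expand_curie("https://example.org/x", {"spdxlic": "https://spdx.org/licenses/"})
#   'https://example.org/x'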

def jats2md(span: html.HtmlElement, rstrip: bool = True) -> str:
    """Flatten a JATS-tagged abstract into Markdown-ish plain text"""
    full_text = ""
    for elem in span:
        if elem.tag == "jats:title":
            # a title may lack direct text, so guard against None
            if (elem.text or "").lower() != "abstract":
                # we know an abstract is an abstract
                full_text += elem.text_content()
                full_text += ": " if not elem.text_content().endswith(".") else " "
        elif elem.tag == "jats:p":
            this_text = elem.text_content()
            for sub in elem:
                if sub.tag == "jats:ext-link":
                    # wrap at least plain links for unambiguous parsing by hugo
                    if (href := sub.get("xlink:href")) == sub.text_content():
                        this_text = this_text.replace(href, f"<{href}>")
            full_text += this_text
            full_text += "\n\n"
        elif elem.tag == "jats:sec":
            full_text += jats2md(elem, rstrip=False)
        else:
            full_text += elem.text_content()
    return full_text.rstrip() if rstrip else full_text


def pid_of(x: str | dict) -> str:
    """Return a PID of an object, inlined or not

    A shortcut that makes a PID string and an inlined dict (where the
    PID is a property) equivalent. Does not do further validation, but
    it could be added here.

    """
    return x.get("pid", "") if isinstance(x, dict) else x
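
# E.g. both forms yield the same PID:
#
#   >>> pid_of("ror:01fyxcz70")
#   'ror:01fyxcz70'
#   >>> pid_of({"pid": "ror:01fyxcz70"})
#   'ror:01fyxcz70'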

def process_doi(paper: dict) -> str | None:
    """Return a DOI from identifiers"""

    for identifier in paper.get("identifiers", []):
        if (
            pid_of(identifier.get("creator")) == "ror:01fyxcz70"
            or identifier.get("schema_type") == "dlthings:DOI"
        ):
            return identifier.get("notation")


def process_orcid(person: dict) -> str | None:
    """Return an ORCID from identifiers"""

    for identifier in person.get("identifiers", []):
        if pid_of(identifier.get("creator")) == "ror:04fa4r544":
            return identifier.get("notation")


def publishing_process(d: dict) -> dict[str, str] | None:
    """Build a publishing activity (date / ISSN) from CSL metadata"""
    res = {"object": "obo:IAO_0000444"}
    has_detail = False

    if (pubdate := csl_publish_date(d)) is not None:
        has_detail = True
        res["at_time"] = pubdate

    if (issn := d.get("ISSN")) is not None:
        has_detail = True
        # there can be more than one (e.g. different for print / online);
        # if that's the case, use the first - we have no more data at hand
        res["at_location"] = f"ISSN:{issn[0]}"

    return res if has_detail else None


def query_doi_citation(session: CachedSession, doi: str) -> str | None:
    """Fetch a formatted (APA) citation via doi.org content negotiation"""
    doi_url = urljoin("https://doi.org/", doi)
    r = session.get(doi_url, headers={"Accept": "text/x-bibliography; style=apa"})
    if r.ok and (r.encoding != r.apparent_encoding == "utf-8"):
        # if it appears like utf-8, it likely is utf-8
        # see https://stackoverflow.com/questions/44203397/
        r.encoding = r.apparent_encoding
    return r.text if r.ok else None


def query_doi_csl(session: CachedSession, doi: str) -> dict | None:
    """Fetch citeproc (CSL) JSON metadata via doi.org content negotiation"""
    doi_url = urljoin("https://doi.org/", doi)
    r = session.get(
        doi_url, headers={"Accept": "application/vnd.citationstyles.csl+json"}
    )
    return r.json() if r.ok else None


def remap_person_records(records: list[dict]) -> bidict[str, str]:
    """Create a bidirectional mapping of PIDs and ORCIDs"""
    my_map = bidict()
    for record in records:
        if (orcid := process_orcid(record)) is not None:
            my_map[record["pid"]] = f"https://orcid.org/{orcid}"
    return my_map
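
# A minimal example (the identifier layout follows process_orcid above;
# the person record itself is hypothetical):
#
#   >>> remap_person_records([{"pid": "ex:jdoe", "identifiers": [
#   ...     {"creator": "ror:04fa4r544", "notation": "0000-0002-1825-0097"}]}])
#   bidict({'ex:jdoe': 'https://orcid.org/0000-0002-1825-0097'})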

def rules_from_citeproc(citeproc_record: dict, known_rules: list[dict]) -> list[str]:
    """Translate license URLs in a citeproc record into Rule PIDs"""
    res = []
    for url in csl_license(citeproc_record):
        if (license_pid := consult_rules(url, known_rules)) is not None:
            res.append(license_pid)
    return sorted(res)


def short_name_from_citeproc(d: dict) -> str | None:
    """Generate file name based on citeproc data

    Combines the last name of the first author, the (short) container
    title, and the date to form something that is human-readable and
    likely unique enough.

    The required properties are usually present, but they are not
    mandatory, so we proceed only if we find all three.

    """

    if not (
        "author" in d
        and ("container-title-short" in d or "container-title" in d)
        and "issued" in d
    ):
        return None

    # first author (et al)
    author = d["author"]
    if len(author) == 1:
        # family is required (at least in crossref) - define a default to be safe
        author_part = author[0].get("family", "unknown")
    else:
        author_part = author[0].get("family", "unknown") + "_etal"

    # journal title (abbreviated)
    if container := d.get("container-title-short", False):
        journal_part = container.replace(" ", "_")
    elif ((container := d.get("container-title")) is not None) and container != []:
        # todo: iso4?
        journal_part = container.replace(" ", "_")
    else:
        # none of those are mandatory
        journal_part = d.get("group-title", "")
        institution = d.get("institution", [{}])[0].get("name")
        if institution == "bioRxiv":
            # "biorxiv-neuroscience" over "neuroscience"
            journal_part = institution + "-" + journal_part
        if journal_part == "":
            journal_part = "unknown"
    journal_part = re.sub(r"[^\w]", "", journal_part)  # keep word characters

    date_part = csl_publish_date(d).replace("-", "_")  # pyright:ignore

    return "_".join((author_part, journal_part, date_part)) + ".md"

@click.command()
@click.argument("input", type=click.File("rb"))
@click.argument("output", type=click.File("wt"))
@click.option(
    "--persons",
    type=click.File("rb"),
    help="Person records to discover authors (JSON lines).",
)
@click.option(
    "--rules",
    type=click.File("rb"),
    help="Rule records (JSON lines) to match licenses.",
)
@click.option(
    "--extras",
    is_flag=True,
    help="Add non-schema-compliant properties (starting with x_).",
)
def main(input, output, persons, rules, extras):
    """Enrich records with metadata fetched via doi.org

    Reads publication records from INPUT and writes enriched records
    to OUTPUT. INPUT and OUTPUT should be in JSON lines format, and
    can be files or stdin / stdout (-).

    With --persons, authors in the retrieved metadata will be
    cross-referenced with the provided Person records based on ORCID,
    and added to contributors (requires the ORCID to be present in
    both sources). With --rules, licenses will be translated by
    checking PIDs and exact mappings of the provided Rule records.
    Both options accept JSON lines files or stdin (-).

    Only properties which are missing are updated (the date is the
    exception: it is updated if a more precise one is available).

    If --extras is specified, the produced record will contain
    properties which are not compatible with the research information
    schema, but can be useful for page generators (x_citation and
    x_suggested_name).

    Makes requests to doi.org (content negotiation) to fetch metadata
    (and, with --extras, also a formatted citation). Uses caching to
    store requests in `$PWD/.cache` (valid for 2 hours).

    """

    session = CachedSession(
        ".cache/requests-cache/http_cache",
        backend="sqlite",
        match_headers=["Accept"],
        expire_after=7200,
    )

    all_people = [json.loads(line) for line in persons] if persons is not None else []
    all_rules = [json.loads(line) for line in rules] if rules is not None else []
    pid_orcid_map = remap_person_records(all_people)

    for line in input:
        paper = json.loads(line)
        doi = process_doi(paper)
        citeproc_metadata = query_doi_csl(session, doi) if doi is not None else None
        citation_text = (
            query_doi_citation(session, doi) if doi is not None and extras else None
        )

        if citation_text is not None:
            paper["x_citation"] = citation_text

        if citeproc_metadata is None:
            # nothing to do, emit unchanged
            click.echo(json.dumps(paper), output)
            continue

        # contributors
        more_attributions = discover_authors(paper, pid_orcid_map, citeproc_metadata)
        if len(more_attributions) > 0:
            if "attributed_to" not in paper:
                paper["attributed_to"] = more_attributions
            else:
                paper["attributed_to"].extend(more_attributions)

        # publishing activity (date / ISSN)
        citeproc_pp = publishing_process(citeproc_metadata)
        activities = paper.get("generated_by", [])

        # find the publishing process in the publication
        pp_idx = None
        for i, activity in enumerate(activities):
            if pid_of(activity.get("object")) == "obo:IAO_0000444":  # publishing process
                pp_idx = i
                break

        # update publishing activity (date & ISSN)
        if citeproc_pp is not None:
            if "generated_by" not in paper:
                # no activities so far: add a list
                paper["generated_by"] = [citeproc_pp]
            elif pp_idx is None:
                # activities but no publishing process: append
                paper["generated_by"].append(citeproc_pp)
            else:
                # activities incl. publishing process: merge, keeping original values
                paper["generated_by"][pp_idx] = (
                    citeproc_pp | paper["generated_by"][pp_idx]
                )
                # override the date if it is more precise in citeproc
                if len(citeproc_pp.get("at_time", "").split("-")) > len(
                    paper["generated_by"][pp_idx].get("at_time", "").split("-")
                ):
                    paper["generated_by"][pp_idx]["at_time"] = citeproc_pp["at_time"]

        # title
        if paper.get("title") is None and citeproc_metadata.get("title") is not None:
            paper["title"] = citeproc_metadata.get("title")

        # abstract
        if (
            paper.get("description") is None
            and (citeproc_abstract := csl_abstract(citeproc_metadata)) is not None
        ):
            paper["description"] = citeproc_abstract

        # rules (licenses)
        if paper.get("rules") is None:
            citeproc_rules = rules_from_citeproc(citeproc_metadata, all_rules)
            if len(citeproc_rules) > 0:
                paper["rules"] = citeproc_rules

        # suggested output file name
        if extras and (sn := short_name_from_citeproc(citeproc_metadata)) is not None:
            paper["x_suggested_name"] = sn

        click.echo(json.dumps(paper), output)


if __name__ == "__main__":
    main()
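
# A hypothetical local invocation (uv resolves the inline script
# dependencies declared at the top; the file names are placeholders):
#
#   uv run enrich-via-doi.py --persons Person.jsonl --rules Rule.jsonl \
#       publications.jsonl enriched.jsonl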
64
.forgejo/workflows/enrich_publications.yml
Normal file
@ -0,0 +1,64 @@
name: Enrich publications via doi.org

on:
  workflow_dispatch:
    inputs:
      pids:
        description: "Limit to these PIDs (comma-separated)"
        required: false
        default: ''
        type: string
      inbox:
        description: "Limit to inbox with this label"
        required: false
        default: ''
        type: string

env:
  DTC_TOKEN: ${{ secrets.POOLTOKEN }}
  DUMPTHINGS_APIURL: https://pool.psychoinformatics.de/api
  DUMPTHINGS_COLLECTION: public
  PERSON_CLASS: XYZPerson
  PUBLICATION_CLASS: XYZPublication
  RULE_CLASS: Rule

jobs:
  enrich-publications:
    name: Enrich publications
    runs-on: debian-latest
    defaults:
      run:
        shell: bash
    steps:
      - name: Install uv
        uses: astral-sh/setup-uv@v6
      - name: Install metadata tools
        run: |
          uv tool install git+https://hub.psychoinformatics.de/orinoco/query-things.git \
            --with-executables-from dump-things-pyclient
      - name: Fetch script
        run: |
          wget https://hub.psychoinformatics.de/orinoco/knowledge-enrichment/raw/branch/main/.forgejo/tools/enrich-via-doi.py
      - name: Pre-fetch data
        run: |
          mkdir .cache
          dtc get-records $DUMPTHINGS_APIURL $DUMPTHINGS_COLLECTION --class $PERSON_CLASS > .cache/Person.jsonl
          dtc get-records $DUMPTHINGS_APIURL $DUMPTHINGS_COLLECTION --class $RULE_CLASS > .cache/Rule.jsonl
      - name: Process records
        run: |
          export INBOX_LABEL="${{ inputs.inbox }}"
          export PIDS="${{ inputs.pids }}"
          if [ -n "$PIDS" ]
          then
            IFS=',' read -ra PID_ARRAY <<< "$PIDS"
            for pid in "${PID_ARRAY[@]}"
            do
              dtc get-records $DUMPTHINGS_APIURL $DUMPTHINGS_COLLECTION --pid $pid ${INBOX_LABEL:+--incoming $INBOX_LABEL} |
                uv run enrich-via-doi.py --persons .cache/Person.jsonl --rules .cache/Rule.jsonl - - |
                dtc post-records $DUMPTHINGS_APIURL $DUMPTHINGS_COLLECTION $PUBLICATION_CLASS
            done
          else
            dtc get-records $DUMPTHINGS_APIURL $DUMPTHINGS_COLLECTION --class $PUBLICATION_CLASS ${INBOX_LABEL:+--incoming $INBOX_LABEL} |
              uv run enrich-via-doi.py --persons .cache/Person.jsonl --rules .cache/Rule.jsonl - - |
              dtc post-records $DUMPTHINGS_APIURL $DUMPTHINGS_COLLECTION $PUBLICATION_CLASS
          fi
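
# The enrichment step can also be exercised locally (a sketch, assuming dtc
# and uv are installed and records were pre-fetched as in the steps above):
#
#   dtc get-records $DUMPTHINGS_APIURL $DUMPTHINGS_COLLECTION --class XYZPublication |
#     uv run enrich-via-doi.py --persons .cache/Person.jsonl --rules .cache/Rule.jsonl - -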
@ -8,3 +8,6 @@ with additional machine-generated records.
- ``.forgejo/tools/scrape-calendar.py`` (used by
  ``.forgejo/workflows/scrape.yml``): scrapes three FZJ funding calendars and
  adds their events as XYZCompetition records (research information schema)
- ``.forgejo/tools/enrich-via-doi.py`` (used by
  ``.forgejo/workflows/enrich_publications.yml``): reads publication records and
  extends them with external metadata available via doi.org content negotiation