knowledge-enrichment/.forgejo/tools/scrape-calendar.py
2026-03-09 08:34:57 +01:00

264 lines
No EOL
10 KiB
Python

# /// script
# requires-python = ">=3.12"
# dependencies = [
# "dump-things-pyclient @ https://hub.psychoinformatics.de/datalink/dump-things-pyclient.git",
# "icalendar",
# "rich",
# "rich-click",
# ]
# ///
from icalendar import Calendar
from os import environ
import urllib.request
import rich_click as click
from dump_things_pyclient.communicate import (
collection_write_record,
collection_read_record_with_pid,
)
# example target format:
# https://hedgedoc.psychoinformatics.de/3cSouq0YSJ6m64_ArWpJEg?edit
# TODO:
# - create merit-based award in psyinf pool, add pid here
# - create record for this script in pool, add pid here
# Calendar feeds to scrape, keyed by the kind of competition they list.
# All feeds live under the same mailbox path and differ only in their id.
_OWA_CALENDAR_ROOT = (
    'https://webmail.fz-juelich.de/owa/calendar/'
    'd61ec0ce8d704cb293df97fbb3c8fe23@fz-juelich.de'
)
urls = {
    feed_name: f'{_OWA_CALENDAR_ROOT}/{feed_id}/calendar.ics'
    for feed_name, feed_id in (
        ('juniorgroup', '2e3e5a7baff44a3780cb2f872bef77b07381212461382934376'),
        ('funding', 'a7ad1dee32cf49749872136f3a9223191125633822045414026'),
        ('award', '2f6c8a7e45e5416f89c22258bba6ae0114099503221118509202'),
    )
}
# PID of the competition type assigned to records from each calendar feed.
# The 'funding' and 'juniorgroup' feeds share one type.
_FUNDING_TYPE_PID = "xyzrins:competition-types/4e49ac7d-d6da-4131-806b-6425491e26fd"
_AWARD_TYPE_PID = "xyzrins:competition-types/16e38ddd-6323-4fc7-abad-398609bf8541"  # pid does not yet exist
competition_types = dict(
    funding=_FUNDING_TYPE_PID,
    award=_AWARD_TYPE_PID,
    juniorgroup=_FUNDING_TYPE_PID,
)
def get_calender(url: str, timeout: float = 30.0) -> "Calendar":
    """Download an iCalendar feed and parse it.

    Args:
        url: Location of the ``.ics`` resource to fetch.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block the whole run indefinitely.

    Returns:
        The object produced by ``Calendar.from_ical`` for the downloaded
        text.

    Raises:
        urllib.error.URLError: If the resource cannot be retrieved.
        UnicodeDecodeError: If the payload is not valid UTF-8.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        calsource = response.read().decode('utf-8')
    return Calendar.from_ical(calsource)
def create_competitions(pool: str = 'https://pool.psychoinformatics.de/api',
                        collection: str = 'public',
                        ) -> None:
    """Turn every event of the configured calendars into a competition record.

    For each feed in ``urls`` the calendar is downloaded, and each event is
    mapped to the PID ``xyzrins:competition/<UID>``. If a record with that
    PID is found it is used as the starting point, otherwise a fresh
    template is created. Whatever ``_assemble_record`` returns (unless it is
    ``None``) is handed to ``_upload_records``.

    Args:
        pool: Base URL of the service that is queried and written to.
        collection: Collection that is searched for already existing records.
    """
    # Function-scope import keeps this block self-contained.
    from copy import deepcopy

    for calendar_name, url in urls.items():
        cal = get_calender(url)
        competition_type = competition_types[calendar_name]
        for event in cal.events:
            pid = 'xyzrins:competition/' + event.get("UID").ical_value
            old_rec = _check_existing_record(pool, collection, pid)
            if old_rec:
                do_not_edit = _check_for_immutable_infos(old_rec)
                # Work on a copy: the assembly step mutates the template and
                # afterwards compares it with ``old_rec``. Sharing one object
                # would make that comparison always succeed, so changed
                # events would never be uploaded.
                tmpl = deepcopy(old_rec)
            else:
                print(f"New record: {pid}")
                tmpl = {'schema_type': 'xyzri:XYZCompetition'}
                do_not_edit = []
            record = _assemble_record(pid=pid,
                                      info=event,
                                      competition_type=competition_type,
                                      do_not_edit=do_not_edit,
                                      tmpl=tmpl,
                                      calurl=url,
                                      old_rec=old_rec)
            if record is not None:
                # NOTE(review): ``collection`` is only used for the lookup
                # above; it is not forwarded to the upload call.
                _upload_records(record, pool)
def _check_existing_record(pool: str,
                           collection: str,
                           pid: str) -> dict | None:
    """Look up the record stored under ``pid``.

    The request is authenticated with the ``DTC_TOKEN`` environment
    variable (a ``KeyError`` is raised if it is unset). The result of
    ``collection_read_record_with_pid`` is returned unchanged.
    """
    lookup = {
        'service_url': pool,
        'collection': collection,
        'pid': pid,
        'token': environ['DTC_TOKEN'],
    }
    return collection_read_record_with_pid(**lookup)
def _check_for_immutable_infos(rec: dict) -> list:
# get a list of all slots in the record. Treat those without an
# attribute about machine-generation as immutable
generated_infos = \
[dict['value'] for dict in rec['attributes'][0]['attributes'] \
if 'importedFrom' in rec['attributes'][0]['predicate']]
# don't touch keys if they don't have a machine-generated annotation
do_not_edit = [key for key in rec.keys() if key not in generated_infos]
return do_not_edit
def _extend_or_add(tmpl: dict,
key: str,
value: str | dict,
do_not_edit: list,
generated_infos: list,
generated: str | None = None,
is_list: bool = False
) -> (dict, list):
# If a key already exists in the template, update it. Otherwise,
# create it. However, do not touch the template if the key is not
# to be updated. Keep a record of machine-annotated items (generated) in
# generated_infos. Generated is a list of strings, which serve as
# flexible descriptions. E.g. "associated-with-funder"
if generated in do_not_edit:
return tmpl, generated_infos
if tmpl.get(key, None) is not None:
if type(tmpl[key]) == str:
# single value, just update
tmpl[key] = value
# there already is content, add to it
elif is_list:
tmpl[key].extend(value)
else:
tmpl[key].update(value)
else:
# key does not yet exist
if is_list:
tmpl[key] = [value]
else:
tmpl[key] = value
if generated is not None:
generated_infos.append(generated)
return tmpl, generated_infos
def _add_machine_prov(tmpl: dict,
generated_infos: list,
calurl: str,
scriptpid: str = "xyzrins:instruments/54be0232-d05e-4941-bfba-70716cfd6b05", # pid does not yet exist
) -> dict:
prov = {'attributes': [
{'predicate': 'http://purl.org/pav/importedFrom',
'value': calurl,
'attributes': []
}]}
for value in generated_infos:
new = {'predicate': 'prov:generated',
'value': value,
'characterized_by': [{
'predicate': 'prov:generated_by',
'object': scriptpid
}]}
prov['attributes'][0]['attributes'].append(new)
tmpl.update(prov)
return tmpl
def _assemble_record(pid: str,
                     info: dict,
                     competition_type: str,
                     do_not_edit: list,
                     tmpl: dict,
                     calurl: str,
                     old_rec: dict | None = None,
                     ) -> dict | None:
    """Write the information of one calendar event into a record template.

    ``pid`` and ``kind`` are always set. The deadline (event ``DTSTART``),
    title (``SUMMARY``) and description (``DESCRIPTION``) are written through
    ``_extend_or_add``, which honours ``do_not_edit``. When something may
    have changed, the machine-provenance annotation is (re)written via
    ``_add_machine_prov``.

    Args:
        pid: PID of the record.
        info: The calendar event.
        competition_type: PID stored in the record's ``kind`` slot.
        do_not_edit: Descriptions of items that must not be overwritten.
        tmpl: Starting point for the record. It is copied, never modified.
        calurl: URL of the calendar the event came from.
        old_rec: The already stored record, if any.

    Returns:
        The assembled record, or ``None`` if ``old_rec`` was given and the
        result is identical to it.
    """
    # Function-scope import keeps this block self-contained.
    from copy import deepcopy

    # Callers may pass the stored record itself as the template. Working on
    # a private copy keeps ``old_rec`` pristine, otherwise the change check
    # below would compare an object with itself and always report
    # "no change".
    tmpl = deepcopy(tmpl)

    # keep a list of machine-written infos
    generated_infos = []
    tmpl["pid"] = pid
    tmpl["kind"] = competition_type
    #funder, homepage = _find_funder(info)
    #if funder:
    #    assoc = {'object': f"{funder}", #funder would need to be a valid object
    #             'roles': ['marcrel:fnd']}
    #    tmpl, generated_infos = _extend_or_add(tmpl,
    #                                           "associated_with",
    #                                           assoc,
    #                                           do_not_edit,
    #                                           generated_infos,
    #                                           'associated-with-funder',
    #                                           is_list=True)
    #if homepage:
    #    homepage = {'predicate': 'foaf:homepage',
    #                'value': f'{homepage}'}
    #    tmpl, generated_infos = _extend_or_add(tmpl,
    #                                           "attributes",
    #                                           homepage,
    #                                           do_not_edit,
    #                                           generated_infos,
    #                                           'attribute-homepage',
    #                                           is_list=True)
    # ``.dt`` is the documented icalendar accessor for the decoded
    # date/datetime of a DTSTART property.
    tmpl, generated_infos = _extend_or_add(tmpl,
                                           "application_deadline",
                                           info.get("DTSTART").dt.isoformat(),
                                           do_not_edit,
                                           generated_infos,
                                           'deadline')
    tmpl, generated_infos = _extend_or_add(tmpl,
                                           'title',
                                           info.get("SUMMARY").ical_value,
                                           do_not_edit,
                                           generated_infos,
                                           'title')
    # Events without a description simply leave that slot untouched.
    description = info.get("DESCRIPTION")
    if description is not None:
        # description needs to be stripped from newlines to be valid
        desc = description.ical_value.replace('\n', ' ')
        # everything from 'Please note' onwards is dropped
        if 'Please note' in desc:
            desc = desc.split('Please note')[0]
        tmpl, generated_infos = _extend_or_add(tmpl,
                                               'description',
                                               desc,
                                               do_not_edit,
                                               generated_infos,
                                               'description')
    # if the record already existed, check if anything changed
    if old_rec is not None and old_rec == tmpl:
        print(f"No change for existing record with PID {pid}.")
        return None
    return _add_machine_prov(tmpl, generated_infos, calurl)
def _find_funder(info: dict) -> (str, str):
funder = None
homepage = None
for line in info.get("DESCRIPTION").splitlines():
if line.startswith("Information Funding Organization: "):
# strip "Information Funding Organization: ", split url from name
funder, homepage = line[34:].split('<')
homepage = homepage.strip('>')
return funder, homepage
def _upload_records(record: dict,
                    pool: str = 'https://pool.psychoinformatics.de/api',
                    collection: str = 'public',
                    ) -> None:
    """Submit one ``XYZCompetition`` record as JSON.

    The request is authenticated with the ``DTC_TOKEN`` environment
    variable (a ``KeyError`` is raised if it is unset).

    Args:
        record: The record to write; its ``'pid'`` is echoed to stdout.
        pool: Base URL of the service to write to.
        collection: Collection to write into. Defaults to ``'public'``, the
            value that used to be hard-coded here.
    """
    print(f"submitting record with pid {record['pid']}...")
    collection_write_record(
        service_url=pool,
        collection=collection,
        class_name='XYZCompetition',
        record=record,
        format='json',
        token=environ['DTC_TOKEN']
    )
@click.command()
@click.option('--dtc-api-url', '-a',
              default='https://pool.psychoinformatics.de/api',
              show_default=True,
              help='Base URL of the knowledge pool API.')
@click.option('--dtc-collection', '-c',
              default='public',
              show_default=True,
              help='Collection of the knowledge pool to work on.')
def main(
    dtc_api_url: str = 'https://pool.psychoinformatics.de/api',
    dtc_collection: str = 'public',
) -> None:
    """
    Scrape the outlook funding calendars of the FZJ and write events as
    Competition records into the knowledge pool at --dtc-api-url, into the
    collection determined by --dtc-collection.

    An access token is required in the environment variable DTC_TOKEN.
    """
    # An unset or empty token cannot authenticate anything. Raising (instead
    # of printing and returning) makes the command exit with a non-zero
    # status, so automation can tell that nothing was done.
    if not environ.get('DTC_TOKEN'):
        raise click.ClickException(
            "DTC_TOKEN required in environment! Aborting.")
    create_competitions(pool=dtc_api_url,
                        collection=dtc_collection)
# Run the command line interface only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    main()