modify headerid to also work on HTML files #59

Merged
o-druska merged 3 commits from html_headerid into master 2024-05-13 09:11:59 +00:00
3 changed files with 194 additions and 25 deletions

View file

@ -5,12 +5,28 @@ This plugin adds an anchor to each heading so you can deep-link to headers.
It is intended for formats such as reStructuredText that do not natively
generate these anchors.
The ``HEADERID_LINK_CHAR`` config can be set to use a different char from ``*``
for anchor text.
For Markdown, this plugin is less relevant since the Python-Markdown library
includes a Table of Contents extension that will generate link anchors.
To enable the ``toc`` extension, add a line similar to the following example
to your Pelican settings file::
MD_EXTENSIONS = ["codehilite(css_class=highlight)", "extra", "toc"]
Parameters
----------
Set parameters in ``pelicanconf.py`` config file.
HEADERID_LINK_CHAR: str
The ``HEADERID_LINK_CHAR`` config can be set to use a different char from ``*``
for anchor text.
GENERATE_IDS: bool
If ``True``, the plugin creates an ``id`` for every heading
that has neither an ``id`` nor a ``name`` attribute.
The generated ``id`` is the heading text with all non-word characters removed; link anchors use it to reference the heading.
SUBTREE: str
Restrict processing to a subtree of the page.
Set this to the value of an ``id`` attribute.
Heading lookup then starts at the first tag with a matching ``id``
instead of at ``<html>``; if no tag matches, no anchors are added.

View file

@ -1,31 +1,180 @@
from pelican import readers
from pelican.readers import PelicanHTMLTranslator
from pelican import signals
from docutils import nodes
from bs4 import BeautifulSoup as bs
import re
import copy
# parameter defaults
# Content placed inside each permalink anchor; init_headerid replaces it
# when the HEADERID_LINK_CHAR setting is present.
LINK_CHAR = '*'
# Whether _get_target may create an id for headings that have neither
# an id nor a name attribute.
GENERATE_IDS = False
# id of the element whose subtree is searched for headings;
# a falsy value means the whole document is searched.
SUBTREE = None
def init_headerid(sender):
    """Load the plugin parameters from the Pelican settings.

    Is registered as callback function to the `initialized` signal.

    Parameters
    ----------
    sender: pelican.Pelican
        Pelican object whose ``settings`` mapping holds the values read
        from `pelicanconf.py`.

    Returns
    -------
    None

    Notes
    -----
    Rebinds the module-level `GENERATE_IDS` and `SUBTREE` on every call and
    `LINK_CHAR` only when ``HEADERID_LINK_CHAR`` is set to a truthy value.
    """
    global LINK_CHAR
    global GENERATE_IDS
    global SUBTREE

    char = sender.settings.get('HEADERID_LINK_CHAR')
    if char:
        # Parse the setting as HTML so that tags contained in it can be
        # inserted into the anchor. (The previous plain-string assignment
        # was overwritten immediately and has been dropped.)
        LINK_CHAR = bs(char, 'html.parser')

    GENERATE_IDS = bool(sender.settings.get('GENERATE_IDS'))
    SUBTREE = sender.settings.get('SUBTREE')
def headerid_main(page_generator, content):
    """Entry point of the `headerid` plugin for a single page.

    Takes the HTML Pelican generated for the page, lets
    `_add_link_anchors` add a link anchor to every heading tag that has
    an identifier, and stores the result back on the page.

    Is registered as callback function to the
    `page_generator_write_page` signal.

    Parameters
    ----------
    page_generator: pelican.generators.PagesGenerator
        Not used by the current implementation;
        only present because the callback signature requires it.
    content: pelican.contents.Page
        Pelican Page object whose ``_content`` HTML string is rewritten.

    Returns
    -------
    None
    """
    generated_html = content._content
    content._content = _add_link_anchors(generated_html)
def _add_link_anchors(raw_html: str) -> str:
    r"""Append a permalink anchor to every heading that can be referenced.

    The link anchor (1) is appended to every heading tag (2) for which
    `_get_target` yields an identifier and which does not already contain
    an in-page link (see `_has_anchor`).

    - (1) <a class="headerlink" href="#{anchor_target}"
             title="Permalink to this headline">
              {LINK_CHAR}
          </a>
    - (2) ``h1`` to ``h6``, i.e. tag names that match `^h[1-6]$`
          (Python RegEx)

    HTML parsing and tree modifications are done via `BeautifulSoup`.

    Parameters
    ----------
    raw_html: str
        Python string representing DOM tree

    Returns
    -------
    str
        Input DOM tree with the added link anchors (1)

    Notes
    -----
    Makes use of global `SUBTREE` and `LINK_CHAR` variables.
    If `SUBTREE` is set but no tag with that ``id`` exists,
    no anchors are added.
    See `README` for further information.
    """
    # Anchored on both ends so that only real heading tags are selected;
    # the former pattern `^h*\d` also accepted names such as `hh1` or `h7`.
    heading_name = re.compile(r'^h[1-6]$')

    soup = bs(raw_html, 'html.parser')

    # Root of the search: the whole document or the configured subtree.
    root = soup.find(id=SUBTREE) if SUBTREE else soup
    headers = root.find_all(heading_name) if root is not None else []

    # add in link anchor
    for tag in headers:
        if _has_anchor(str(tag)):
            continue

        anchor_target = _get_target(tag)
        if anchor_target is None:
            continue

        anchor = soup.new_tag("a", attrs={"href": f"#{anchor_target}",
                                          "title": "Permalink to this headline",
                                          "class": "headerlink"})
        # LINK_CHAR may be a parsed HTML fragment; insert a copy so the
        # shared module-level object is not moved into this heading.
        anchor.append(copy.deepcopy(LINK_CHAR))
        tag.append(anchor)

    return str(soup)
def _has_anchor(inp: str) -> bool:
    """
    Take in an HTML string and check for an anchor-tag
    with href-Attribute starting with `#`.

    Anchor-tags without an href-Attribute (e.g. ``<a name="x">``)
    are ignored instead of raising ``KeyError``.

    Parameters
    ----------
    inp: str
        `inp` is interpreted as HTML and parsed via `BeautifulSoup`

    Returns
    -------
    bool
        ``True`` if at least one ``<a>`` has an ``href`` starting with `#`.
    """
    soup = bs(inp, 'html.parser')
    # `.get` with a default keeps href-less anchors from aborting the run.
    return any(link.get('href', '').startswith('#')
               for link in soup.find_all('a'))
def _get_target(tag) -> "str | None":
    """Get identifier for given HTML tag

    Take a tag and extract something an anchor-tag can reference
    like ID or name.
    If neither present && GENERATE_IDS flag,
    create and set ID based on tag contents.

    Parameters
    ----------
    tag: bs4.element.Tag
        HTML tag to get target for

    Returns
    -------
    str or None
        Referenceable ID or name.
        ``None`` if the tag has neither and no ID could be generated
        (flag not set, or the tag text contains no word characters).

    Notes
    -----
    Makes use of the global `GENERATE_IDS` flag.
    When an ID is generated it is also written to the tag's ``id``
    attribute.
    See `README` for further information.
    """
    existing = tag.attrs.get('id') or tag.attrs.get('name')
    if existing:
        return existing

    if not GENERATE_IDS:
        return None

    # Strip every run of non-word characters from the heading text.
    generated = re.sub(r'\W+', '', tag.text)
    if not generated:
        # An empty id would yield a meaningless `href="#"`; skip the tag.
        return None

    tag['id'] = generated
    return generated
def register():
# Wires the plugin into Pelican: connects the signal callbacks and patches
# the HTML translator used for docutils output.
# NOTE(review): indentation is lost in this view, and the listing may mix
# removed and added diff lines; presumably everything below is nested
# inside register() -- confirm against the real file.
# Connect CallBack function to signal.
# Function gets executed when pelican issues the signal.
signals.initialized.connect(init_headerid)
# Translator subclass that writes a permalink anchor before a section
# title is closed.
class HeaderIDPatchedPelicanHTMLTranslator(PelicanHTMLTranslator):
def depart_title(self, node):
# Closing tag that is pending for this title.
close_tag = self.context[-1]
parent = node.parent
# Only titles whose parent is a section with at least one id get an anchor.
if isinstance(parent, nodes.section) and parent.hasattr('ids') and parent['ids']:
anchor_name = parent['ids'][0]
# add permalink anchor
# Restricted to titles whose closing tag starts with `</h`.
if close_tag.startswith('</h'):
self.body.append(
'<a class="headerlink" href="#%s" title="Permalink to this headline">%s</a>' %
(anchor_name, LINK_CHAR))
# Hand over to the stock implementation in every case.
PelicanHTMLTranslator.depart_title(self, node)
# Replace the translator in pelican.readers with the patched subclass.
readers.PelicanHTMLTranslator = HeaderIDPatchedPelicanHTMLTranslator
# Run headerid_main whenever the page_generator_write_page signal is sent.
signals.page_generator_write_page.connect(headerid_main)

View file

@ -18,7 +18,7 @@ LOCALE = 'en_US.UTF-8'
#
PATH = 'content'
PLUGIN_PATHS = ['pelican-plugins']
PLUGINS = ['tipue_search', 'sitemap']
PLUGINS = ['tipue_search', 'sitemap', 'headerid']
SITEMAP = { 'format': 'xml' }
THEME = 'theme'
@ -41,8 +41,6 @@ EXTRA_PATH_METADATA = {
"static/site.webmanifest": {'path': ''},
}
#HEADERID_LINK_CHAR = '<i class="icon-link"></i>'
FEED_ALL_ATOM = None
AUTHOR_SAVE_AS = False
@ -55,3 +53,9 @@ MENUITEMS = ( ('About', 'about.html'),
('Explore', 'explore.html'),
('Publications', 'publications.html'),
)
#
# headerid parameters
#
# Content of the permalink anchor the headerid plugin appends to headings.
HEADERID_LINK_CHAR = ' #'
# Let the plugin create an id for headings that have neither id nor name.
GENERATE_IDS = True