modify headerid to also work on HTML files #59

Merged
o-druska merged 3 commits from html_headerid into master 2024-05-13 09:11:59 +00:00
3 changed files with 194 additions and 25 deletions

View file

@ -5,12 +5,28 @@ This plugin adds an anchor to each heading so you can deep-link to headers.
It is intended for formats such as reStructuredText that do not natively
generate these anchors.
The ``HEADERID_LINK_CHAR`` config can be set to use a different char from ``*``
for anchor text.
For Markdown, this plugin is less relevant since the Python-Markdown library
includes a Table of Contents extension that will generate link anchors.
To enable the ``toc`` extension, add a line similar to the following example
to your Pelican settings file::
MD_EXTENSIONS = ["codehilite(css_class=highlight)", "extra", "toc"] MD_EXTENSIONS = ["codehilite(css_class=highlight)", "extra", "toc"]
Parameters
----------
Set parameters in ``pelicanconf.py`` config file.
HEADERID_LINK_CHAR: str
Text (or an HTML snippet) used as the link text of each anchor.
Defaults to ``*``.
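GENERATE_IDS: bool
If ``True``, the plugin creates an ``id`` for every heading
that has neither an ``id`` nor a ``name`` attribute,
so that the link anchor has a target to reference.
The generated ``id`` is the heading text with all non-word characters removed.
Defaults to ``False``.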
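SUBTREE: str
Restrict processing to a subtree of the page.
Set this to the ``id`` of an element: the first tag with a matching
``id`` attribute replaces ``<html>`` as the root of the search for headings.
If no tag matches, no anchors are added.
Defaults to ``None`` (the whole page is processed).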

View file

@ -1,31 +1,180 @@
from pelican import readers
from pelican.readers import PelicanHTMLTranslator
from pelican import signals from pelican import signals
from docutils import nodes from bs4 import BeautifulSoup as bs
import re
import copy
# Parameter defaults. ``init_headerid`` overwrites these with the values
# found in the Pelican settings.
LINK_CHAR = "*"       # link text placed inside every permalink anchor
GENERATE_IDS = False  # whether to create ids for headings lacking id/name
SUBTREE = None        # id of the element that bounds the heading search
def init_headerid(sender):
    """Load the plugin parameters from the Pelican settings.

    Registered as callback for the `initialized` signal.

    Parameters
    ----------
    sender: pelican.Pelican
        Pelican object whose ``settings`` mapping holds the values
        from `pelicanconf.py`

    Returns
    -------
    None

    Notes
    -----
    Rebinds the module globals `LINK_CHAR`, `GENERATE_IDS` and `SUBTREE`.
    `LINK_CHAR` is only replaced when ``HEADERID_LINK_CHAR`` is set to a
    truthy value; it is then stored as a parsed `BeautifulSoup` object.
    """
    global LINK_CHAR, GENERATE_IDS, SUBTREE

    settings = sender.settings

    link_char = settings.get('HEADERID_LINK_CHAR')
    if link_char:
        # Parsed once here so that HTML snippets can be inserted as tags.
        LINK_CHAR = bs(link_char, 'html.parser')

    GENERATE_IDS = bool(settings.get('GENERATE_IDS'))
    SUBTREE = settings.get('SUBTREE')
def headerid_main(page_generator, content):
    """Entry point of the `headerid` plugin.

    Takes the HTML that Pelican generated for a page and stores it back
    with a link anchor added to every heading that has an identifier.
    Registered as callback for the `page_generator_write_page` signal.

    Parameters
    ----------
    page_generator: pelican.generators.PagesGenerator
        Not used; only present because the signal passes it.
    content: pelican.contents.Page
        Page object whose ``_content`` HTML string is rewritten in place

    Returns
    -------
    None
    """
    html_with_anchors = _add_link_anchors(content._content)
    content._content = html_with_anchors
# Matches exactly the six HTML heading tag names, h1 through h6.
_HEADING_TAG_RE = re.compile(r'^h[1-6]$')


def _add_link_anchors(raw_html: str) -> str:
    r"""Append a link anchor to every heading that can be referenced.

    Adds the following link anchor (1) to every heading tag (2)
    that has an identifier and does not already contain an in-page link.

    - (1) <a class="headerlink" href="#{anchor_target}"
          title="Permalink to this headline">
              {LINK_CHAR}
          </a>
    - (2) Tags named `h1` to `h6` (Python RegEx `^h[1-6]$`)

    HTML parsing and tree modifications are done via `BeautifulSoup`.

    Parameters
    ----------
    raw_html: str
        Python string representing DOM tree

    Returns
    -------
    str
        Input DOM tree with the added link anchors (1).
        Empty or missing input is returned unchanged.

    Notes
    -----
    Makes use of global `SUBTREE` and `LINK_CHAR` variables.
    If `SUBTREE` is set but no tag carries that ``id``,
    no anchors are added.
    See `README` for further information.
    """
    if not raw_html:
        # Nothing to parse.
        return raw_html

    soup = bs(raw_html, 'html.parser')

    # Search the whole document unless a subtree root is configured.
    root = soup.find(id=SUBTREE) if SUBTREE else soup
    if root is None:
        return str(soup)

    for heading in root.find_all(_HEADING_TAG_RE):
        # Skip headings that already carry an in-page link.
        if _has_anchor(str(heading)):
            continue

        anchor_target = _get_target(heading)
        if anchor_target is None:
            continue

        anchor = soup.new_tag("a", attrs={"href": f"#{anchor_target}",
                                          "title": "Permalink to this headline",
                                          "class": "headerlink"})
        # LINK_CHAR may be a parsed HTML snippet; every anchor needs its
        # own copy because a node can only live in one place in the tree.
        anchor.append(copy.deepcopy(LINK_CHAR))
        heading.append(anchor)

    return str(soup)
def _has_anchor(inp: str) -> bool:
    """Check an HTML string for an in-page link.

    Take in an HTML string and check for an anchor-tag
    with href-Attribute starting with `#`.
    Anchor-tags without an href-Attribute (e.g. ``<a name="x">``)
    are ignored.

    Parameters
    ----------
    inp: str
        `inp` is interpreted as HTML and parsed via `BeautifulSoup`

    Returns
    -------
    bool
        ``True`` if at least one such anchor-tag is present
    """
    soup = bs(inp, 'html.parser')
    # `.get` instead of `link['href']`: subscripting raises KeyError
    # for anchor-tags that have no href-Attribute.
    return any(
        (link.get('href') or '').startswith('#')
        for link in soup.find_all('a')
    )
def _get_target(tag) -> "str | None":
    """Get identifier for given HTML tag

    Take a tag and extract something an anchor-tag can reference
    like ID or name.
    If neither present && GENERATE_IDS flag,
    create and set ID based on tag contents.

    Parameters
    ----------
    tag: bs4.element.Tag
        HTML tag to get target for

    Returns
    -------
    str or None
        Referenceable ID or name.
        ``None`` if the tag has neither and no ID was generated, either
        because `GENERATE_IDS` is off or because the tag text contains
        no word characters to build an ID from.

    Notes
    -----
    Makes use of the global `GENERATE_IDS` flag.
    Tags with identical text receive identical generated IDs.
    See `README` for further information.
    """
    existing = tag.attrs.get('id') or tag.attrs.get('name')
    if existing:
        return existing

    if not GENERATE_IDS:
        return None

    # Strip everything that is not a word character from the tag text.
    generated = re.sub(r'\W+', '', tag.text)
    if not generated:
        # An empty ID would yield a useless `href="#"` link.
        return None

    tag['id'] = generated
    return generated
def register():
    """Hook the plugin callbacks up to their Pelican signals.

    Each callback runs whenever Pelican issues the paired signal.
    """
    subscriptions = (
        (signals.initialized, init_headerid),
        (signals.page_generator_write_page, headerid_main),
    )
    for signal, callback in subscriptions:
        signal.connect(callback)
# Translator subclass whose depart_title appends a permalink anchor to the
# output before handing over to the base-class implementation.
# NOTE(review): indentation is lost in this view, so the nesting of this
# class and of the statements below cannot be confirmed from here - verify
# against the repository before editing.
class HeaderIDPatchedPelicanHTMLTranslator(PelicanHTMLTranslator):
def depart_title(self, node):
# Last entry of the translator's context stack; presumably the closing tag
# queued for this title - confirm.
close_tag = self.context[-1]
parent = node.parent
# Only titles whose parent is a section with at least one id can be linked.
if isinstance(parent, nodes.section) and parent.hasattr('ids') and parent['ids']:
anchor_name = parent['ids'][0]
# add permalink anchor
# Restricted to closing tags that start with '</h'.
if close_tag.startswith('</h'):
self.body.append(
'<a class="headerlink" href="#%s" title="Permalink to this headline">%s</a>' %
(anchor_name, LINK_CHAR))
# Delegate to the base-class implementation.
PelicanHTMLTranslator.depart_title(self, node)
# Rebind the translator name exposed by pelican.readers to the patched class.
readers.PelicanHTMLTranslator = HeaderIDPatchedPelicanHTMLTranslator

View file

@ -18,7 +18,7 @@ LOCALE = 'en_US.UTF-8'
# #
PATH = 'content' PATH = 'content'
PLUGIN_PATHS = ['pelican-plugins'] PLUGIN_PATHS = ['pelican-plugins']
PLUGINS = ['tipue_search', 'sitemap'] PLUGINS = ['tipue_search', 'sitemap', 'headerid']
SITEMAP = { 'format': 'xml' } SITEMAP = { 'format': 'xml' }
THEME = 'theme' THEME = 'theme'
@ -41,8 +41,6 @@ EXTRA_PATH_METADATA = {
"static/site.webmanifest": {'path': ''}, "static/site.webmanifest": {'path': ''},
} }
#HEADERID_LINK_CHAR = '<i class="icon-link"></i>'
FEED_ALL_ATOM = None FEED_ALL_ATOM = None
AUTHOR_SAVE_AS = False AUTHOR_SAVE_AS = False
@ -55,3 +53,9 @@ MENUITEMS = ( ('About', 'about.html'),
('Explore', 'explore.html'), ('Explore', 'explore.html'),
('Publications', 'publications.html'), ('Publications', 'publications.html'),
) )
#
# headerid parameters
#
# Link text appended to each heading as its permalink anchor.
HEADERID_LINK_CHAR = ' #'
# Create an id for headings that have neither an id nor a name attribute.
GENERATE_IDS = True