2024-05-13 09:11:59 +00:00
3 changed files with 194 additions and 25 deletions
--- a/pelican-plugins/headerid/README.rst
+++ b/pelican-plugins/headerid/README.rst
@ -5,12 +5,28 @@ This plugin adds an anchor to each heading so you can deep-link to headers.
 It is intended for formats such as reStructuredText that do not natively
 generate these anchors.

-The ``HEADERID_LINK_CHAR`` config can be set to use a different char from ``*``
-for anchor text.
-
 For Markdown, this plugin is less relevant since the Python-Markdown library
 includes a Table of Contents extension that will generate link anchors.
 To enable the ``toc`` extension, add a line similar to the following example
 to your Pelican settings file::

    MD_EXTENSIONS = ["codehilite(css_class=highlight)", "extra", "toc"]
+
+Parameters
+----------
+Set these parameters in the ``pelicanconf.py`` config file.
+
+HEADERID_LINK_CHAR: str
+    Text used as the anchor's link text in place of the default ``*``.
+    The value is parsed as HTML, so it may contain markup.
+
+GENERATE_IDS: bool
+    If ``True``, the plugin creates an ``id`` for every heading
+    that has neither an ``id`` nor a ``name`` attribute, so that
+    a link anchor can reference it. Defaults to ``False``.
+
+SUBTREE: str
+    Only process a subtree of the page instead of the entire document.
+    Set this to the ``id`` of an element: the first tag with a matching
+    ``id`` attribute replaces ``<html>`` as the root of the heading search.
+    Defaults to ``None``, which processes the whole page.
--- a/pelican-plugins/headerid/headerid.py
+++ b/pelican-plugins/headerid/headerid.py
@ -1,31 +1,180 @@
-from pelican import readers
-from pelican.readers import PelicanHTMLTranslator
 from pelican import signals
-from docutils import nodes
+from bs4 import BeautifulSoup as bs
+import re
+import copy

+# Parameter defaults; init_headerid() overrides them from the Pelican settings.
 LINK_CHAR = '*'
+GENERATE_IDS = False
+SUBTREE = None


 def init_headerid(sender):
+    """Read the plugin parameters from the Pelican settings.
+    Is registered as callback function to the `initialized` signal.
+
+    Parameters
+    ----------
+    sender: pelican.Pelican
+        Pelican object; its `settings` hold the `pelicanconf.py` values
+
+    Returns
+    -------
+    None
+        Only the globals `LINK_CHAR`, `GENERATE_IDS`, `SUBTREE` are set.
+    """
    global LINK_CHAR
+    global GENERATE_IDS
+    global SUBTREE
+    # A set HEADERID_LINK_CHAR is parsed as HTML; else the default stays.
    char = sender.settings.get('HEADERID_LINK_CHAR')
    if char:
-        LINK_CHAR = char
+        LINK_CHAR = bs(char, 'html.parser')
+
+    GENERATE_IDS = bool(sender.settings.get('GENERATE_IDS'))
+    SUBTREE = sender.settings.get('SUBTREE')
+
+
+def headerid_main(page_generator, content):
+    """Entry point of the `headerid` plugin.
+
+    Callback for the `page_generator_write_page` signal: takes the HTML
+    that Pelican generated for a page and stores it back with a link
+    anchor added to every heading-tag that has an identifier.
+
+    Parameters
+    ----------
+    page_generator: pelican.generators.PagesGenerator
+        Not used by the current implementation.
+        Still required by the callback signature.
+
+    content: pelican.contents.Page
+        Pelican Page object whose HTML string gets replaced
+
+    Returns
+    -------
+    None
+    """
+    page_html = content._content
+    page_html_with_anchors = _add_link_anchors(page_html)
+    content._content = page_html_with_anchors
+
+
+def _add_link_anchors(raw_html: str) -> str:
+    r"""
+    Adds the following link anchor (1) to every headline-tag (2)
+    that has an identifier and does not yet contain a `#`-link.
+        - (1)   <a class="headerlink" href="#{anchor_target}"
+                title="Permalink to this headline">
+                    {LINK_CHAR}
+                </a>
+        - (2) Tags `h1` to `h6`, i.e. names matching `^h[1-6]$`
+
+    HTML parsing and tree modifications are done via `BeautifulSoup`.
+
+    Parameters
+    ----------
+    raw_html: str
+        Python string representing DOM tree
+
+    Returns
+    -------
+    str
+        Input DOM tree with the added link anchors (1).
+        Empty input is returned unchanged.
+
+    Notes
+    -----
+    Makes use of global `SUBTREE` and `LINK_CHAR` variables.
+    If `SUBTREE` is set but no tag has that `id`, nothing is added.
+    See `README` for further information.
+    """
+    if not raw_html:
+        return raw_html
+
+    soup = bs(raw_html, 'html.parser')
+    # real headings only: `^h*\d` would also match names such as `h1x`
+    heading_name = re.compile(r'^h[1-6]$')
+    root = soup.find(id=SUBTREE) if SUBTREE else soup
+    headers = root.find_all(heading_name) if root is not None else []
+
+    for tag in headers:
+        # leave headings alone that already carry a `#`-link
+        if _has_anchor(str(tag)):
+            continue
+
+        anchor_target = _get_target(tag)
+        if anchor_target is None:
+            continue
+
+        anchor = soup.new_tag("a", attrs={"href": f"#{anchor_target}",
+                                          "title": "Permalink to this headline",
+                                          "class": "headerlink"})
+        # append a copy: LINK_CHAR is shared by all anchors
+        anchor.append(copy.deepcopy(LINK_CHAR))
+        tag.append(anchor)
+
+    return str(soup)
+
+
+def _has_anchor(inp: str) -> bool:
+    """
+    Check an HTML string for an anchor-tag whose href-Attribute starts
+    with `#`. Anchor-tags without href-Attribute are ignored.
+
+    Parameters
+    ----------
+    inp: str
+        `inp` is interpreted as HTML and parsed via `BeautifulSoup`
+
+    Returns
+    -------
+    bool
+    """
+    soup = bs(inp, 'html.parser')
+    return any(a.get('href', '').startswith('#') for a in soup.find_all('a'))
+
+
+def _get_target(tag) -> "str | None":
+    """Get identifier for given HTML tag
+
+    Take a tag and extract something an anchor-tag can reference
+    like ID or name.
+
+    If neither present && GENERATE_IDS flag,
+    create and set ID based on tag contents.
+
+    Parameters
+    ----------
+    tag: bs4.element.Tag
+        HTML tag to get target for
+
+    Returns
+    -------
+    str or None
+        Referenceable ID or name.
+        `None` if the tag has none and no usable ID could be generated.
+
+    Notes
+    -----
+    Makes use of the global `GENERATE_IDS` flag.
+    See `README` for further information.
+    """
+
+    target = tag.attrs.get('id') or tag.attrs.get('name')
+    if not target and GENERATE_IDS:
+        # Text without word characters gives an empty slug; do not set
+        # `id=""` for it, because the anchor would link to a bare `#`.
+        target = re.sub(r'\W+', '', tag.text)
+        if target:
+            tag['id'] = target
+
+    return target or None
+

 def register():
+    # Connect each callback to its Pelican signal; a callback runs
+    # whenever Pelican issues that signal: `initialized` reads the
+    # settings, `page_generator_write_page` adds the link anchors.
    signals.initialized.connect(init_headerid)
-
-
-    class HeaderIDPatchedPelicanHTMLTranslator(PelicanHTMLTranslator):
-        def depart_title(self, node):
-            close_tag = self.context[-1]
-            parent = node.parent
-            if isinstance(parent, nodes.section) and parent.hasattr('ids') and parent['ids']:
-                anchor_name = parent['ids'][0]
-                # add permalink anchor
-                if close_tag.startswith('</h'):
-                    self.body.append(
-                        '<a class="headerlink" href="#%s" title="Permalink to this headline">%s</a>' %
-                        (anchor_name, LINK_CHAR))
-            PelicanHTMLTranslator.depart_title(self, node)
-    readers.PelicanHTMLTranslator = HeaderIDPatchedPelicanHTMLTranslator
+    signals.page_generator_write_page.connect(headerid_main)
--- a/pelicanconf.py
+++ b/pelicanconf.py
@ -18,7 +18,7 @@ LOCALE = 'en_US.UTF-8'
 #
 PATH = 'content'
 PLUGIN_PATHS = ['pelican-plugins']
-PLUGINS = ['tipue_search', 'sitemap']
+PLUGINS = ['tipue_search', 'sitemap', 'headerid']
 SITEMAP = { 'format': 'xml' }

 THEME = 'theme'
@ -41,8 +41,6 @@ EXTRA_PATH_METADATA = {
    "static/site.webmanifest": {'path': ''},
 }

-#HEADERID_LINK_CHAR = '<i class="icon-link"></i>'
-
 FEED_ALL_ATOM = None
 AUTHOR_SAVE_AS = False

@ -55,3 +53,9 @@ MENUITEMS = ( ('About', 'about.html'),
              ('Explore', 'explore.html'),
              ('Publications', 'publications.html'),
 )
+
+#
+# headerid parameters
+#
+HEADERID_LINK_CHAR = ' #'
+GENERATE_IDS = True