2024-05-13 09:11:59 +00:00
3 changed files with 194 additions and 25 deletions
--- a/pelican-plugins/headerid/README.rst
+++ b/pelican-plugins/headerid/README.rst
@ -5,12 +5,28 @@ This plugin adds an anchor to each heading so you can deep-link to headers.
 It is intended for formats such as reStructuredText that do not natively
 generate these anchors.
 The ``HEADERID_LINK_CHAR`` config can be set to use a different char from ``*``
 for anchor text.
 For Markdown, this plugin is less relevant since the Python-Markdown library
 includes a Table of Contents extension that will generate link anchors.
 To enable the ``toc`` extension, add a line similar to the following example
 to your Pelican settings file::
    MD_EXTENSIONS = ["codehilite(css_class=highlight)", "extra", "toc"]
 Parameters
 ----------
 Set these parameters in the ``pelicanconf.py`` config file.
 HEADERID_LINK_CHAR: str
    Text used as the link text of the generated anchors. Defaults to ``*``.
    The value is parsed as HTML, so it may contain markup.
 GENERATE_IDS: bool
    If ``True``, the plugin creates an ``id`` for every heading
    that has neither an ``id`` nor a ``name`` attribute.
    The generated IDs are used by the link anchors to reference the headings.
    Defaults to ``False``.
 SUBTREE: str
    Only process a subtree of the page instead of the entire page.
    Set this to the ``id`` of an element.
    The search for headings then starts at the first tag with a matching
    ``id`` attribute instead of at ``<html>``.
    If no tag has that ``id``, no anchors are added.
--- a/pelican-plugins/headerid/headerid.py
+++ b/pelican-plugins/headerid/headerid.py
@ -1,31 +1,180 @@
 from pelican import readers
 from pelican.readers import PelicanHTMLTranslator
 from pelican import signals
-from docutils import nodes
+from bs4 import BeautifulSoup as bs
 import re
 import copy
 # Parameter defaults; init_headerid() may overwrite them from the Pelican settings.
 # Link text of the generated anchors (setting: HEADERID_LINK_CHAR).
 LINK_CHAR = '*'
 # Whether headings without `id`/`name` get a generated `id` (setting: GENERATE_IDS).
 GENERATE_IDS = False
 # `id` of the element whose subtree is searched for headings;
 # a falsy value means the whole page is searched (setting: SUBTREE).
 SUBTREE = None
 def init_headerid(sender):
    """Load the plugin parameters from the Pelican settings (`pelicanconf.py`).
    Is registered as callback function to the `initialized` signal.
    The values are stored in the module-level globals `LINK_CHAR`,
    `GENERATE_IDS` and `SUBTREE`.
    Parameters
    ----------
    sender: pelican.Pelican
        Pelican object containing process meta information
    Returns
    -------
    None
    """
    global LINK_CHAR
    global GENERATE_IDS
    global SUBTREE
    # Only a non-empty HEADERID_LINK_CHAR replaces the default link text.
    char = sender.settings.get('HEADERID_LINK_CHAR')
    if char:
-        LINK_CHAR = char
+        LINK_CHAR = bs(char, 'html.parser')
    # Both settings are looked up without a default, so a missing
    # GENERATE_IDS becomes False and a missing SUBTREE becomes None.
    GENERATE_IDS = bool(sender.settings.get('GENERATE_IDS'))
    SUBTREE = sender.settings.get('SUBTREE')
 def headerid_main(page_generator, content):
    """Entry point of the `headerid` plugin for a single page.
    Takes the HTML that Pelican generated for the page, has a link anchor
    added to every heading-tag that has an identifier and stores the
    resulting HTML back on the page object.
    Is registered as CallBack function to the `page_generator_write_page`
    signal.
    Parameters
    ----------
    page_generator: pelican.generators.PagesGenerator
        Not used by this function; only part of the CallBack signature.
    content: pelican.contents.Page
        Pelican Page object whose `_content` HTML string gets replaced
    Returns
    -------
    None
    """
    original_html = content._content
    annotated_html = _add_link_anchors(original_html)
    content._content = annotated_html
 def _add_link_anchors(raw_html: str) -> str:
    r"""Append a permalink anchor to every heading that has an identifier.
    Adds the following link anchor (1) to every headline-tag (2) that can be
    referenced and does not already contain a link to a fragment.
        - (1)   <a class="headerlink" href="#{anchor_target}"
                title="Permalink to this headline">
                    {LINK_CHAR}
                </a>
        - (2) Tags `h1` to `h6`, i.e. names matching `^h[1-6]$` (Python RegEx)
    HTML parsing and tree modifications are done via `BeautifulSoup`.
    Parameters
    ----------
    raw_html: str
        Python string representing DOM tree
    Returns
    -------
    str
        Input DOM tree with the added link anchors (1)
    Notes
    -----
    Makes use of global `SUBTREE` and `LINK_CHAR` variables.
    If `SUBTREE` is set but no tag with that `id` exists, no anchor is added.
    See `README` for further information.
    """
    soup = bs(raw_html, 'html.parser')
    # Anchored at both ends so that only real heading tags match; the
    # former pattern `^h*\d` also accepted names such as `hh1` or `h1-foo`.
    heading_name = re.compile(r'^h[1-6]$')
    # Search the whole document unless SUBTREE names a root element.
    if SUBTREE:
        root = soup.find(id=SUBTREE)
    else:
        root = soup
    headers = root.find_all(heading_name) if root is not None else []
    # add in link anchor
    for tag in headers:
        # Headings that already link to a fragment are left untouched.
        if _has_anchor(str(tag)):
            continue
        anchor_target = _get_target(tag)
        if anchor_target is None:
            continue
        anchor = soup.new_tag(
            "a",
            attrs={
                "href": f"#{anchor_target}",
                "title": "Permalink to this headline",
                "class": "headerlink",
            },
        )
        # LINK_CHAR is shared by all headings, so every anchor gets its own
        # copy; appending a parsed element itself would move it out of the
        # anchor it was appended to before.
        anchor.append(copy.deepcopy(LINK_CHAR))
        tag.append(anchor)
    return str(soup)
 def _has_anchor(inp: str) -> bool:
    """Check an HTML string for an anchor-tag that links to a fragment.
    Take in an HTML string and check for an anchor-tag
    with href-Attribute starting with `#`.
    Anchor-tags without href-Attribute are ignored.
    Parameters
    ----------
    inp: str
        `inp` is interpreted as HTML and parsed via `BeautifulSoup`
    Returns
    -------
    bool
        True if at least one `<a>` has an `href` starting with `#`
    """
    soup = bs(inp, 'html.parser')
    # `tag.get` instead of `tag['href']`: an `<a>` without `href`
    # (e.g. `<a name="...">`) would otherwise raise a KeyError.
    return any(
        tag.get('href', '').startswith('#') for tag in soup.find_all('a')
    )
 def _get_target(tag) -> "str | None":
    """Get identifier for given HTML tag
    Take a tag and extract something an anchor-tag can reference
    like ID or name.
    If neither present && GENERATE_IDS flag,
    create and set ID based on tag contents.
    Parameters
    ----------
    tag: bs4.element.Tag
        HTML tag to get target for
    Returns
    -------
    str or None
        Referenceable ID or name.
        None if the tag has neither and no ID was generated, either because
        `GENERATE_IDS` is off or because the tag text contains no word
        characters to build an ID from.
    Notes
    -----
    Makes use of the global `GENERATE_IDS` flag.
    See `README` for further information.
    """
    # A non-empty `id` wins over a non-empty `name`.
    existing = tag.attrs.get('id') or tag.attrs.get('name')
    if existing:
        return existing
    if not GENERATE_IDS:
        return None
    # The generated ID consists of the word characters of the heading text.
    # NOTE: headings with identical text end up with identical IDs.
    generated_id = re.sub(r'\W+', '', tag.text)
    if not generated_id:
        # Empty or punctuation-only heading: an empty ID cannot be
        # referenced and would only produce an `href="#"` anchor.
        return None
    tag['id'] = generated_id
    return generated_id
 def register():
    """Connect the plugin callbacks to the Pelican signals and patch the HTML translator."""
    # Connect CallBack function to signal.
    # Function gets executed when pelican issues the signal.
    signals.initialized.connect(init_headerid)
-
+    signals.page_generator_write_page.connect(headerid_main)
    # Translator subclass that appends a permalink anchor to the body
    # right before a section title is closed.
    # NOTE(review): `nodes` is used below, but `from docutils import nodes`
    # is marked as removed at the top of this diff -- confirm whether this
    # class is still meant to be part of the file.
    class HeaderIDPatchedPelicanHTMLTranslator(PelicanHTMLTranslator):
        def depart_title(self, node):
            # Closing tag recorded for the title currently being departed.
            close_tag = self.context[-1]
            parent = node.parent
            # Only titles of sections that carry at least one ID are linked.
            if isinstance(parent, nodes.section) and parent.hasattr('ids') and parent['ids']:
                anchor_name = parent['ids'][0]
                # add permalink anchor
                if close_tag.startswith('</h'):
                    self.body.append(
                        '<a class="headerlink" href="#%s" title="Permalink to this headline">%s</a>' %
                        (anchor_name, LINK_CHAR))
            PelicanHTMLTranslator.depart_title(self, node)
    # Replace the translator class on `pelican.readers` with the patched one.
    readers.PelicanHTMLTranslator = HeaderIDPatchedPelicanHTMLTranslator
--- a/pelicanconf.py
+++ b/pelicanconf.py
@ -18,7 +18,7 @@ LOCALE = 'en_US.UTF-8'
 #
 PATH = 'content'
 PLUGIN_PATHS = ['pelican-plugins']
-PLUGINS = ['tipue_search', 'sitemap']
+PLUGINS = ['tipue_search', 'sitemap', 'headerid']
 SITEMAP = { 'format': 'xml' }
 THEME = 'theme'
@ -41,8 +41,6 @@ EXTRA_PATH_METADATA = {
    "static/site.webmanifest": {'path': ''},
 }
 #HEADERID_LINK_CHAR = '<i class="icon-link"></i>'
 FEED_ALL_ATOM = None
 AUTHOR_SAVE_AS = False
@ -55,3 +53,9 @@ MENUITEMS = ( ('About', 'about.html'),
              ('Explore', 'explore.html'),
              ('Publications', 'publications.html'),
 )
 #
 # headerid parameters (see pelican-plugins/headerid/README.rst)
 #
 # Link text of the permalink anchor appended to each heading.
 HEADERID_LINK_CHAR = ' #'
 # Create an id for headings that have neither an id nor a name attribute.
 GENERATE_IDS = True