Merge pull request #2763 from getnikola/header-deduplication
Header/id deduplication
Kwpolska committed May 21, 2017
2 parents a54a3e1 + 5237119 commit 26af25c
Showing 4 changed files with 78 additions and 5 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
@@ -4,6 +4,8 @@ New in master
Features
--------

* New ``deduplicate_ids`` filter, for preventing duplication of HTML id
  attributes (Issue #2570)
* Ported gallery image layout to base theme (Issue #2775)
* Better error handling when posts can't be parsed (Issue #2771)
* Use ``.theme`` files to store theme metadata (Issue #2758)
22 changes: 21 additions & 1 deletion docs/manual.txt
@@ -1938,7 +1938,7 @@ add_header_permalinks
text-decoration: none;
}

-Additionally, you can provide a custom list of XPath expressions which should be used for finding headers (``{hx}}`` is replaced by headers h1 through h6).
+Additionally, you can provide a custom list of XPath expressions which should be used for finding headers (``{hx}`` is replaced by headers h1 through h6).
This is required if you use a custom theme that does not use ``"e-content entry-content"`` as a class for post and page contents.

.. code:: python
@@ -1947,10 +1947,30 @@ add_header_permalinks
   # Include *every* header (not recommended):
   # HEADER_PERMALINKS_XPATH_LIST = ['*//{hx}']
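
If your theme wraps post contents in a different container, point the list at that container instead. A minimal sketch, assuming a hypothetical theme that puts contents in ``<article class="post-body">``:

.. code:: python

   # "post-body" is a made-up class name; substitute your theme's own.
   HEADER_PERMALINKS_XPATH_LIST = ['*//article[@class="post-body"]//{hx}']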


deduplicate_ids
Prevent duplicated IDs in HTML output. An incrementing counter is added to
offending IDs. If used alongside ``add_header_permalinks``, it will fix
those links (it must run **after** that filter).

IDs are numbered from the bottom up, which is useful for indexes (updates
appear at the top). Exceptions can be configured using
``DEDUPLICATE_IDS_TOP_CLASSES``: if any of those classes appears in the
document, the IDs are rewritten top-down, which is useful for posts/pages
(updates appear at the bottom).
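
For example, with two headings sharing ``id="updates"`` on an index page, the bottommost occurrence keeps the plain ID. A minimal standalone sketch of the renaming order, using lxml directly rather than the Nikola API:

.. code:: python

   import lxml.html

   html = ('<body><article><h2 id="updates">new post</h2></article>'
           '<article><h2 id="updates">old post</h2></article></body>')
   doc = lxml.html.document_fromstring(html)
   dupes = doc.xpath('//*[@id="updates"]')
   # No top class on an index, so rename bottom-up, leaving the
   # bottommost occurrence untouched.
   for counter, element in enumerate(dupes[-2::-1], start=2):
       element.attrib['id'] = 'updates-{0}'.format(counter)
   print(lxml.html.tostring(doc, encoding='unicode'))
   # The top heading becomes id="updates-2"; the bottom keeps id="updates".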

Note that in rare cases, renumbering means permalinks might not remain
*permanent* after edits.

.. code:: python

   DEDUPLICATE_IDS_TOP_CLASSES = ('postpage', 'storypage')
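
Both filters are enabled through the regular ``FILTERS`` setting; a minimal sketch, assuming both run on HTML output (``deduplicate_ids`` must come after ``add_header_permalinks``):

.. code:: python

   FILTERS = {
       '.html': ['filters.add_header_permalinks', 'filters.deduplicate_ids'],
   }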

You can also use a file blacklist (``HEADER_PERMALINKS_FILE_BLACKLIST``),
which is useful for some index pages. Paths include the output directory
(e.g. ``output/index.html``).
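
For instance, to skip the main index page (a sketch reusing the path from the example above):

.. code:: python

   HEADER_PERMALINKS_FILE_BLACKLIST = ['output/index.html']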


You can apply filters to specific posts or pages by using the ``filters`` metadata field:

.. code:: restructuredtext
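
   .. filters: filters.typogrify

(The value above is an illustrative sketch, not necessarily the manual's original example; any filter names accepted by the ``FILTERS`` setting can be listed.)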
2 changes: 1 addition & 1 deletion nikola/conf.py.in
@@ -587,7 +587,7 @@ GITHUB_COMMIT_SOURCE = True
# HTML_TIDY_EXECUTABLE = 'tidy5'

# List of XPath expressions which should be used for finding headers
-# ({hx}} is replaced by headers h1 through h6).
+# ({hx} is replaced by headers h1 through h6).
# You must change this if you use a custom theme that does not use
# "e-content entry-content" as a class for post and page contents.
# HEADER_PERMALINKS_XPATH_LIST = ['*//div[@class="e-content entry-content"]//{hx}']
57 changes: 54 additions & 3 deletions nikola/filters.py
@@ -442,6 +442,57 @@ def add_header_permalinks(fname, xpath_list=None, file_blacklist=None):
            new_node = lxml.html.fragment_fromstring('<a href="#{0}" class="headerlink" title="Permalink to this heading">¶</a>'.format(hid))
            node.append(new_node)

-    data = lxml.html.tostring(doc, encoding="unicode")
-    with io.open(fname, 'w+', encoding='utf-8') as outf:
-        outf.write(data)
+    with io.open(fname, 'w', encoding='utf-8') as outf:
+        outf.write(lxml.html.tostring(doc, encoding="unicode"))


@_ConfigurableFilter(top_classes='DEDUPLICATE_IDS_TOP_CLASSES')
@apply_to_text_file
def deduplicate_ids(data, top_classes=None):
    """Post-process HTML via lxml to deduplicate IDs."""
    if not top_classes:
        top_classes = ('postpage', 'storypage')
    doc = lxml.html.document_fromstring(data)
    elements = doc.xpath('//*')
    all_ids = [element.attrib.get('id') for element in elements]
    seen_ids = set()
    duplicated_ids = set()
    for i in all_ids:
        if i is not None and i in seen_ids:
            duplicated_ids.add(i)
        else:
            seen_ids.add(i)

    if duplicated_ids:
        # Well, that sucks.
        for i in duplicated_ids:
            # Results are ordered the same way they are ordered in document
            offending_elements = doc.xpath('//*[@id="{}"]'.format(i))
            counter = 2
            # If this is a story or a post, do it from top to bottom, because
            # updates to those are more likely to appear at the bottom of
            # pages. For anything else, including indexes, do it from bottom
            # to top, because new posts appear at the top of pages. We also
            # leave the first result out, so there is one element with a
            # "plain" ID.
            if any(doc.find_class(c) for c in top_classes):
                off = offending_elements[1:]
            else:
                off = offending_elements[-2::-1]
            for e in off:
                new_id = i
                while new_id in seen_ids:
                    new_id = '{0}-{1}'.format(i, counter)
                    counter += 1
                e.attrib['id'] = new_id
                seen_ids.add(new_id)
                # Find headerlinks that we can fix.
                headerlinks = e.find_class('headerlink')
                for hl in headerlinks:
                    # We might get headerlinks of child elements
                    if hl.attrib['href'] == '#' + i:
                        hl.attrib['href'] = '#' + new_id
                        break
        return lxml.html.tostring(doc, encoding='unicode')
    else:
        return data
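
A minimal usage sketch, assuming the filter is invoked directly (``apply_to_text_file`` turns the string-to-string function above into one that rewrites the named file in place):

    from nikola.filters import deduplicate_ids
    deduplicate_ids('output/index.html')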
