Skip to content

Commit

Permalink
Fix #2570 -- new deduplicate_ids filter
Browse files Browse the repository at this point in the history
Signed-off-by: Chris Warrick <kwpolska@gmail.com>
  • Loading branch information
Kwpolska committed May 14, 2017
1 parent 3b748b0 commit c393e22
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
Expand Up @@ -4,6 +4,8 @@ New in master
Features
--------

* New ``deduplicate_ids`` filter, for preventing duplication of HTML id
attributes (Issue #2570)
* New ``add_header_permalinks`` filter, for Sphinx-style header links
(Issue #2636)

Expand Down
5 changes: 5 additions & 0 deletions docs/manual.txt
Expand Up @@ -1928,6 +1928,11 @@ add_header_permalinks
# Include *every* header (not recommended):
# HEADER_PERMALINKS_XPATH_LIST = ['*//{hx}']

deduplicate_ids
Prevent duplicated IDs in HTML output. An incrementing counter is added to
offending IDs. If used alongside ``add_header_permalinks``, it will fix
those links (it must run **after** that filter).

You can apply filters to specific posts or pages by using the ``filters`` metadata field:

.. code:: restructuredtext
Expand Down
34 changes: 34 additions & 0 deletions nikola/filters.py
Expand Up @@ -436,3 +436,37 @@ def add_header_permalinks(data, xpath_list=None):
new_node = lxml.html.fragment_fromstring('<a href="#{0}" class="headerlink" title="Permalink to this heading">¶</a>'.format(hid))
node.append(new_node)
return lxml.html.tostring(doc, encoding="unicode")

@apply_to_text_file
def deduplicate_ids(data):
    """Post-process HTML to deduplicate ``id`` attributes.

    Every element whose ``id`` duplicates an earlier occurrence is renamed
    to ``<id>-<counter>`` (counter starts at 2).  Suffixed ids that would
    collide with an id already present in the document are skipped, so the
    rewrite never introduces a fresh duplicate.  Header permalinks
    (``a.headerlink`` descendants, as produced by ``add_header_permalinks``)
    pointing at a renamed id are updated to match.

    data: HTML document as a string (supplied by ``apply_to_text_file``).
    Returns the (possibly rewritten) HTML as a unicode string; the input is
    returned unchanged when no duplicates exist.
    """
    doc = lxml.html.document_fromstring(data)
    all_ids = [element.attrib.get('id') for element in doc.xpath('//*')]

    seen_ids = set()
    duplicated_ids = set()
    for i in all_ids:
        if i is None:
            # Elements without an id are irrelevant; do not pollute seen_ids.
            continue
        if i in seen_ids:
            duplicated_ids.add(i)
        else:
            seen_ids.add(i)

    if not duplicated_ids:
        return data

    for i in duplicated_ids:
        # Results are ordered the same way they are ordered in document;
        # the first occurrence keeps its id.
        offending_elements = doc.xpath('//*[@id="{}"]'.format(i))
        counter = 2
        for e in offending_elements[1:]:
            # Probe until we find a suffix not already used in the document
            # (avoids creating a new duplicate when e.g. "foo-2" exists).
            new_id = '{0}-{1}'.format(i, counter)
            while new_id in seen_ids:
                counter += 1
                new_id = '{0}-{1}'.format(i, counter)
            seen_ids.add(new_id)
            e.attrib['id'] = new_id
            counter += 1
            # Fix header permalinks that referenced the old (duplicate) id.
            for hl in e.find_class('headerlink'):
                # find_class is recursive; only rewrite links that actually
                # target this element's old id, not a child's.
                if hl.attrib['href'] == '#' + i:
                    hl.attrib['href'] = '#' + new_id
    return lxml.html.tostring(doc, encoding='unicode')

0 comments on commit c393e22

Please sign in to comment.