Smarter ID rewrite ordering

Signed-off-by: Chris Warrick <kwpolska@gmail.com>
getnikola · May 15, 2017 · 91204c8 · 91204c8
1 parent 18c76b1
commit 91204c8
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 2 deletions.
diff --git a/docs/manual.txt b/docs/manual.txt
@@ -1933,6 +1933,19 @@ deduplicate_ids
    offending IDs. If used alongside ``add_header_permalinks``, it will fix
    those links (it must run **after** that filter)
 
+   IDs are numbered from the bottom up, which is useful for indexes (updates
+   appear at the top). There are exceptions, which may be configured using
+   ``DEDUPLICATE_IDS_TOP_CLASSES`` — if any of those classes appears in the
+   document, the IDs are rewritten top-down, which is useful for posts/pages
+   (updates appear at the bottom).
+
+   Note that, in rare cases, edits to a document may change these IDs, so
+   permalinks might not always be *permanent*.
+
+   .. code:: python
+
+      DEDUPLICATE_IDS_TOP_CLASSES = ('postpage', 'storypage')
+
 You can apply filters to specific posts or pages by using the ``filters`` metadata field:
 
 .. code:: restructuredtext

diff --git a/nikola/filters.py b/nikola/filters.py
@@ -439,9 +439,12 @@ def add_header_permalinks(data, xpath_list=None):
     return lxml.html.tostring(doc, encoding="unicode")
 
 
+@_ConfigurableFilter(top_classes='DEDUPLICATE_IDS_TOP_CLASSES')
 @apply_to_text_file
-def deduplicate_ids(data):
+def deduplicate_ids(data, top_classes=None):
     """Post-process HTML via lxml to deduplicate IDs."""
+    if not top_classes:
+        top_classes = ('postpage', 'storypage')
     doc = lxml.html.document_fromstring(data)
     elements = doc.xpath('//*')
     all_ids = [element.attrib.get('id') for element in elements]
@@ -459,7 +462,17 @@ def deduplicate_ids(data):
             # Results are ordered the same way they are ordered in document
             offending_elements = doc.xpath('//*[@id="{}"]'.format(i))
             counter = 2
-            for e in offending_elements[-2::-1]:
+            # If this is a story or a post, do it from top to bottom, because
+            # updates to those are more likely to appear at the bottom of pages.
+            # For anything else, including indexes, do it from bottom to top,
+            # because new posts appear at the top of pages.
+            # We also leave the first result out, so there is one element with
+            # "plain" ID
+            if any(doc.find_class(c) for c in top_classes):
+                off = offending_elements[1:]
+            else:
+                off = offending_elements[-2::-1]
+            for e in off:
                 new_id = i
                 while doc.xpath('//*[@id="{}"]'.format(new_id)):
                     new_id = '{0}-{1}'.format(i, counter)