make it fail

ralsina · ralsina · commit 8bd33e41b23a · 2015-06-04T18:24:13.000-03:00
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -4,6 +4,7 @@ New in master
 Features
 --------
 
+* New html5lib serializer creates better HTML (Issue #1768)
 * New --get-path option for ``nikola install_theme`` (Issue #1762)
 * New `nikola rst2html` command (Issue #1710)
 * New `nikola status` command (Issue #1740)
diff --git a/nikola/nikola.py b/nikola/nikola.py
@@ -974,9 +974,7 @@ def render_template(self, template_name, output_name, context):
         parser = lxml.html.HTMLParser(remove_blank_text=True)
         doc = lxml.html.document_fromstring(data, parser)
         doc.rewrite_links(lambda dst: self.url_replacer(src, dst, context['lang']))
-        data = b'<!DOCTYPE html>\n' + lxml.html.tostring(doc, encoding='utf8', method='html', pretty_print=True)
-        with open(output_name, "wb+") as post_file:
-            post_file.write(data)
+        utils.save_doc(doc, output_name)
 
     def url_replacer(self, src, dst, lang=None, url_type=None):
         """URL mangler.
@@ -1131,8 +1129,7 @@ def generic_rss_renderer(self, lang, title, link, description, timeline, output_
                         try:
                             body = doc.body
                             data = (body.text or '') + ''.join(
-                                [lxml.html.tostring(child, encoding='unicode')
-                                    for child in body.iterchildren()])
+                                [utils.doc_tostring(child) for child in body.iterchildren()])
                         except IndexError:  # No body there, it happens sometimes
                             data = ''
                     except lxml.etree.ParserError as e:
@@ -1694,8 +1691,7 @@ def atom_link(link_rel, link_type, link_href):
                     try:
                         body = doc.body
                         data = (body.text or '') + ''.join(
-                            [lxml.html.tostring(child, encoding='unicode')
-                                for child in body.iterchildren()])
+                            [utils.doc_tostring(child) for child in body.iterchildren()])
                     except IndexError:  # No body there, it happens sometimes
                         data = ''
                 except lxml.etree.ParserError as e:
diff --git a/nikola/post.py b/nikola/post.py
@@ -579,9 +579,9 @@ def text(self, lang=None, teaser_only=False, strip_html=False, show_read_more_li
             hyphenate(document, lang)
 
         try:
-            data = lxml.html.tostring(document.body, encoding='unicode')
+            data = utils.doc_tostring(document.body)
         except:
-            data = lxml.html.tostring(document, encoding='unicode')
+            data = utils.doc_tostring(document)
 
         if teaser_only:
             teaser = TEASER_REGEXP.split(data)[0]
@@ -604,9 +604,9 @@ def text(self, lang=None, teaser_only=False, strip_html=False, show_read_more_li
                 # This closes all open tags and sanitizes the broken HTML
                 document = lxml.html.fromstring(teaser)
                 try:
-                    data = lxml.html.tostring(document.body, encoding='unicode')
+                    data = utils.doc_tostring(document.body)
                 except IndexError:
-                    data = lxml.html.tostring(document, encoding='unicode')
+                    data = utils.doc_tostring(document)
 
         if data and strip_html:
             try:
@@ -621,9 +621,9 @@ def text(self, lang=None, teaser_only=False, strip_html=False, show_read_more_li
                 try:
                     document = lxml.html.fromstring(data)
                     demote_headers(document, self.demote_headers)
-                    data = lxml.html.tostring(document.body, encoding='unicode')
+                    data = utils.doc_tostring(document.body)
                 except (lxml.etree.ParserError, IndexError):
-                    data = lxml.html.tostring(document, encoding='unicode')
+                    data = utils.doc_tostring(document)
 
         return data
 
diff --git a/nikola/utils.py b/nikola/utils.py
@@ -28,6 +28,7 @@
 
 from __future__ import print_function, unicode_literals, absolute_import
 import calendar
+import codecs
 import datetime
 import dateutil.tz
 import hashlib
@@ -47,6 +48,8 @@
 import warnings
 import PyRSS2Gen as rss
 from collections import defaultdict, Callable
+
+import html5lib
 from logbook.more import ExceptionHandler, ColorizedStderrHandler
 from pygments.formatters import HtmlFormatter
 from zipfile import ZipFile as zipf
@@ -69,7 +72,7 @@
            'adjust_name_for_index_path', 'adjust_name_for_index_link',
            'NikolaPygmentsHTML', 'create_redirect', 'TreeNode',
            'flatten_tree_structure', 'parse_escaped_hierarchical_category_name',
-           'join_hierarchical_category_path']
+           'join_hierarchical_category_path', 'doc_tostring', 'save_doc']
 
 # Are you looking for 'generic_rss_renderer'?
 # It's defined in nikola.nikola.Nikola (the site object).
@@ -1617,3 +1620,19 @@ def escape(s):
         return s.replace('\\', '\\\\').replace('/', '\\/')
 
     return '/'.join([escape(p) for p in category_path])
+
+
+def doc_tostring(doc, **opts):
+    """Serialize a LXML doc to HTML; unicode unless opts sets an encoding."""
+    # opts are forwarded untouched to html5lib's serializer.
+    data = html5lib.serializer.serialize(doc, tree="lxml", **opts)
+    return data
+
+
+def save_doc(doc, dst, **opts):
+    """Write the serialized form of a LXML doc to the file at dst."""
+    makedirs(os.path.dirname(dst))
+    serialized = doc_tostring(doc, **opts)
+    # codecs.open encodes the serialized text as UTF-8 while writing.
+    with codecs.open(dst, 'wb+', 'utf8') as out_file:
+        out_file.write(serialized)
diff --git a/requirements.txt b/requirements.txt
@@ -12,3 +12,4 @@ logbook>=0.7.0
 blinker>=1.3
 setuptools>=5.4.1
 natsort>=3.5.2
+html5lib>=0.999