Commit ef6af2b

Merge pull request #2037 from getnikola/encodelinks

Use encodelink() everywhere

da2x committed Sep 10, 2015
2 parents 078a8b8 + f15f871, commit ef6af2b

Showing 5 changed files with 53 additions and 23 deletions.
19 changes: 12 additions & 7 deletions nikola/nikola.py
@@ -1252,7 +1252,7 @@ def generic_rss_renderer(self, lang, title, link, description, timeline, output_
         """Take all necessary data, and render a RSS feed in output_path."""
         rss_obj = utils.ExtendedRSS2(
             title=title,
-            link=link,
+            link=utils.encodelink(link),
             description=description,
             lastBuildDate=datetime.datetime.utcnow(),
             generator='https://getnikola.com/',
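
Note: feed consumers expect the channel link to be a percent-encoded URI rather than a raw IRI, which is why the link is now passed through utils.encodelink(). A minimal standalone sketch (Python 3, not part of this commit) of the path handling encodelink() applies:

    from urllib.parse import quote, unquote

    def encode_path(path):
        # Unquote first so already-encoded input is not double-encoded,
        # then re-quote; this mirrors the path step inside encodelink().
        return quote(unquote(path))

    print(encode_path('/blog/føø bar/'))              # /blog/f%C3%B8%C3%B8%20bar/
    print(encode_path('/blog/f%C3%B8%C3%B8%20bar/'))  # same output: idempotent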
@@ -1447,7 +1449,9 @@ def register_path_handler(self, kind, f):
 
     def link(self, *args):
         """Create a link."""
-        return self.path(*args, is_link=True)
+        url = self.path(*args, is_link=True)
+        url = utils.encodelink(url)
+        return url
 
     def abs_link(self, dst, protocol_relative=False):
         """Get an absolute link."""
@@ -1459,6 +1461,7 @@ def abs_link(self, dst, protocol_relative=False):
         url = urlparse(dst).geturl()
         if protocol_relative:
             url = url.split(":", 1)[1]
+        url = utils.encodelink(url)
         return url
 
     def rel_link(self, src, dst):
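
Note: both link() and abs_link() now funnel their result through utils.encodelink(), so whatever templates receive is always a valid, percent-encoded URL. A rough Python 3 sketch of the abs_link() flow (base URL and destination assumed):

    from urllib.parse import quote, urljoin, urlparse

    base_url = 'https://example.com/'   # assumed BASE_URL setting
    dst = '/tägs/ö/'
    url = urlparse(urljoin(base_url, dst)).geturl()
    url = quote(url, safe='/:')         # stand-in for utils.encodelink(url)
    print(url)                          # https://example.com/t%C3%A4gs/%C3%B6/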
@@ -1473,7 +1476,7 @@ def rel_link(self, src, dst):
         parsed_src = urlsplit(src)
         parsed_dst = urlsplit(dst)
         if parsed_src[:2] != parsed_dst[:2]:
-            return dst
+            return utils.encodelink(dst)
         # Now both paths are on the same site and absolute
         src_elems = parsed_src.path.split('/')[1:]
         dst_elems = parsed_dst.path.split('/')[1:]
@@ -1484,7 +1487,9 @@ def rel_link(self, src, dst):
         else:
             i += 1
         # Now i is the longest common prefix
-        return '/'.join(['..'] * (len(src_elems) - i - 1) + dst_elems[i:])
+        url = '/'.join(['..'] * (len(src_elems) - i - 1) + dst_elems[i:])
+        url = utils.encodelink(url)
+        return url
 
     def file_exists(self, path, not_empty=False):
         """Check if the file exists. If not_empty is True, it also must not be empty."""
@@ -1635,7 +1640,7 @@ def scan_posts(self, really=False, ignore_quit=False, quiet=False):
                     utils.LOGGER.error('Tag {0} is used in: {1}'.format(other_tag, ', '.join([p.source_path for p in self.posts_per_tag[other_tag]])))
                     quit = True
                 else:
-                    slugged_tags.add(utils.slugify(tag, force=True))
+                    slugged_tags.add(utils.slugify(tag))
                 self.posts_per_tag[tag].append(post)
             for lang in self.config['TRANSLATIONS'].keys():
                 self.tags_per_language[lang].extend(post.tags_for_language(lang))
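
Note: the error above fires because two distinct tag names can reduce to the same slug. A rough, self-contained approximation of what slugify does (the real implementation lives in nikola/utils.py):

    import re
    from unidecode import unidecode

    def approx_slugify(value):
        # Transliterate to ASCII, lowercase, drop punctuation,
        # collapse runs of whitespace/hyphens into single hyphens.
        value = unidecode(value).lower()
        value = re.sub(r'[^\w\s-]', '', value).strip()
        return re.sub(r'[-\s]+', '-', value)

    print(approx_slugify('Tag #1'))  # tag-1
    print(approx_slugify('tag 1'))   # tag-1 -- collides with the tag above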
@@ -1792,7 +1797,7 @@ def atom_link(link_rel, link_type, link_href):
             link = lxml.etree.Element("link")
             link.set("rel", link_rel)
             link.set("type", link_type)
-            link.set("href", link_href)
+            link.set("href", utils.encodelink(link_href))
             return link
 
         deps = []
@@ -1828,7 +1833,7 @@ def atom_link(link_rel, link_type, link_href):
         feed_root = lxml.etree.Element("feed", nsmap=nslist)
         feed_root.addprevious(lxml.etree.ProcessingInstruction(
             "xml-stylesheet",
-            'href="' + feed_xsl_link + '" type="text/xsl media="all"'))
+            'href="' + utils.encodelink(feed_xsl_link) + '" type="text/xsl media="all"'))
         feed_root.set("{http://www.w3.org/XML/1998/namespace}lang", lang)
         feed_root.set("xmlns", "http://www.w3.org/2005/Atom")
         feed_title = lxml.etree.SubElement(feed_root, "title")
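
Note: the stylesheet href ends up inside an xml-stylesheet processing instruction, so it must already be a valid URI when the PI text is assembled. A standalone lxml sketch (simplified to well-formed pseudo-attributes; the xsl path is an assumed value):

    import lxml.etree

    feed_xsl_link = '/assets/xml/atom.xsl'  # assumed path
    feed_root = lxml.etree.Element('feed')
    feed_root.addprevious(lxml.etree.ProcessingInstruction(
        'xml-stylesheet', 'href="' + feed_xsl_link + '" type="text/xsl" media="all"'))
    print(lxml.etree.tostring(feed_root.getroottree(),
                              xml_declaration=True, encoding='UTF-8').decode('utf-8'))
    # The serialized tree starts with the declaration, then the PI:
    # <?xml-stylesheet href="/assets/xml/atom.xsl" type="text/xsl" media="all"?>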
10 changes: 6 additions & 4 deletions nikola/plugins/command/check.py
@@ -212,7 +212,7 @@ def analyze(self, fname, find_sources=False, check_remote=False):
                # Quietly ignore files that don’t exist; use `nikola check -f` instead (Issue #1831)
                return False
 
-            if '.html' == fname[-5:]:  # DISABLED
+            if '.html' == fname[-5:]:
                d = lxml.html.fromstring(open(filename, 'rb').read())
                extra_objs = lxml.html.fromstring('<html/>')
@@ -323,8 +323,9 @@ def analyze(self, fname, find_sources=False, check_remote=False):
                    target_filename = os.path.abspath(
                        os.path.join(self.site.config['OUTPUT_FOLDER'], unquote(target.lstrip('/'))))
                else:  # Relative path
+                    unquoted_target = unquote(target).encode('utf-8') if sys.version_info.major >= 3 else unquote(target).decode('utf-8')
                    target_filename = os.path.abspath(
-                        os.path.join(os.path.dirname(filename), unquote(target)))
+                        os.path.join(os.path.dirname(filename).encode('utf-8'), unquoted_target))
 
            elif url_type in ('full_path', 'absolute'):
                if url_type == 'absolute':
@@ -340,9 +341,10 @@ def analyze(self, fname, find_sources=False, check_remote=False):
 
                if any(re.search(x, target_filename) for x in self.whitelist):
                    continue
+
                elif target_filename not in self.existing_targets:
                    if os.path.exists(target_filename):
-                        self.logger.notice("Good link {0} => {1}".format(target, target_filename))
+                        self.logger.notice(u"Good link {0} => {1}".format(target, target_filename))
                        self.existing_targets.add(target_filename)
                    else:
                        rv = True
@@ -352,7 +354,7 @@ def analyze(self, fname, find_sources=False, check_remote=False):
                    self.logger.warn("\n".join(deps[filename]))
                    self.logger.warn("===============================\n")
        except Exception as exc:
-            self.logger.error("Error with: {0} {1}".format(filename, exc))
+            self.logger.error(u"Error with: {0} {1}".format(filename, exc))
        return rv
 
    def scan_links(self, find_sources=False, check_remote=False):
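
Note: link targets scraped from HTML are percent-encoded, so they must be unquoted before being joined with on-disk paths; the extra encode()/decode() handling in the hunk above keeps Python 2 and 3 byte/text semantics aligned. A Python 3 sketch of the underlying mismatch (file names assumed):

    import os
    from urllib.parse import unquote

    target = 'f%C3%B8%C3%B8.html'                   # href as it appears in the page
    print(os.path.join('output', target))           # output/f%C3%B8%C3%B8.html (no such file)
    print(os.path.join('output', unquote(target)))  # output/føø.html (the real file)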
12 changes: 6 additions & 6 deletions nikola/plugins/task/sitemap/__init__.py
@@ -40,7 +40,7 @@
     import urllib.robotparser as robotparser  # NOQA
 
 from nikola.plugin_categories import LateTask
-from nikola.utils import config_changed, apply_filters
+from nikola.utils import apply_filters, config_changed, encodelink
 
 
 urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
@@ -158,10 +158,10 @@ def scan_locs():
                    if post:
                        for lang in kw['translations']:
                            alt_url = post.permalink(lang=lang, absolute=True)
-                            if loc == alt_url:
+                            if encodelink(loc) == alt_url:
                                continue
                            alternates.append(alternates_format.format(lang, alt_url))
-                    urlset[loc] = loc_format.format(loc, lastmod, ''.join(alternates))
+                    urlset[loc] = loc_format.format(encodelink(loc), lastmod, ''.join(alternates))
                    for fname in files:
                        if kw['strip_indexes'] and fname == kw['index_file']:
                            continue  # We already mapped the folder
@@ -201,7 +201,7 @@ def scan_locs():
                        path = path.replace(os.sep, '/')
                        lastmod = self.get_lastmod(real_path)
                        loc = urljoin(base_url, base_path + path)
-                        sitemapindex[loc] = sitemap_format.format(loc, lastmod)
+                        sitemapindex[loc] = sitemap_format.format(encodelink(loc), lastmod)
                        continue
                    else:
                        continue  # ignores all XML files except those presumed to be RSS
@@ -215,10 +215,10 @@ def scan_locs():
                if post:
                    for lang in kw['translations']:
                        alt_url = post.permalink(lang=lang, absolute=True)
-                        if loc == alt_url:
+                        if encodelink(loc) == alt_url:
                            continue
                        alternates.append(alternates_format.format(lang, alt_url))
-                urlset[loc] = loc_format.format(loc, lastmod, '\n'.join(alternates))
+                urlset[loc] = loc_format.format(encodelink(loc), lastmod, '\n'.join(alternates))
 
        def robot_fetch(path):
            """Check if robots can fetch a file."""
9 changes: 5 additions & 4 deletions nikola/post.py
@@ -711,10 +711,9 @@ def remaining_paragraph_count(self):
     def source_link(self, lang=None):
         """Return absolute link to the post's source."""
         ext = self.source_ext(True)
-        return "/" + self.destination_path(
-            lang=lang,
-            extension=ext,
-            sep='/')
+        link = "/" + self.destination_path(lang=lang, extension=ext, sep='/')
+        link = utils.encodelink(link)
+        return link
 
     def destination_path(self, lang=None, extension='.html', sep=os.sep):
         """Destination path for this post, relative to output/.
@@ -751,6 +750,7 @@ def section_link(self, lang=None):
             link = urljoin('/' + slug + '/', self.index_file)
         else:
             link = '/' + slug + '/'
+        link = utils.encodelink(link)
         return link
 
     def section_name(self, lang=None):
@@ -807,6 +807,7 @@ def permalink(self, lang=None, absolute=False, extension='.html', query=None):
             link = link[:-index_len]
         if query:
             link = link + "?" + query
+        link = utils.encodelink(link)
         return link
 
     @property
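
Note: encoding happens after the query string is appended, and encodelink() percent-encodes only the path component, so the query survives intact. A rough Python 3 sketch of that final step (values assumed):

    from urllib.parse import quote, urlsplit, urlunsplit

    link = '/posts/økonomi/'      # assumed permalink path
    query = 'comments=1'
    if query:
        link = link + '?' + query
    # Stand-in for the new utils.encodelink(link) step: quote the path only.
    parts = urlsplit(link)
    print(urlunsplit(parts._replace(path=quote(parts.path))))
    # /posts/%C3%B8konomi/?comments=1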
26 changes: 24 additions & 2 deletions nikola/utils.py
@@ -45,15 +45,24 @@
 import dateutil.parser
 import dateutil.tz
 import logbook
+try:
+    from urllib import quote as urlquote
+    from urllib import unquote as urlunquote
+    from urlparse import urlparse, urlunparse
+except ImportError:
+    from urllib.parse import quote as urlquote  # NOQA
+    from urllib.parse import unquote as urlunquote  # NOQA
+    from urllib.parse import urlparse, urlunparse  # NOQA
 import warnings
 import PyRSS2Gen as rss
-from collections import defaultdict, Callable
+from collections import defaultdict, Callable, OrderedDict
 from logbook.compat import redirect_logging
 from logbook.more import ExceptionHandler, ColorizedStderrHandler
 from pygments.formatters import HtmlFormatter
 from zipfile import ZipFile as zipf
 from doit import tools
 from unidecode import unidecode
+from unicodedata import normalize as unicodenormalize
 from pkg_resources import resource_filename
 from doit.cmdparse import CmdParse
 
@@ -725,7 +734,7 @@ def remove_file(source):
     elif os.path.isfile(source) or os.path.islink(source):
         os.remove(source)
 
-# slugify is copied from
+# slugify is adopted from
 # http://code.activestate.com/recipes/
 # 577257-slugify-make-a-string-usable-in-a-url-or-filename/
 _slugify_strip_re = re.compile(r'[^+\w\s-]')
@@ -783,9 +792,22 @@ def unslugify(value, discard_numbers=True):
     return value
 
 
+def encodelink(iri):
+    """Given an encoded or unencoded link string, return an encoded string suitable for use as a link in HTML and XML."""
+    iri = unicodenormalize('NFC', iri)
+    link = OrderedDict(urlparse(iri)._asdict())
+    link['path'] = urlquote(urlunquote(link['path']).encode('utf-8'))
+    try:
+        link['netloc'] = link['netloc'].encode('utf-8').decode('idna').encode('idna').decode('utf-8')
+    except UnicodeDecodeError:
+        link['netloc'] = link['netloc'].encode('idna').decode('utf-8')
+    encoded_link = urlunparse(link.values())
+    return encoded_link
+
 # A very slightly safer version of zip.extractall that works on
 # python < 2.6
+
 
 class UnsafeZipException(Exception):
 
     """Exception for unsafe zip files."""
