Use encodelink() everywhere

getnikola · Sep 8, 2015 · 33db32f · 33db32f
1 parent dc1b848
commit 33db32f
Show file tree

Hide file tree

Showing 5 changed files with 50 additions and 22 deletions.
diff --git a/nikola/nikola.py b/nikola/nikola.py
@@ -1250,7 +1250,7 @@ def generic_rss_renderer(self, lang, title, link, description, timeline, output_
         """Take all necessary data, and render a RSS feed in output_path."""
         rss_obj = utils.ExtendedRSS2(
             title=title,
-            link=link,
+            link=utils.encodelink(link),
             description=description,
             lastBuildDate=datetime.datetime.utcnow(),
             generator='https://getnikola.com/',
@@ -1445,7 +1445,9 @@ def register_path_handler(self, kind, f):
 
     def link(self, *args):
         """Create a link."""
-        return self.path(*args, is_link=True)
+        url = self.path(*args, is_link=True)
+        url = utils.encodelink(url)
+        return url
 
     def abs_link(self, dst, protocol_relative=False):
         """Get an absolute link."""
@@ -1457,6 +1459,7 @@ def abs_link(self, dst, protocol_relative=False):
         url = urlparse(dst).geturl()
         if protocol_relative:
             url = url.split(":", 1)[1]
+        url = utils.encodelink(url)
         return url
 
     def rel_link(self, src, dst):
@@ -1471,7 +1474,7 @@ def rel_link(self, src, dst):
         parsed_src = urlsplit(src)
         parsed_dst = urlsplit(dst)
         if parsed_src[:2] != parsed_dst[:2]:
-            return dst
+            return utils.encodelink(dst)
         # Now both paths are on the same site and absolute
         src_elems = parsed_src.path.split('/')[1:]
         dst_elems = parsed_dst.path.split('/')[1:]
@@ -1482,7 +1485,9 @@ def rel_link(self, src, dst):
         else:
             i += 1
         # Now i is the longest common prefix
-        return '/'.join(['..'] * (len(src_elems) - i - 1) + dst_elems[i:])
+        url = '/'.join(['..'] * (len(src_elems) - i - 1) + dst_elems[i:])
+        url = utils.encodelink(url)
+        return url
 
     def file_exists(self, path, not_empty=False):
         """Check if the file exists. If not_empty is True, it also must not be empty."""
@@ -1633,7 +1638,7 @@ def scan_posts(self, really=False, ignore_quit=False, quiet=False):
                             utils.LOGGER.error('Tag {0} is used in: {1}'.format(other_tag, ', '.join([p.source_path for p in self.posts_per_tag[other_tag]])))
                             quit = True
                     else:
-                        slugged_tags.add(utils.slugify(tag, force=True))
+                        slugged_tags.add(utils.slugify(tag))
                     self.posts_per_tag[tag].append(post)
                 for lang in self.config['TRANSLATIONS'].keys():
                     self.tags_per_language[lang].extend(post.tags_for_language(lang))
@@ -1790,7 +1795,7 @@ def atom_link(link_rel, link_type, link_href):
             link = lxml.etree.Element("link")
             link.set("rel", link_rel)
             link.set("type", link_type)
-            link.set("href", link_href)
+            link.set("href", utils.encodelink(link_href))
             return link
 
         deps = []
@@ -1826,7 +1831,7 @@ def atom_link(link_rel, link_type, link_href):
         feed_root = lxml.etree.Element("feed", nsmap=nslist)
         feed_root.addprevious(lxml.etree.ProcessingInstruction(
             "xml-stylesheet",
-            'href="' + feed_xsl_link + '" type="text/xsl media="all"'))
+            'href="' + utils.encodelink(feed_xsl_link) + '" type="text/xsl media="all"'))
         feed_root.set("{http://www.w3.org/XML/1998/namespace}lang", lang)
         feed_root.set("xmlns", "http://www.w3.org/2005/Atom")
         feed_title = lxml.etree.SubElement(feed_root, "title")

diff --git a/nikola/plugins/command/check.py b/nikola/plugins/command/check.py
@@ -212,7 +212,7 @@ def analyze(self, fname, find_sources=False, check_remote=False):
                 # Quietly ignore files that don’t exist; use `nikola check -f` instead (Issue #1831)
                 return False
 
-            if '.html' == fname[-5:]:  # DISABLED
+            if '.html' == fname[-5:]:
                 d = lxml.html.fromstring(open(filename, 'rb').read())
                 extra_objs = lxml.html.fromstring('<html/>')
 
@@ -340,6 +340,7 @@ def analyze(self, fname, find_sources=False, check_remote=False):
 
                 if any(re.search(x, target_filename) for x in self.whitelist):
                     continue
+
                 elif target_filename not in self.existing_targets:
                     if os.path.exists(target_filename):
                         self.logger.notice("Good link {0} => {1}".format(target, target_filename))

diff --git a/nikola/plugins/task/sitemap/__init__.py b/nikola/plugins/task/sitemap/__init__.py
@@ -40,7 +40,7 @@
     import urllib.robotparser as robotparser  # NOQA
 
 from nikola.plugin_categories import LateTask
-from nikola.utils import config_changed, apply_filters
+from nikola.utils import apply_filters, config_changed, encodelink
 
 
 urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
@@ -158,10 +158,10 @@ def scan_locs():
                     if post:
                         for lang in kw['translations']:
                             alt_url = post.permalink(lang=lang, absolute=True)
-                            if loc == alt_url:
+                            if encodelink(loc) == alt_url:
                                 continue
                             alternates.append(alternates_format.format(lang, alt_url))
-                    urlset[loc] = loc_format.format(loc, lastmod, ''.join(alternates))
+                    urlset[loc] = loc_format.format(encodelink(loc), lastmod, ''.join(alternates))
                 for fname in files:
                     if kw['strip_indexes'] and fname == kw['index_file']:
                         continue  # We already mapped the folder
@@ -201,7 +201,7 @@ def scan_locs():
                                 path = path.replace(os.sep, '/')
                                 lastmod = self.get_lastmod(real_path)
                                 loc = urljoin(base_url, base_path + path)
-                                sitemapindex[loc] = sitemap_format.format(loc, lastmod)
+                                sitemapindex[loc] = sitemap_format.format(encodelink(loc), lastmod)
                                 continue
                             else:
                                 continue  # ignores all XML files except those presumed to be RSS
@@ -215,10 +215,10 @@ def scan_locs():
                         if post:
                             for lang in kw['translations']:
                                 alt_url = post.permalink(lang=lang, absolute=True)
-                                if loc == alt_url:
+                                if encodelink(loc) == alt_url:
                                     continue
                                 alternates.append(alternates_format.format(lang, alt_url))
-                        urlset[loc] = loc_format.format(loc, lastmod, '\n'.join(alternates))
+                        urlset[loc] = loc_format.format(encodelink(loc), lastmod, '\n'.join(alternates))
 
         def robot_fetch(path):
             """Check if robots can fetch a file."""

diff --git a/nikola/post.py b/nikola/post.py
@@ -707,10 +707,11 @@ def remaining_paragraph_count(self):
     def source_link(self, lang=None):
         """Return absolute link to the post's source."""
         ext = self.source_ext(True)
-        return "/" + self.destination_path(
-            lang=lang,
-            extension=ext,
-            sep='/')
+
+
+        link = "/" + self.destination_path(lang=lang, extension=ext, sep='/')
+        link = utils.encodelink(link)
+        return link
 
     def destination_path(self, lang=None, extension='.html', sep=os.sep):
         """Destination path for this post, relative to output/.
@@ -747,6 +748,7 @@ def section_link(self, lang=None):
             link = urljoin('/' + slug + '/', self.index_file)
         else:
             link = '/' + slug + '/'
+        link = utils.encodelink(link)
         return link
 
     def section_name(self, lang=None):
@@ -803,6 +805,7 @@ def permalink(self, lang=None, absolute=False, extension='.html', query=None):
             link = link[:-index_len]
         if query:
             link = link + "?" + query
+        link = utils.encodelink(link)
         return link
 
     @property

diff --git a/nikola/utils.py b/nikola/utils.py
@@ -45,9 +45,17 @@
 import dateutil.parser
 import dateutil.tz
 import logbook
+try:
+    from urllib import quote as urlquote
+    from urllib import unquote as urlunquote
+    from urlparse import urlparse, urlunparse
+except ImportError:
+    from urllib.parse import quote as urlquote  # NOQA
+    from urllib.parse import unquote as urlunquote  # NOQA
+    from urllib.parse import urlparse, urlunparse  # NOQA
 import warnings
 import PyRSS2Gen as rss
-from collections import defaultdict, Callable
+from collections import defaultdict, Callable, OrderedDict
 from logbook.compat import redirect_logging
 from logbook.more import ExceptionHandler, ColorizedStderrHandler
 from pygments.formatters import HtmlFormatter
@@ -725,7 +733,7 @@ def remove_file(source):
     elif os.path.isfile(source) or os.path.islink(source):
         os.remove(source)
 
-# slugify is copied from
+# slugify is adopted from
 # http://code.activestate.com/recipes/
 # 577257-slugify-make-a-string-usable-in-a-url-or-filename/
 _slugify_strip_re = re.compile(r'[^+\w\s-]')
@@ -752,7 +760,7 @@ def slugify(value, force=False):
         # This is the standard state of slugify, which actually does some work.
         # It is the preferred style, especially for Western languages.
         value = unicode_str(unidecode(value))
-        value = _slugify_strip_re.sub('', value, re.UNICODE).strip().lower()
+        value = _slugify_strip_re.sub('', value, re.UNICODE).strip()
         return _slugify_hyphenate_re.sub('-', value, re.UNICODE)
     else:
         # This is the “disarmed” state of slugify, which lets the user
@@ -767,7 +775,7 @@ def slugify(value, force=False):
 
         for c in rc:
             value = value.replace(c, '-')
-        return value
+        return value.lower()
 
 
 def unslugify(value, discard_numbers=True):
@@ -783,6 +791,17 @@ def unslugify(value, discard_numbers=True):
     return value
 
 
+def encodelink(iri):
+    """Given an encoded or unencoded link string, return an encoded string suitable for use as a link in HTML and XML."""
+    link = OrderedDict(urlparse(iri).__dict__)
+    link['path'] = urlquote(urlunquote(link['path']))
+    try:
+        link['netloc'] = link['netloc'].encode('utf-8').decode('idna').encode('idna').decode('utf-8')
+    except UnicodeDecodeError:
+        link['netloc'] = link['netloc'].encode('idna').decode('utf-8')
+    encoded_link = urlunparse(link.values())
+    return encoded_link
+
 # A very slightly safer version of zip.extractall that works on
 # python < 2.6