Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Merge pull request #2578 from getnikola/wordpress-import-unescape-tags
Unescaping HTML in WordPress tags and categories (fixes #2557).
  • Loading branch information
felixfontein committed Dec 17, 2016
2 parents 16f0393 + 1f273f1 commit 3c45cd9
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 5 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
Expand Up @@ -4,6 +4,8 @@ New in master
Bugfixes
--------

* WordPress importer now correctly handles HTML entities such as ``&amp;`` in tags and categories.
(Issue #2557)
* If ``CODE_COLOR_SCHEME`` is empty, don’t generate ``code.css``
(Issue #2597)
* Don’t warn about ``nikolademo`` DISQUS account when comments are
Expand Down
8 changes: 4 additions & 4 deletions nikola/plugins/command/import_wordpress.py
Expand Up @@ -339,7 +339,7 @@ def _prepare(self, channel):
# cat_id = get_text_tag(cat, '{{{0}}}term_id'.format(wordpress_namespace), None)
cat_slug = get_text_tag(cat, '{{{0}}}category_nicename'.format(wordpress_namespace), None)
cat_parent_slug = get_text_tag(cat, '{{{0}}}category_parent'.format(wordpress_namespace), None)
cat_name = get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None)
cat_name = utils.html_unescape(get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None))
cat_path = [cat_name]
if cat_parent_slug in cat_map:
cat_path = cat_map[cat_parent_slug] + cat_path
Expand Down Expand Up @@ -824,16 +824,16 @@ def _create_metadata(self, status, excerpt, tags, categories, post_name=None):
if text in self._category_paths:
cats.append(self._category_paths[text])
else:
cats.append(utils.join_hierarchical_category_path([text]))
cats.append(utils.join_hierarchical_category_path([utils.html_unescape(text)]))
other_meta['categories'] = ','.join(cats)
if len(cats) > 0:
other_meta['category'] = cats[0]
if len(cats) > 1:
LOGGER.warn(('Post "{0}" has more than one category! ' +
'Will only use the first one.').format(post_name))
tags_cats = tags
tags_cats = [utils.html_unescape(tag) for tag in tags]
else:
tags_cats = tags + categories
tags_cats = [utils.html_unescape(tag) for tag in tags + categories]
return tags_cats, other_meta

_tag_sanitize_map = {True: {}, False: {}}
Expand Down
18 changes: 17 additions & 1 deletion nikola/utils.py
Expand Up @@ -94,7 +94,7 @@
'NikolaPygmentsHTML', 'create_redirect', 'TreeNode',
'flatten_tree_structure', 'parse_escaped_hierarchical_category_name',
'join_hierarchical_category_path', 'clean_before_deployment', 'indent',
'load_data')
'load_data', 'html_unescape')

# Are you looking for 'generic_rss_renderer'?
# It's defined in nikola.nikola.Nikola (the site object).
Expand Down Expand Up @@ -1943,3 +1943,19 @@ def load_data(path):
return
with io.open(path, 'r', encoding='utf8') as inf:
return loader.load(inf)


# see http://stackoverflow.com/a/2087433
# Pick the best available unescape implementation for the running interpreter.
try:
    import html  # Python 3.4 and newer
    _html_unescape_impl = html.unescape
except (AttributeError, ImportError):
    # Either there is no ``html`` module (Python 2) or it lacks ``unescape``
    # (Python 3 before 3.4); fall back to the HTMLParser method.
    try:
        from HTMLParser import HTMLParser  # Python 2.6 and 2.7
    except ImportError:
        from html.parser import HTMLParser  # Python 3 (up to 3.4)

    def _html_unescape_impl(s):
        """Unescape ``s`` using a throwaway ``HTMLParser`` instance."""
        return HTMLParser().unescape(s)


def html_unescape(s):
    """Convert all named and numeric character references in the string s to the corresponding unicode characters.

    ``None`` is returned unchanged instead of raising, so callers that
    forward an optional value (e.g. the text of an XML tag that may be
    missing) do not need to guard the call themselves.
    """
    if s is None:
        return None
    return _html_unescape_impl(s)

0 comments on commit 3c45cd9

Please sign in to comment.