Skip to content

Commit

Permalink
Unescaping HTML in WordPress tags and categories.
Browse files Browse the repository at this point in the history
  • Loading branch information
felixfontein committed Dec 4, 2016
1 parent eb1ebfb commit 8b193c7
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 5 deletions.
8 changes: 4 additions & 4 deletions nikola/plugins/command/import_wordpress.py
Expand Up @@ -339,7 +339,7 @@ def _prepare(self, channel):
# cat_id = get_text_tag(cat, '{{{0}}}term_id'.format(wordpress_namespace), None)
cat_slug = get_text_tag(cat, '{{{0}}}category_nicename'.format(wordpress_namespace), None)
cat_parent_slug = get_text_tag(cat, '{{{0}}}category_parent'.format(wordpress_namespace), None)
cat_name = get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None)
cat_name = utils.html_unescape(get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None))
cat_path = [cat_name]
if cat_parent_slug in cat_map:
cat_path = cat_map[cat_parent_slug] + cat_path
Expand Down Expand Up @@ -824,16 +824,16 @@ def _create_metadata(self, status, excerpt, tags, categories, post_name=None):
if text in self._category_paths:
cats.append(self._category_paths[text])
else:
cats.append(utils.join_hierarchical_category_path([text]))
cats.append(utils.join_hierarchical_category_path([utils.html_unescape(text)]))
other_meta['categories'] = ','.join(cats)
if len(cats) > 0:
other_meta['category'] = cats[0]
if len(cats) > 1:
LOGGER.warn(('Post "{0}" has more than one category! ' +
'Will only use the first one.').format(post_name))
tags_cats = tags
tags_cats = [utils.html_unescape(tag) for tag in tags]
else:
tags_cats = tags + categories
tags_cats = [utils.html_unescape(tag) for tag in tags + categories]
return tags_cats, other_meta

_tag_sanitize_map = {True: {}, False: {}}
Expand Down
19 changes: 18 additions & 1 deletion nikola/utils.py
Expand Up @@ -94,7 +94,7 @@
'NikolaPygmentsHTML', 'create_redirect', 'TreeNode',
'flatten_tree_structure', 'parse_escaped_hierarchical_category_name',
'join_hierarchical_category_path', 'clean_before_deployment', 'indent',
'load_data')
'load_data', 'html_unescape')

# Are you looking for 'generic_rss_renderer'?
# It's defined in nikola.nikola.Nikola (the site object).
Expand Down Expand Up @@ -1942,3 +1942,20 @@ def load_data(path):
return
with io.open(path, 'r', encoding='utf8') as inf:
return loader.load(inf)


# http://stackoverflow.com/a/2087433
# Pick the HTML unescaping backend once, at import time.  The version test is a
# tuple comparison so that every interpreter from 3.4 onwards takes the
# html.unescape path (HTMLParser.unescape no longer exists on modern Pythons).
if sys.version_info >= (3, 4):
    import html  # Python 3.4 and newer

    _html_unescape_impl = html.unescape
else:
    try:
        from HTMLParser import HTMLParser  # Python 2.6 and 2.7
    except ImportError:
        from html.parser import HTMLParser  # Python 3.0 to 3.3

    def _html_unescape_impl(s):
        """Unescape s with the legacy HTMLParser.unescape method."""
        return HTMLParser().unescape(s)


def html_unescape(s):
    """Convert all named and numeric character references in the string s to the corresponding unicode characters.

    ``None`` is returned unchanged, so callers holding an optional text value
    (for example a lookup that fell back to a ``None`` default) do not need to
    guard the call themselves.
    """
    if s is None:
        return None
    return _html_unescape_impl(s)

0 comments on commit 8b193c7

Please sign in to comment.