Skip to content

Commit 3c45cd9

Browse files
authored Dec 17, 2016
Merge pull request #2578 from getnikola/wordpress-import-unescape-tags
Unescaping HTML in WordPress tags and categories (fixes #2557).
2 parents 16f0393 + 1f273f1 commit 3c45cd9

File tree

3 files changed

+23
-5
lines changed

3 files changed

+23
-5
lines changed
 

‎CHANGES.txt

+2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ New in master
44
Bugfixes
55
--------
66

7+
* WordPress importer now correctly handles HTML entities (&amp; etc.) in tags.
8+
(Issue #2557)
79
* If ``CODE_COLOR_SCHEME`` is empty, don’t generate ``code.css``
810
(Issue #2597)
911
* Don’t warn about ``nikolademo`` DISQUS account when comments are

‎nikola/plugins/command/import_wordpress.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,7 @@ def _prepare(self, channel):
339339
# cat_id = get_text_tag(cat, '{{{0}}}term_id'.format(wordpress_namespace), None)
340340
cat_slug = get_text_tag(cat, '{{{0}}}category_nicename'.format(wordpress_namespace), None)
341341
cat_parent_slug = get_text_tag(cat, '{{{0}}}category_parent'.format(wordpress_namespace), None)
342-
cat_name = get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None)
342+
cat_name = utils.html_unescape(get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None))
343343
cat_path = [cat_name]
344344
if cat_parent_slug in cat_map:
345345
cat_path = cat_map[cat_parent_slug] + cat_path
@@ -824,16 +824,16 @@ def _create_metadata(self, status, excerpt, tags, categories, post_name=None):
824824
if text in self._category_paths:
825825
cats.append(self._category_paths[text])
826826
else:
827-
cats.append(utils.join_hierarchical_category_path([text]))
827+
cats.append(utils.join_hierarchical_category_path([utils.html_unescape(text)]))
828828
other_meta['categories'] = ','.join(cats)
829829
if len(cats) > 0:
830830
other_meta['category'] = cats[0]
831831
if len(cats) > 1:
832832
LOGGER.warn(('Post "{0}" has more than one category! ' +
833833
'Will only use the first one.').format(post_name))
834-
tags_cats = tags
834+
tags_cats = [utils.html_unescape(tag) for tag in tags]
835835
else:
836-
tags_cats = tags + categories
836+
tags_cats = [utils.html_unescape(tag) for tag in tags + categories]
837837
return tags_cats, other_meta
838838

839839
_tag_sanitize_map = {True: {}, False: {}}

‎nikola/utils.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@
9494
'NikolaPygmentsHTML', 'create_redirect', 'TreeNode',
9595
'flatten_tree_structure', 'parse_escaped_hierarchical_category_name',
9696
'join_hierarchical_category_path', 'clean_before_deployment', 'indent',
97-
'load_data')
97+
'load_data', 'html_unescape')
9898

9999
# Are you looking for 'generic_rss_renderer'?
100100
# It's defined in nikola.nikola.Nikola (the site object).
@@ -1943,3 +1943,19 @@ def load_data(path):
19431943
return
19441944
with io.open(path, 'r', encoding='utf8') as inf:
19451945
return loader.load(inf)
1946+
1947+
1948+
# see http://stackoverflow.com/a/2087433
# Pick the best available entity decoder for the running interpreter and
# expose it through a single ``html_unescape`` function.
try:
    # Python 3.4 and newer ship ``html.unescape``. On older interpreters the
    # ``html`` module is either missing or lacks ``unescape``; both cases
    # surface here as ImportError.
    from html import unescape as _unescape
except ImportError:
    try:
        from HTMLParser import HTMLParser  # Python 2.6 and 2.7
    except ImportError:
        from html.parser import HTMLParser  # Python 3 (up to 3.4)

    def _unescape(s):
        """Decode character references using the legacy ``HTMLParser.unescape`` method."""
        return HTMLParser().unescape(s)


def html_unescape(s):
    """Convert all named and numeric character references in the string s to the corresponding unicode characters.

    ``None`` is returned unchanged instead of raising ``TypeError``, so a
    caller may pass an optional value (for example the result of a lookup
    whose default is ``None``) without guarding it first.
    """
    if s is None:
        return None
    return _unescape(s)

0 commit comments

Comments
 (0)
Please sign in to comment.