
Commit

Fix #1842 -- convert to utf-8 already
Signed-off-by: Chris Warrick <kwpolska@gmail.com>
Kwpolska committed Jun 21, 2015
1 parent 0ed7211 commit fd96b57
Showing 2 changed files with 17 additions and 19 deletions.
1 change: 1 addition & 0 deletions CHANGES.txt
@@ -19,6 +19,7 @@ Features
 Bugfixes
 --------
 
+* Don’t crash on non-UTF-8 files during sitemap generation (Issue #1842)
 * Unnecessary rebuilds of yearly archives (Issue #1833)
 * Quietly ignore non-existent files in ``nikola check -l`` (Issue #1831)
 * Don’t rebuild all tag or category pages when changing tag/category descriptions
35 changes: 16 additions & 19 deletions nikola/plugins/task/sitemap/__init__.py
@@ -164,31 +164,28 @@ def scan_locs():
             if not robot_fetch(path):
                 continue
 
-            filehead = io.open(real_path, 'r', encoding='utf8').read(1024)
+            # read in binary mode to make ancient files work
+            fh = open(real_path, 'rb')
+            filehead = fh.read(1024)
+            fh.close()
 
             if path.endswith('.html') or path.endswith('.htm'):
-                try:
-
-                    """ ignores "html" files without doctype """
-                    if u'<!doctype html' not in filehead.lower():
-                        continue
-
-                    """ ignores "html" files with noindex robot directives """
-                    robots_directives = [u'<meta content="noindex" name="robots"',
-                                         u'<meta content="none" name="robots"',
-                                         u'<meta name="robots" content="noindex"',
-                                         u'<meta name="robots" content="none"']
-                    if any([robot_directive in filehead.lower() for robot_directive in robots_directives]):
-                        continue
+                """ ignores "html" files without doctype """
+                if b'<!doctype html' not in filehead.lower():
+                    continue
 
-                except UnicodeDecodeError:
-                    # ignore ancient files
-                    # most non-utf8 files are worthless anyways
+                """ ignores "html" files with noindex robot directives """
+                robots_directives = [b'<meta content="noindex" name="robots"',
+                                     b'<meta content="none" name="robots"',
+                                     b'<meta name="robots" content="noindex"',
+                                     b'<meta name="robots" content="none"']
+                if any([robot_directive in filehead.lower() for robot_directive in robots_directives]):
                     continue
 
-            """ put Atom and RSS in sitemapindex[] instead of in urlset[], sitemap_path is included after it is generated """
+            # put Atom and RSS in sitemapindex[] instead of in urlset[],
+            # sitemap_path is included after it is generated
             if path.endswith('.xml') or path.endswith('.atom') or path.endswith('.rss'):
-                known_elm_roots = (u'<feed', u'<rss', u'<urlset')
+                known_elm_roots = (b'<feed', b'<rss', b'<urlset')
                 if any([elm_root in filehead.lower() for elm_root in known_elm_roots]) and path != sitemap_path:
                     path = path.replace(os.sep, '/')
                     lastmod = self.get_lastmod(real_path)
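The essence of the change: the old code decoded the first kilobyte of every mapped output file as UTF-8, so one stray byte in a legacy file raised UnicodeDecodeError and aborted sitemap generation -- the old ``except UnicodeDecodeError`` branch never fired, because the ``read(1024)`` that actually raised sat outside the ``try``. The new code reads the head as raw bytes and matches bytes literals, so nothing is decoded and nothing can fail to decode. Below is a minimal sketch of the difference; ``legacy.html`` and its contents are hypothetical stand-ins for the non-UTF-8 file from Issue #1842, not part of the commit.

    # -*- coding: utf-8 -*-
    """Sketch of Issue #1842; 'legacy.html' is a made-up example file."""
    import io

    # An "ancient" HTML head saved as Latin-1 -- not valid UTF-8.
    with open('legacy.html', 'wb') as f:
        f.write(b'<!DOCTYPE html>\n<title>caf\xe9</title>\n')

    # Old code path: decode the head as UTF-8. This crashes before any
    # doctype or robots filtering gets a chance to run.
    try:
        io.open('legacy.html', 'r', encoding='utf8').read(1024)
    except UnicodeDecodeError as exc:
        print('old code: UnicodeDecodeError -- %s' % exc)

    # New code path: read raw bytes and compare bytes patterns instead.
    fh = open('legacy.html', 'rb')
    filehead = fh.read(1024)
    fh.close()
    print(b'<!doctype html' in filehead.lower())            # True
    print(any(root in filehead.lower()
              for root in (b'<feed', b'<rss', b'<urlset')))  # False

Since ``bytes.lower()`` and the ``in`` operator behave the same on Python 2 and 3 (both of which Nikola supported at the time), the bytes-based checks can also classify files the old code could never decode, rather than crashing on them.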
