Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Exclude pages with HTML meta robots exclusion directives from sitemaps
  • Loading branch information
da2x committed May 14, 2015
1 parent e89101f commit 3cf5039
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGES.txt
Expand Up @@ -10,6 +10,7 @@ Features
Bugfixes
--------

* Exclude pages containing `<meta content="noindex" name="robots">` (or `content="none"`) from sitemaps
* new_post paths are now relative to CWD (Issue #1325)

New in v7.4.1
Expand Down
22 changes: 17 additions & 5 deletions nikola/plugins/task/sitemap/__init__.py
Expand Up @@ -163,21 +163,33 @@ def scan_locs():
continue
if not robot_fetch(path):
continue

filehead = io.open(real_path, 'r', encoding='utf8').read(1024)

if path.endswith('.html') or path.endswith('.htm'):
try:
if u'<!doctype html' not in io.open(real_path, 'r', encoding='utf8').read(1024).lower():
# ignores "html" files without doctype
# alexa-verify, google-site-verification, etc.

""" ignores "html" files without doctype """
if u'<!doctype html' not in filehead.lower():
continue

""" ignores "html" files with noindex robot directives """
robots_directives = [u'<meta content="noindex" name="robots"',
u'<meta content="none" name="robots"',
u'<meta name="robots" content="noindex"',
u'<meta name="robots" content="none"']
if any([robot_directive in filehead.lower() for robot_directive in robots_directives]):
continue

except UnicodeDecodeError:
# ignore ancient files
# most non-utf8 files are worthless anyways
continue

""" put Atom and RSS in sitemapindex[] instead of in urlset[], sitemap_path is included after it is generated """
if path.endswith('.xml') or path.endswith('.atom') or path.endswith('.rss'):
known_elm_roots = (u'<feed', u'<rss', u'<urlset')
filehead = io.open(real_path, 'r', encoding='utf8').read(512)
if any([elm_root in filehead for elm_root in known_elm_roots]) and path != sitemap_path:
if any([elm_root in filehead.lower() for elm_root in known_elm_roots]) and path != sitemap_path:
path = path.replace(os.sep, '/')
lastmod = self.get_lastmod(real_path)
loc = urljoin(base_url, base_path + path)
Expand Down

0 comments on commit 3cf5039

Please sign in to comment.