import_page plugin
ralsina committed Aug 23, 2015
1 parent 82b38f4 commit be99f85
Showing 4 changed files with 111 additions and 0 deletions.
18 changes: 18 additions & 0 deletions v7/import_page/README.md
@@ -0,0 +1,18 @@
WIP plugin to import arbitrary web pages.

Usage:

```
nikola import_page http://en.wikipedia.org/wiki/Information_extraction
```
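
You can also pass several URLs in one invocation, and each page will be written to its own file (the URLs below are just placeholders):

```
nikola import_page http://example.com/one http://example.com/two
```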

That will produce an `information-extraction-wikipedia-the-free-encyclopedia.html` file that you can edit
and move into your `stories/` folder.
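
The generated file begins with a metadata comment that Nikola reads for the title and slug. A sketch of the output, assuming the Wikipedia page title above:

```
<!--
.. title: Information extraction - Wikipedia, the free encyclopedia
.. slug: information-extraction-wikipedia-the-free-encyclopedia
-->
<div>...extracted page content...</div>
```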

You will need something like this in conf.py:

```
PAGES = (
("stories/*.html", "stories", "story.tmpl"),
)
```
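
Once you are happy with the edited file, move it where that glob will pick it up, e.g.:

```
mv information-extraction-wikipedia-the-free-encyclopedia.html stories/
```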
9 changes: 9 additions & 0 deletions v7/import_page/import_page.plugin
@@ -0,0 +1,9 @@
[Core]
Name = import_page
Module = import_page

[Documentation]
Author = Roberto Alsina
Version = 0.1
Website = http://plugins.getnikola.com/#import_page
Description = Try to import arbitrary web content
82 changes: 82 additions & 0 deletions v7/import_page/import_page.py
@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-

# Copyright © 2014, 2015 Miguel Ángel García

# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the
# Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the
# Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice
# shall be included in all copies or substantial portions of
# the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from __future__ import unicode_literals, print_function

import codecs

import libextract.api
import lxml.html
import requests

from nikola.plugin_categories import Command
from nikola import utils

LOGGER = utils.get_logger('import_page', utils.STDERR_HANDLER)


doc_template = '''<!--
.. title: {title}
.. slug: {slug}
-->
{content}
'''
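# For a page titled e.g. "Example Domain" (a hypothetical title), the
# rendered document would begin:
#
# <!--
# .. title: Example Domain
# .. slug: example-domain
# -->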


class CommandImportPage(Command):
    """Import an arbitrary web page."""

    name = "import_page"
    needs_config = False
    doc_usage = "[options] page_url [page_url,...]"
    doc_purpose = "import arbitrary web pages"

    def _execute(self, options, args):
        """Import one page for each URL given on the command line."""
        for url in args:
            self._import_page(url)

    def _import_page(self, url):
        """Fetch one URL and write its main content to an HTML file."""
        r = requests.get(url)
        if 200 <= r.status_code < 300:  # Got it
            # Use the page's <title> for the title and the slug
            doc = lxml.html.fromstring(r.content)
            title = doc.find('*//title').text_content()
            slug = utils.slugify(title)
            # libextract returns candidate content nodes; assume the one
            # with the most text is the main content of the page
            nodes = list(libextract.api.extract(r.content))
            lengths = [len(n.text_content()) for n in nodes]
            node = nodes[lengths.index(max(lengths))]
            document = doc_template.format(
                title=title,
                slug=slug,
                content=lxml.html.tostring(
                    node, encoding='utf8', method='html',
                    pretty_print=True).decode('utf8'),
            )
            with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
                outf.write(document)
        else:
            LOGGER.error('Error fetching URL: {}'.format(url))
2 changes: 2 additions & 0 deletions v7/import_page/requirements.txt
@@ -0,0 +1,2 @@
requests
libextract
