Skip to content

Commit be99f85

Browse files
committedAug 23, 2015
import_page plugin
1 parent 82b38f4 commit be99f85

File tree

4 files changed

+111
-0
lines changed

4 files changed

+111
-0
lines changed
 

‎v7/import_page/README.md

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
WIP Plugin to import arbitrary web pages.
2+
3+
Usage:
4+
5+
```
6+
nikola import_page http://en.wikipedia.org/wiki/Information_extraction
7+
```
8+
9+
That will produce an information-extraction-wikipedia-the-free-encyclopedia.html file that you can edit
10+
and move into your stories/ folder.
11+
12+
You will need something like this in conf.py:
13+
14+
```
15+
PAGES = (
16+
("stories/*.html", "stories", "story.tmpl"),
17+
)
18+
```

‎v7/import_page/import_page.plugin

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
[Core]
2+
Name = import_page
3+
Module = import_page
4+
5+
[Documentation]
6+
Author = Roberto Alsina
7+
Version = 0.1
8+
Website = http://plugins.getnikola.com/#import_page
9+
Description = Try to import arbitrary web content

‎v7/import_page/import_page.py

+82
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# Copyright © 2014, 2015 Miguel Ángel García
4+
5+
# Permission is hereby granted, free of charge, to any
6+
# person obtaining a copy of this software and associated
7+
# documentation files (the "Software"), to deal in the
8+
# Software without restriction, including without limitation
9+
# the rights to use, copy, modify, merge, publish,
10+
# distribute, sublicense, and/or sell copies of the
11+
# Software, and to permit persons to whom the Software is
12+
# furnished to do so, subject to the following conditions:
13+
#
14+
# The above copyright notice and this permission notice
15+
# shall be included in all copies or substantial portions of
16+
# the Software.
17+
#
18+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
19+
# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
20+
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
21+
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
22+
# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23+
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
24+
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25+
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26+
27+
from __future__ import unicode_literals, print_function
28+
29+
import codecs
30+
31+
import libextract.api
32+
import lxml.html
33+
import requests
34+
35+
from nikola.plugin_categories import Command
36+
from nikola import utils
37+
38+
# Module-level logger for this plugin, created with utils.STDERR_HANDLER.
LOGGER = utils.get_logger('import_page', utils.STDERR_HANDLER)
39+
40+
41+
doc_template = '''<!--
42+
.. title: {title}
43+
.. slug: {slug}
44+
-->
45+
46+
{content}
47+
'''
48+
49+
class CommandImportPage(Command):
    """Import arbitrary web pages as HTML files usable as Nikola pages."""

    name = "import_page"
    needs_config = False
    doc_usage = "[options] page_url [page_url,...]"
    doc_purpose = "import arbitrary web pages"

    def _execute(self, options, args):
        """Import every URL given on the command line.

        ``options`` is accepted for the Command interface but is not used;
        ``args`` is the list of page URLs to import.
        """
        for url in args:
            self._import_page(url)

    def _import_page(self, url):
        """Fetch ``url`` and write its main content to ``<slug>.html``.

        The page's ``<title>`` (falling back to the URL when there is none)
        provides the title and, through ``utils.slugify``, the slug and the
        output file name. Among the nodes returned by libextract, the one
        with the most text is taken as the page content.

        Failures (connection error, non-2xx status, nothing extracted) are
        logged and the URL is skipped, so the remaining URLs still get
        imported.
        """
        try:
            # Fetch the URL that was actually requested, not a fixed page.
            r = requests.get(url)
        except requests.exceptions.RequestException:
            LOGGER.error('Error fetching URL: {}'.format(url))
            return
        if not 199 < r.status_code < 300:
            LOGGER.error('Error fetching URL: {}'.format(url))
            return

        # Use the page's title
        doc = lxml.html.fromstring(r.content)
        title_node = doc.find('*//title')
        title = '' if title_node is None else title_node.text_content().strip()
        if not title:
            title = url
        # text_content() already yields text; only byte strings (possible on
        # Python 2) need decoding before slugify, which expects unicode.
        if isinstance(title, bytes):
            title = title.decode('utf-8')
        slug = utils.slugify(title)

        nodes = list(libextract.api.extract(r.content))
        if not nodes:
            LOGGER.error(
                'No content could be extracted from URL: {}'.format(url))
            return
        # Let's assume the node with more text is the good one
        node = max(nodes, key=lambda n: len(n.text_content()))

        document = doc_template.format(
            title=title,
            slug=slug,
            content=lxml.html.tostring(
                node, encoding='utf8', method='html', pretty_print=True
            ).decode('utf8'),
        )
        with codecs.open(slug + '.html', 'w', encoding='utf-8') as outf:
            outf.write(document)

‎v7/import_page/requirements.txt

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
requests
2+
libextract

0 commit comments

Comments
 (0)
Please sign in to comment.