|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | + |
| 3 | +# Copyright © 2014, 2015 Miguel Ángel García |
| 4 | + |
| 5 | +# Permission is hereby granted, free of charge, to any |
| 6 | +# person obtaining a copy of this software and associated |
| 7 | +# documentation files (the "Software"), to deal in the |
| 8 | +# Software without restriction, including without limitation |
| 9 | +# the rights to use, copy, modify, merge, publish, |
| 10 | +# distribute, sublicense, and/or sell copies of the |
| 11 | +# Software, and to permit persons to whom the Software is |
| 12 | +# furnished to do so, subject to the following conditions: |
| 13 | +# |
| 14 | +# The above copyright notice and this permission notice |
| 15 | +# shall be included in all copies or substantial portions of |
| 16 | +# the Software. |
| 17 | +# |
| 18 | +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY |
| 19 | +# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE |
| 20 | +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR |
| 21 | +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS |
| 22 | +# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR |
| 23 | +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR |
| 24 | +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
| 25 | +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 26 | + |
| 27 | +from __future__ import unicode_literals, print_function |
| 28 | + |
| 29 | +import codecs |
| 30 | + |
| 31 | +import libextract.api |
| 32 | +import lxml.html |
| 33 | +import requests |
| 34 | + |
| 35 | +from nikola.plugin_categories import Command |
| 36 | +from nikola import utils |
| 37 | + |
# Logger for this plugin, obtained from nikola.utils with the STDERR handler.
LOGGER = utils.get_logger('import_page', utils.STDERR_HANDLER)


# Skeleton of every generated document: an HTML comment carrying the
# ``.. title:`` and ``.. slug:`` lines, followed by the extracted content.
# It is filled in with str.format() using the keys ``title``, ``slug`` and
# ``content``.
doc_template = '''<!--
.. title: {title}
.. slug: {slug}
-->

{content}
'''
| 48 | + |
class CommandImportPage(Command):
    """Import arbitrary web pages as HTML documents.

    Each URL given on the command line is downloaded, its main content
    node is selected with libextract, and the result is written to
    ``<slug>.html`` (relative to the current directory) using
    ``doc_template``.
    """

    name = "import_page"
    needs_config = False
    doc_usage = "[options] page_url [page_url,...]"
    doc_purpose = "import arbitrary web pages"

    def _execute(self, options, args):
        """Import every page whose URL is listed in ``args``.

        ``options`` is part of the command interface but is not used here.
        """
        for url in args:
            self._import_page(url)

    def _import_page(self, url):
        """Download ``url`` and save its main content as ``<slug>.html``.

        The title is taken from the page's ``<title>`` element (falling
        back to the URL itself when the page has none) and the slug is
        derived from that title. Any failure is logged and the page is
        skipped, so one bad URL does not abort the remaining imports.
        """
        try:
            # Fetch the page that was actually requested; network-level
            # failures are reported the same way as HTTP errors.
            r = requests.get(url)
        except requests.exceptions.RequestException:
            LOGGER.error('Error fetching URL: {}'.format(url))
            return
        if not 199 < r.status_code < 300:
            LOGGER.error('Error fetching URL: {}'.format(url))
            return

        doc = lxml.html.fromstring(r.content)

        # Use the page's title, if it has one.
        title_node = doc.find('*//title')
        if title_node is None:
            LOGGER.warning('No title found in URL: {}'.format(url))
            title = url
        else:
            title = title_node.text_content()
            # text_content() normally returns text already; only a byte
            # string (plain ASCII titles on Python 2) needs decoding.
            if isinstance(title, bytes):
                title = title.decode('utf-8')
        slug = utils.slugify(title)

        nodes = list(libextract.api.extract(r.content))
        if not nodes:
            LOGGER.error('No content found in URL: {}'.format(url))
            return
        # Let's assume the node with more text is the good one
        # (max() keeps the first candidate on ties).
        node = max(nodes, key=lambda n: len(n.text_content()))

        content = lxml.html.tostring(
            node, encoding='utf8', method='html', pretty_print=True
        ).decode('utf8')
        document = doc_template.format(
            title=title,
            slug=slug,
            content=content,
        )
        with codecs.open(slug + '.html', 'w', encoding='utf-8') as outf:
            outf.write(document)
0 commit comments