Skip to content

Commit

Permalink
papermonk scraper for plosone.org
Browse files Browse the repository at this point in the history
This is a first version of a downloader module for papermonk, meant to
serve as an example for implementing other modules.

At the moment, the actual paper downloading code is not implemented
because it's silly to stuff everything into the "download" function,
maybe this concept should be split up into multiple methods instead.

version bump to: 0.0.1

(initial commit)
  • Loading branch information
kanzure committed Aug 8, 2013
0 parents commit bd4774a
Show file tree
Hide file tree
Showing 7 changed files with 163 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -0,0 +1,8 @@
# don't track other node modules
node_modules/

# also ignore npm errors
npm-debug.log

# ignore temporary files leftover from vim
.*.sw*
1 change: 1 addition & 0 deletions AUTHORS
@@ -0,0 +1 @@
Bryan Bishop <kanzure@gmail.com> (http://heybryan.org/)
1 change: 1 addition & 0 deletions LICENSE
@@ -0,0 +1 @@
BSD
24 changes: 24 additions & 0 deletions README.md
@@ -0,0 +1,24 @@
# papermonk-downloader-plosone

This is a [papermonk](https://github.com/kanzure/papermonk) downloader that
scrapes abstracts, papers, PDFs, supplementary documents, and other data from
[PLOS ONE](http://www.plosone.org/).

* [More about PLOS ONE](http://www.plosone.org/static/information)
* [More about Public Library of Science](http://www.plos.org/about/what-is-plos/)

## installing

See [papermonk](https://github.com/kanzure/papermonk) for the main module. This
module is meant to be used as a plugin. However, it should also be possible to
use this module independently and in isolation from other papermonk modules.

## testing

```
node tests.js
```

## license

BSD
25 changes: 25 additions & 0 deletions index.js
@@ -0,0 +1,25 @@
// Declared with `var` so the parser binding stays local to this module
// instead of becoming an implicit global (which also throws in strict mode).
var urlparser = require("url-parser");

/**
 * Report whether this downloader recognises the given URL as a PLOS ONE URL.
 *
 * The URL is parsed with `urlparser.parse`; it matches when either the parsed
 * `hostname` or the parsed `href` is exactly "plosone.org" or
 * "www.plosone.org".
 *
 * @param {string} url - the URL to check.
 * @returns {boolean} true when the URL matches, false otherwise.
 */
module.exports.test = function test(url) {
    // `var` keeps the parse result local to this call; without it the
    // assignment would create (and keep overwriting) an implicit global.
    var parsedurl = urlparser.parse(url);

    var knownHosts = ["plosone.org", "www.plosone.org"];

    // Array#indexOf compares with strict equality, matching the `===`
    // semantics of a hand-written comparison chain.
    if (knownHosts.indexOf(parsedurl.hostname) !== -1) {
        return true;
    }

    // Also accept a parse result whose whole href is just the bare host name.
    if (knownHosts.indexOf(parsedurl.href) !== -1) {
        return true;
    }

    return false;
};

// TODO: this should be split into multiple methods
/**
 * Placeholder for the paper download step. Not implemented yet: every call
 * throws, and none of the arguments are used.
 *
 * @param {*} url - currently unused.
 * @param {*} options - currently unused.
 * @param {*} callback - currently unused; it is never invoked.
 * @throws {Error} always, with the message "not implemented".
 */
module.exports.download = function download(url, options, callback) {
    // TODO: this needs to be implemented
    var notImplemented = new Error("not implemented");
    throw notImplemented;
};
51 changes: 51 additions & 0 deletions package.json
@@ -0,0 +1,51 @@
{
"name": "papermonk-downloader-plosone",
"description": "plosone.org scraper",
"version": "0.0.1",
"readmeFilename": "README.md",
"homepage": "https://github.com/kanzure/papermonk-downloader-plosone",
"author": {
"name": "Bryan Bishop",
"email": "kanzure@gmail.com",
"url": "http://heybryan.org/"
},
"repository": {
"type": "git",
"url": "https://github.com/kanzure/papermonk-downloader-plosone"
},
"bugs": {
"url": "https://github.com/kanzure/papermonk-downloader-plosone/issues",
"email": "kanzure@gmail.com"
},
"license": "BSD",
"keywords": [
"papermonk",
"plos",
"plos one",
"plosone.org",
"public library of science",
"papers",
"pdf",
"pdfs",
"academic articles",
"academic papers",
"scholarly articles",
"scholarly papers",
"journals",
"scraping",
"spidering",
"crawling"
],
"engines": {
"node": ">0"
},
"devDependencies": {
},
"dependencies": {
"url-parser": "*",
"tape": ">0"
},
"optionalDependencies": {
},
"main": "./index.js"
}
53 changes: 53 additions & 0 deletions tests.js
@@ -0,0 +1,53 @@
var test = require("tape");

// Smoke test: requiring the package entry point must produce a truthy value.
test("require against the module", function(t) {
    var plosone = require("./");
    t.ok(plosone, "must not be undefined");
    t.end();
});

// The module's public interface must expose a callable `test` export.
test("has a method called test", function(t) {
    var downloader = require("./");

    // Check the type rather than mere truthiness, so that a truthy
    // non-function export (e.g. a string or object) fails this case.
    t.equal(typeof downloader.test, "function", "has a method called test");

    t.end();
});

// Every http/https spelling of the PLOS ONE host (with or without "www.",
// an explicit :80 port, or a trailing slash) must be accepted.
test("matches for a url with plosone.org", function(t) {
    var downloader = require("./");

    var schemes = ["http", "https"];

    // Host/port/path variants, checked once per scheme in this order.
    var authorities = [
        "plosone.org/",
        "plosone.org:80/",
        "plosone.org",
        "plosone.org:80",
        "www.plosone.org",
        "www.plosone.org:80",
        "www.plosone.org/",
        "www.plosone.org:80/"
    ];

    schemes.forEach(function(scheme) {
        authorities.forEach(function(authority) {
            t.ok(downloader.test(scheme + "://" + authority));
        });
    });

    /*
    // TODO: url-parser doesn't support these, maybe there's a better module?
    // Scheme-less forms that are not asserted yet:
    //   "www.plosone.org", "www.plosone.org/",
    //   "www.plosone.org:80", "www.plosone.org:80/",
    //   "plosone.org", "plosone.org/",
    //   "plosone.org:80", "plosone.org:80/"
    */

    t.end();
});

0 comments on commit bd4774a

Please sign in to comment.