Skip to content

Commit bd4774a

Browse files
committed Aug 8, 2013
papermonk scraper for plosone.org
This is a first version of a downloader module for papermonk, meant to serve as an example for implementing other modules. At the moment, the actual paper-downloading code is not implemented, because it would be silly to stuff everything into the "download" function; maybe this concept should be split up into multiple methods instead. Version bump to 0.0.1 (initial commit).
0 parents  commit bd4774a

File tree

7 files changed

+163
-0
lines changed

7 files changed

+163
-0
lines changed
 

‎.gitignore

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# don't track other node modules
2+
node_modules/
3+
4+
# also ignore npm errors
5+
npm-debug.log
6+
7+
# ignore temporary files leftover from vim
8+
.*.sw*

‎AUTHORS

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Bryan Bishop <kanzure@gmail.com> (http://heybryan.org/)

‎LICENSE

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
BSD

‎README.md

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# papermonk-downloader-plosone
2+
3+
This is a [papermonk](https://github.com/kanzure/papermonk) downloader that
4+
scrapes abstracts, papers, pdfs, supplementary documents, and other data from
5+
[PLOS ONE](http://www.plosone.org/).
6+
7+
* [More about PLOS ONE](http://www.plosone.org/static/information)
8+
* [More about Public Library of Science](http://www.plos.org/about/what-is-plos/)
9+
10+
## installing
11+
12+
See [papermonk](https://github.com/kanzure/papermonk) for the main module. This
13+
module is meant to be used as a plugin. However, it should also be possible to
14+
use this module independently and in isolation from other papermonk modules.
15+
16+
## testing
17+
18+
```
19+
node tests.js
20+
```
21+
22+
## license
23+
24+
BSD

‎index.js

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// Third-party URL parser; test() below reads `hostname` and `href` from its
// parse() result. Declared with var so the binding stays local to this module
// instead of being created as an implicit global.
var urlparser = require("url-parser");
2+
3+
/**
 * Decide whether this downloader is responsible for the given url.
 *
 * The url is parsed with url-parser and accepted when either the parsed
 * hostname or the parsed href is exactly "plosone.org" or "www.plosone.org".
 *
 * @param {string} url - address to check.
 * @returns {boolean} true when the url matches PLOS ONE, false otherwise.
 */
module.exports.test = function test(url) {
    // Declared with var so the parse result stays local to this call instead
    // of leaking out as an implicit global that every call overwrites.
    var parsedurl = urlparser.parse(url);
    var acceptedhosts = ["plosone.org", "www.plosone.org"];

    // indexOf compares with ===, matching the original strict comparisons.
    if (acceptedhosts.indexOf(parsedurl.hostname) !== -1) {
        return true;
    }

    // NOTE(review): the href comparison is presumably aimed at scheme-less
    // input such as "plosone.org"; tests.js notes url-parser does not handle
    // those yet -- confirm before relying on it.
    return acceptedhosts.indexOf(parsedurl.href) !== -1;
};
20+
21+
// TODO: this should be split into multiple methods
22+
/**
 * Placeholder for the actual PLOS ONE download step.
 *
 * None of the arguments are read yet; every call throws synchronously.
 *
 * @param {*} url - currently unused.
 * @param {*} options - currently unused.
 * @param {*} callback - currently unused; it is never invoked.
 * @throws {Error} always, with the message "not implemented".
 */
module.exports.download = function download(url, options, callback) {
    // TODO: this needs to be implemented
    var notImplemented = new Error("not implemented");
    throw notImplemented;
};

‎package.json

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
{
2+
"name": "papermonk-downloader-plosone",
3+
"description": "plosone.org scraper",
4+
"version": "0.0.1",
5+
"readmeFilename": "README.md",
6+
"homepage": "https://github.com/kanzure/papermonk-downloader-plosone",
7+
"author": {
8+
"name": "Bryan Bishop",
9+
"email": "kanzure@gmail.com",
10+
"url": "http://heybryan.org/"
11+
},
12+
"repository": {
13+
"type": "git",
14+
"url": "https://github.com/kanzure/papermonk-downloader-plosone"
15+
},
16+
"bugs": {
17+
"url": "https://github.com/kanzure/papermonk-downloader-plosone/issues",
18+
"email": "kanzure@gmail.com"
19+
},
20+
"license": "BSD",
21+
"keywords": [
22+
"papermonk",
23+
"plos",
24+
"plos one",
25+
"plosone.org",
26+
"public library of science",
27+
"papers",
28+
"pdf",
29+
"pdfs",
30+
"academic articles",
31+
"academic papers",
32+
"scholarly articles",
33+
"scholarly papers",
34+
"journals",
35+
"scraping",
36+
"spidering",
37+
"crawling"
38+
],
39+
"engines": {
40+
"node": ">0"
41+
},
42+
"devDependencies": {
43+
},
44+
"dependencies": {
45+
"url-parser": "*",
46+
"tape": ">0"
47+
},
48+
"optionalDependencies": {
49+
},
50+
"main": "./index.js"
51+
}

‎tests.js

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
var test = require("tape");
2+
3+
// Smoke test: loading the package entry point must yield a truthy export.
test("require against the module", function(t) {
    var plosone = require("./");
    t.ok(plosone, "must not be undefined");
    t.end();
});
10+
11+
// The export must expose a callable `test`; a merely truthy property (for
// example a string) would not be a usable method, so assert on its type.
test("has a method called test", function(t) {
    var downloader = require("./");

    t.equal(typeof downloader.test, "function", "has a method called test");

    t.end();
});
18+
19+
// Every scheme/host/port/trailing-slash combination below must be accepted
// by downloader.test(). Assertions run in the order: all http urls, then all
// https urls, each following the order of `targets`.
test("matches for a url with plosone.org", function(t) {
    var downloader = require("./");

    var schemes = ["http", "https"];
    var targets = [
        "plosone.org/",
        "plosone.org:80/",
        "plosone.org",
        "plosone.org:80",
        "www.plosone.org",
        "www.plosone.org:80",
        "www.plosone.org/",
        "www.plosone.org:80/"
    ];

    schemes.forEach(function(scheme) {
        targets.forEach(function(target) {
            t.ok(downloader.test(scheme + "://" + target));
        });
    });

    // TODO: url-parser doesn't support scheme-less input, maybe there's a
    // better module? Once it does, the following should be asserted as well:
    //   "www.plosone.org", "www.plosone.org/",
    //   "www.plosone.org:80", "www.plosone.org:80/",
    //   "plosone.org", "plosone.org/",
    //   "plosone.org:80", "plosone.org:80/"

    t.end();
});

0 commit comments

Comments
 (0)
Please sign in to comment.