papermonk scraper for plosone.org

kanzure · kanzure · commit bd4774ad7df7 · 2013-08-07T20:50:43.000-05:00
This is a first version of a downloader module for papermonk, meant to
serve as an example for implementing other modules.

At the moment, the actual paper-downloading code is not implemented
because it's silly to stuff everything into the "download" function;
maybe this concept should be split up into multiple methods instead.

version bump to: 0.0.1

(initial commit)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,8 @@
+# don't track other node modules
+node_modules/
+
+# also ignore npm errors
+npm-debug.log
+
+# ignore temporary files leftover from vim
+.*.sw*
diff --git a/AUTHORS b/AUTHORS
@@ -0,0 +1 @@
+Bryan Bishop <kanzure@gmail.com> (http://heybryan.org/)
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1 @@
+BSD
diff --git a/README.md b/README.md
@@ -0,0 +1,24 @@
+# papermonk-downloader-plosone
+
+This is a [papermonk](https://github.com/kanzure/papermonk) downloader that
+scrapes abstracts, papers, pdfs, supplementary documents, and other data from
+[PLOS ONE](http://www.plosone.org/).
+
+* [More about PLOS ONE](http://www.plosone.org/static/information)
+* [More about Public Library of Science](http://www.plos.org/about/what-is-plos/)
+
+## installing
+
+See [papermonk](https://github.com/kanzure/papermonk) for the main module. This
+module is meant to be used as a plugin. However, it should also be possible to
+use this module independently and in isolation from other papermonk modules.
+
+## testing
+
+```
+node tests.js
+```
+
+## license
+
+BSD
diff --git a/index.js b/index.js
@@ -0,0 +1,25 @@
+var urlparser = require("url-parser"); // "var" keeps this module-local, not a global
+
+/**
+ * Decide whether this downloader handles `url`.
+ *
+ * @param {string} url - URL handed to urlparser.parse().
+ * @returns {boolean} true when the parsed hostname or href is exactly
+ *     "plosone.org" or "www.plosone.org", false otherwise.
+ */
+module.exports.test = function test(url) {
+    // "var" keeps the parse result local rather than creating a global.
+    var parsedurl = urlparser.parse(url);
+    var hosts = ["plosone.org", "www.plosone.org"];
+
+    if (hosts.indexOf(parsedurl.hostname) !== -1)
+        return true;
+
+    return hosts.indexOf(parsedurl.href) !== -1;
+};
+
+// TODO: not implemented yet; this should be split into multiple methods.
+module.exports.download = function download(url, options, callback) {
+    var notImplemented = new Error("not implemented");
+    throw notImplemented;
+};
diff --git a/package.json b/package.json
@@ -0,0 +1,51 @@
+{
+  "name": "papermonk-downloader-plosone",
+  "description": "plosone.org scraper",
+  "version": "0.0.1",
+  "readmeFilename": "README.md",
+  "homepage": "https://github.com/kanzure/papermonk-downloader-plosone",
+  "author": {
+    "name": "Bryan Bishop",
+    "email": "kanzure@gmail.com",
+    "url": "http://heybryan.org/"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/kanzure/papermonk-downloader-plosone"
+  },
+  "bugs": {
+    "url": "https://github.com/kanzure/papermonk-downloader-plosone/issues",
+    "email": "kanzure@gmail.com"
+  },
+  "license": "BSD",
+  "keywords": [
+    "papermonk",
+    "plos",
+    "plos one",
+    "plosone.org",
+    "public library of science",
+    "papers",
+    "pdf",
+    "pdfs",
+    "academic articles",
+    "academic papers",
+    "scholarly articles",
+    "scholarly papers",
+    "journals",
+    "scraping",
+    "spidering",
+    "crawling"
+  ],
+  "engines": {
+    "node": ">0"
+  },
+  "devDependencies": {
+  },
+  "dependencies": {
+    "url-parser": "*",
+    "tape": ">0"
+  },
+  "optionalDependencies": {
+  },
+  "main": "./index.js"
+}
diff --git a/tests.js b/tests.js
@@ -0,0 +1,53 @@
+var test = require("tape");
+
+// Smoke test: requiring the package root must yield a truthy export.
+test("require against the module", function(t) {
+    var entryPoint = require("./");
+
+    t.ok(entryPoint, "must not be undefined");
+    t.end();
+});
+
+// The export of the package root must carry a truthy `test` property.
+test("has a method called test", function(t) {
+    var matcher = require("./").test;
+
+    t.ok(matcher, "has a method called test");
+    t.end();
+});
+
+test("matches for a url with plosone.org", function(t) {
+    var downloader = require("./");
+
+    // Host, port and trailing-slash variants, each tried with both schemes.
+    var targets = [
+        "plosone.org/",
+        "plosone.org:80/",
+        "plosone.org",
+        "plosone.org:80",
+        "www.plosone.org",
+        "www.plosone.org:80",
+        "www.plosone.org/",
+        "www.plosone.org:80/"
+    ];
+
+    ["http://", "https://"].forEach(function(scheme) {
+        targets.forEach(function(target) {
+            t.ok(downloader.test(scheme + target));
+        });
+    });
+
+    /*
+    // TODO: url-parser doesn't support these, maybe there's a better module?
+    t.ok(downloader.test("www.plosone.org"));
+    t.ok(downloader.test("www.plosone.org/"));
+    t.ok(downloader.test("www.plosone.org:80"));
+    t.ok(downloader.test("www.plosone.org:80/"));
+    t.ok(downloader.test("plosone.org"));
+    t.ok(downloader.test("plosone.org/"));
+    t.ok(downloader.test("plosone.org:80"));
+    t.ok(downloader.test("plosone.org:80/"));
+    */
+
+    t.end();
+});

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Bryan Bishop <kanzure@gmail.com> (http://heybryan.org/)`