diff --git a/package.json b/package.json new file mode 100644 index 0000000..df85ff3 --- /dev/null +++ b/package.json @@ -0,0 +1,31 @@ +{ + "name": "scholar-scraper", + "version": "0.0.1", + "description": "Google Scholar profiles scraper", + "main": "scrape.js", + "scripts": { + "test": "mocha" + }, + "repository": { + "type": "git", + "url": "git@github.com:lintool/scholar-scraper.git" + }, + "keywords": [ + "Google", + "Scholar", + "h", + "index", + "citations", + "bibliometry" + ], + "author": "Jimmy Lin", + "license": "Apache", + "bugs": { + "url": "https://github.com/lintool/scholar-scraper/issues" + }, + "dependencies": { + "async": "~0.9.0", + "request": "~2.53.0", + "cheerio": "~0.18.0" + } +} diff --git a/scrape.js b/scrape.js index dc33387..54d090e 100644 --- a/scrape.js +++ b/scrape.js @@ -6,50 +6,50 @@ var async = require('async'); var people = require(process.argv[2]); var scrapeEntry = function(person, doneCallback) { - var url = people[person]; - var data = {}; - - // properly set the encoding, or we'll mangle accented characters: - // http://stackoverflow.com/questions/8332500/module-request-how-to-properly-retrieve-accented-characters-%EF%BF%BD-%EF%BF%BD-%EF%BF%BD - request({ encoding: 'binary', method: "GET", uri: url}, function(err, resp, body) { - $ = cheerio.load(new String(body)); - - try { - var photo = $('#gsc_prf_pup')[0].attribs.src.replace(/&/g, '&'); - var affiliation = $('.gsc_prf_il', '#gsc_prf_i')[0].children[0].data; - - var keywords_root = $('.gsc_prf_ila'); - var keywords = []; - - for (var i=0; i