From 427e7f2c2c39353658e0376355433ca5878ff4fe Mon Sep 17 00:00:00 2001 From: Matthew Mueller Date: Mon, 2 Feb 2015 19:32:08 -0800 Subject: [PATCH] written, needs to be tested better. --- .gitignore | 3 + Makefile | 5 +- Readme.md | 2 + examples/dribbble-search.js | 65 ++++ examples/github-stars.js | 68 +++- index.js | 666 ++++++++++++++++++++++++++---------- lib/adapters/curl.js | 35 ++ lib/adapters/phantom.js | 60 ++++ lib/formatters/html.js | 0 lib/formatters/rss.js | 0 lib/utils/absolute-urls.js | 53 +++ lib/utils/noscript.js | 14 + lib/x-ray.js | 228 ++++++++++++ package.json | 9 +- test/index.js | 0 test/x-ray.js | 105 ++++++ 16 files changed, 1133 insertions(+), 180 deletions(-) create mode 100644 examples/dribbble-search.js create mode 100644 lib/adapters/curl.js create mode 100644 lib/adapters/phantom.js create mode 100644 lib/formatters/html.js create mode 100644 lib/formatters/rss.js create mode 100644 lib/utils/absolute-urls.js create mode 100644 lib/utils/noscript.js create mode 100644 lib/x-ray.js create mode 100644 test/index.js create mode 100644 test/x-ray.js diff --git a/.gitignore b/.gitignore index 93f1361..30fb8fe 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ node_modules npm-debug.log +.DS_Store +examples/_* +examples/*.json diff --git a/Makefile b/Makefile index 4e9c8d3..28de99f 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ test: @./node_modules/.bin/mocha \ --require should \ - --reporter spec + --reporter spec \ + --timeout 20s -.PHONY: test \ No newline at end of file +.PHONY: test diff --git a/Readme.md b/Readme.md index db25313..6b51db9 100644 --- a/Readme.md +++ b/Readme.md @@ -3,6 +3,8 @@ structure any website +## + ## License (The MIT License) diff --git a/examples/dribbble-search.js b/examples/dribbble-search.js new file mode 100644 index 0000000..4d086af --- /dev/null +++ b/examples/dribbble-search.js @@ -0,0 +1,65 @@ +/** + * Module Dependencies + */ + +var xray = require('..'); +var write = require('fs').createWriteStream(__dirname + '/dribbble-search.json'); + +/** + * Base URL + */ + +var url = 'https://dribbble.com/search?q=form'; + +/** + * x-ray + */ + +xray(url) + .selector('.dribbbles > li') + .property('url', '.dribbble-img > .dribbble-link', 'href') + .property('name', '.dribbble-img img', 'alt') + .property('image', '.dribbble-img img', 'src') + .paginate('.next_page') + // .format(format) + .run(function(err, json) { + if (err) throw err; + // console.log(json); + // console.log(json); + // console.log('called'); + // var str = JSON.stringify(arr, true, 2); + // write(__dirname + '/dribble.json', str, 'utf8'); + // console.log('crawled!'); + }) + .end(function(err, json) { + console.log(err, json); + }) + .stream(write); + // .end(function(err) { + // if (err) throw err; + // console.log('crawled'); + // }); + +/** + * Images + */ + +function images($el, $) { + var imgs = []; + + $el.find('a').each(function(i, el) { + imgs.push($(el).attr('data-src')); + }) + + return imgs; +} + +/** + * Format + */ + +function format(json) { + json.tags = json.tags.split(','); + json.shots = +json.shots.replace(/\D+/g, ''); + json.followers = +json.followers.replace(/\D+/g, ''); +} diff --git a/examples/github-stars.js b/examples/github-stars.js index 401dbb2..e03add8 100644 --- a/examples/github-stars.js +++ b/examples/github-stars.js @@ -1,13 +1,71 @@ +/** + * Module Dependencies + */ -Structural - .url('https://github.com/stars/matthewmueller?direction=asc&language=javascript&sort=created') - .url('https://github.com/stars/matthewmueller?direction=desc&language=javascript&sort=created') - .key('title', 'div > ul > li.repo-list-item.public.source > h3.repo-list-name > a') +var xray = require('..'); +var array = require('array'); +var fmt = require('util').format; + +/** + * Urls + */ + +var languages = [ + 'JavaScript', + 'CSS', + 'CoffeeScript', + 'Ruby', + 'Python', + 'Shell', + 'Go', + 'C', + 'C++', + 'Objective-C', + 'PHP', + 'VimL', + 'Java', + 'Swift', + 'Scala', + 'TeX', + 'Perl', + 'Lua', + 'Clojure', + 'IDL', + 'Objective-C++', + 'Processing', + 'R', + 'Vala', + 'Lisp', + 'XSLT', + 'LiveScript', + 'TypeScript' +]; + +/** + * Get the urls + */ + +var base = 'https://github.com/stars/matthewmueller?direction=%s&language=%s&sort=created'; +var urls = []; +languages.forEach(function(language) { + language = decodeURIComponent(language); + urls.push(fmt(base, 'asc', language)); + urls.push(fmt(base, 'desc', language)); +}) + +/** + * Use + */ + +xray(urls) + .key('repo', 'div > ul > li.repo-list-item.public.source > h3.repo-list-name > a') .key('url', 'div > ul > li.repo-list-item.public.source > h3.repo-list-name > a', 'href') .key('author', 'ul > li.repo-list-item.public.source > h3.repo-list-name > a > span.prefix') .key('description', 'div > div > ul > li > p.repo-list-description') + .key('time', '.repo-list-meta time', 'datetime') .paginate('div.pagination > a:last-child') .json(function(err, json) { if (err) throw err; - console.log(json); + json = array(json).unique('repo').toArray(); + console.log(JSON.stringify(json, true, 2)); }) diff --git a/index.js b/index.js index 359a863..ae108a2 100644 --- a/index.js +++ b/index.js @@ -1,175 +1,497 @@ /** - * Module Dependencies + * Export `x-ray` */ -var cheerio = require('cheerio'); -var request = require('request'); -var debug = require('debug')('structural'); -var url = require('url'); - -/** - * Export `Structural` - */ - -module.exports = Structural; - -/** - * Initialize `Structural` - */ - -function Structural(url) { - if (!(this instanceof Structural)) return new Structural(url); - this.limit = Infinity; - this.url = url; - this.keys = []; -} - -/** - * Key - */ - -Structural.prototype.key = function(name, selector, attr) { - this.keys.push({ - name: name, - selector: selector, - attr: attr - }); - - return this; -}; - -/** - * json - */ - -Structural.prototype.json = function(fn) { - var paginate = this.paginateEl; - var limit = paginate ? this.limit : 1; - var url = this.url; - var self = this; - var out = []; - - debug('fetching: %s', url); - this.get(this.url, next); - - function next(err, col, $) { - if (err) return fn(err, stringify(out)); - out = out.concat(col); - if (--limit <= 0) return fn(null, stringify(out)); - var href = $(paginate).attr('href'); - if (!href) return fn(null, stringify(out)); - debug('next page: %s', href); - self.get(href, next); - } - - function stringify(obj) { - return JSON.stringify(obj, true, 2); - } -} - -/** - * Get - * - * @param {String} url - * @param {Function} fn - * @return {Structural} - * @api private - */ - -Structural.prototype.get = function(url, fn) { - var keys = this.keys; - - request.get(url, function(err, res, body) { - if (err) return fn(err); - else if (res.statusCode !== 200) return fn(new Error('status code: ' + res.statusCode)); - var $ = cheerio.load(body); - var out = []; - - absolute(url, $); - - keys.forEach(function(key) { - $(key.selector).each(function(i, el) { - if (!out[i]) out[i] = {}; - out[i][key.name] = key.attr ? $(el).attr(key.attr) : text([el]); - }); - }); - - return fn(null, out, $); - }); -} - -/** - * paginate - */ - -Structural.prototype.paginate = function(el, limit) { - this.paginateEl = el; - this.limit = limit; - return this; -}; - - -/** - * Change all the URLs into absolute urls - * - * @param {Cheerio} $ - * @return {$} - */ - -function absolute(path, $) { - var parts = url.parse(path); - var remote = parts.protocol + '//' + parts.hostname; - $('a[href]').each(abs); - - function abs(i, el) { - var $el = $(el); - var key = null; - var src = null; - - if (src = $el.attr('href')) { - key = 'href'; - } else if (src = $el.attr('src')) { - key = 'src'; - } else { - return; - } - - src = src.trim(); - - if (~src.indexOf('://')) { - return; - } else if (src[0] == '/') { - src = remote + src; - } else { - src = remote + '/' + parts.pathname.replace(/^\//, '') + '/' + src - } - - $el.attr(key, src); - } -} - -/** - * Fetch text, but trim at each node. - * - * @param {Array} elems - * @return {String} - * @api private - */ - -function text(elems) { - if (!elems) return ''; - - var ret = '', - len = elems.length, - elem; - - for (var i = 0; i < len; i ++) { - elem = elems[i]; - if (elem.type === 'text') ret += elem.data.trim(); - else if (elem.children && elem.type !== 'comment') { - ret += text(elem.children); - } - } - - return ret; -}; +module.exports = require('./lib/x-ray'); + + + +// /** +// * Module Dependencies +// */ +// +// var debug = require('debug')('x-ray'); +// var delegates = require('matthewmueller-delegates'); +// var Selector = require('./lib/selector'); +// var Nightmare = require('nightmare'); +// var type = require('component-type'); +// var traverse = require('traverse'); +// var extend = require('extend.js'); +// var cheerio = require('cheerio'); +// var noop = function() {}; +// var url = require('url'); +// var keys = Object.keys; +// var fs = require('fs'); +// +// /** +// * Export `Xray` +// */ +// +// module.exports = Xray; +// +// /** +// * Regexps +// */ +// +// var rselector = /([^\[]+)(?:\[([^\[]+)\])?/ +// +// /** +// * Initialize `Xray` +// * +// * @param {String} url +// * @return {Xray} +// * @api public +// */ +// +// function Xray(url) { +// if (!(this instanceof Xray)) return new Xray(url); +// +// this.nightmare = Nightmare(); +// this.limit = Infinity; +// this._throws = false; +// this._format = noop; +// this.selects = []; +// this.from = 5000; +// this._end = noop; +// this.to = 10000; +// this.url = url; +// this.keys = []; +// } +// +// /** +// * Delegate to nightmare +// */ +// +// delegates(Xray.prototype, 'nightmare') +// .method('screenshot') +// .method('useragent') +// .method('viewport') +// .method('evaluate') +// .method('refresh') +// .method('upload') +// .method('click') +// .method('goto') +// .method('type') +// .method('wait') +// .method('back') +// .method('url') +// .method('use') +// +// /** +// * throws +// */ +// +// Xray.prototype.throws = function(throws) { +// this._throws = !!throws; +// return this; +// }; +// +// /** +// * Selector +// * +// * @param {String|Object} parent +// * @param {Object} select +// * @return {Xray} +// */ +// +// Xray.prototype.select = function(parent, select) { +// if (!arguments.length) return this.selects; +// if ('object' == typeof parent) select = parent, parent = null; +// +// this.selects.push({ +// parent: parent, +// select: select +// }); +// +// return this; +// }; +// +// /** +// * Run +// * +// * @param {Function} fn +// * @return {Xray} +// * @api public +// */ +// +// Xray.prototype.run = function(fn) { +// var selectors = this.selects; +// var self = this; +// +// this.request(this.url, function(err, $) { +// if (err) return fn(err); +// var out = {}; +// +// selectors.forEach(function(selector) { +// var $el = selector.parent ? $(selector.parent) : $; +// +// }); +// +// var select = Select($); +// +// var obj = rmap(selectors, function(selector, i) { +// +// }); +// +// // Object.keys(selectors).forEach(function(sel) { +// // var val = selectors[sel]; +// +// // switch (type(val)) { +// // case 'string': select(val); +// // } +// // }) +// }) +// +// function Select($) { +// return function select(selector, i) { +// if (isArray(selector)) { +// +// } +// var m = selector.match(rselector); +// var $el = $(m[1]); +// if (i === undefined) return $el.get(); +// $el = $el.eq(i); +// if (!m[2] || m[2] == 'text') return text([$el[0]]); +// else if (m[2] == 'html') return $el.html(); +// else return $el.attr(m[2]); +// } +// } +// }; +// +// /** +// * Module Dependencies +// */ +// +// var isArray = Array.isArray; +// var keys = Object.keys; +// +// /** +// * Expose `rmap` +// */ +// +// function rmap(obj, fn, j) { +// if ('object' == typeof obj) { +// var ret = {}; +// keys(obj).map(function(k, i) { +// ret[k] = rmap(obj[k], fn, k); +// }); +// return ret; +// } else { +// return fn(obj, j || 0); +// } +// } +// +// // /** +// // * Add a property +// // * +// // * @param {String} name +// // * @param {String} selector +// // * @param {String|Function} attr +// // * @return {Xray} +// // * @api public +// // */ +// +// // Xray.prototype.property = function(name, selector, attr) { +// // this.keys.push({ +// // name: name, +// // selector: selector, +// // attr: attr +// // }); +// +// // return this; +// // }; +// +// /** +// * Run +// * +// * @param {Function} fn +// * @return {Xray} +// * @api public +// */ +// +// // Xray.prototype.run = function(fn) { +// // var paginate = this.paginateEl; +// // var limit = paginate ? this.limit : 1; +// // var pending = this.urls.length; +// // var throws = this._throws; +// // var format = this._format; +// // var urls = this.urls; +// // var first = true; +// // var self = this; +// +// // urls.forEach(function(url) { +// // debug('fetching: %s', url); +// // self.request(url, next); +// // }); +// +// // function next(err, arr, $) { +// // if (err) return done(err); +// +// // // format and call fn +// // arr.map(function(json) { +// // format(json); +// // self._stream && send(json); +// // fn(null, json); +// // }); +// +// // if (--limit <= 0) return done(null, arr); +// // var href = $(paginate).attr('href'); +// // if (!href) return done(null, arr); +// +// // // wait +// // setTimeout(function() { +// // debug('next page: %s', href); +// // self.request(href, next); +// // }, self.delay()); +// // } +// +// // function done(err, arr) { +// // var end = self._end; +// +// // if (err) { +// // fn(err); +// // if (throws) end(err); +// // return; +// // } +// +// // // if we aren't paginating any further +// // // then call the end function +// // if (!--pending) { +// // if (self._stream) { +// // send(); +// // self._stream && self._stream.end(); +// // } +// // end(); +// // } +// // } +// +// // function send(json) { +// // if (first && !json) return self._stream.end(); +// // else if (!json) return self._stream.write('\n]\n'); +// +// // var str = JSON.stringify(json, true, 2); +// +// // if (first) { +// // self._stream.write('[\n' + str); +// // first = false; +// // } else { +// // self._stream.write('\n,\n' + str); +// // } +// // } +// +// // return this; +// // } +// +// // /** +// // * Stream +// // */ +// +// // Xray.prototype.stream = function(file) { +// // if (!arguments.length) return this._stream; +// +// // this._stream = 'string' == typeof file +// // ? fs.createWriteStream(file) +// // : file; +// +// // return this; +// // }; +// +// +// /** +// * Get +// * +// * @param {String} url +// * @param {Function} fn +// * @return {Xray} +// * @api private +// */ +// +// Xray.prototype.request = function(url, fn) { +// var nightmare = this.nightmare; +// var format = this._format; +// var keys = this.keys; +// +// nightmare +// .goto(url) +// .evaluate(function() { +// return document.documentElement.outerHTML; +// }, load) +// .run(function(err) { +// if (err) return fn(err); +// }); +// +// function load(body) { +// try { +// var $ = cheerio.load(body); +// absolute(url, $); +// noscripts($) +// } catch (e) { +// return fn(e); +// } +// +// return fn(null, $); +// } +// }; +// +// // function response(body) { +// // var $ = cheerio.load(body); +// // var out = []; +// +// // absolute(url, $); +// // noscripts($) +// +// // keys.forEach(function(key) { +// // $(key.selector).each(function(i, el) { +// // if (!out[i]) out[i] = {}; +// +// // // 3rd param +// // if (key.attr) { +// // // string or fn +// // out[i][key.name] = 'string' == typeof key.attr +// // ? $(el).attr(key.attr) +// // : key.attr($(el), $); +// // } else { +// // out[i][key.name] = text([el]); +// // } +// +// // }); +// // }); +// +// // return fn(null, out, $); +// // } +// // } +// +// /** +// * Delay the next request +// */ +// +// Xray.prototype.delay = function(from, to) { +// if (arguments.length) { +// this.from = from; +// this.to = to; +// return this; +// } else { +// return Math.floor(Math.random() * this.to) + this.from; +// } +// }; +// +// +// /** +// * Paginate +// * +// * @param {Element} el +// * @param {Number} limit +// * @return {Xray} +// * @api public +// */ +// +// Xray.prototype.paginate = function(el, limit) { +// this.paginateEl = el; +// this.limit = limit; +// return this; +// }; +// +// /** +// * format +// * +// * @param {Function} format +// * @return {Xray} +// * @api public +// */ +// +// Xray.prototype.format = function(format) { +// this._format = format; +// return this; +// }; +// +// /** +// * Specify the result as HTML +// */ +// +// Xray.prototype.html = function(template) { +// +// }; +// +// +// /** +// * End +// */ +// +// Xray.prototype.end = function(end) { +// this._end = end; +// return this; +// }; +// +// +// /** +// * Change all the URLs into absolute urls +// * +// * @param {Cheerio} $ +// * @return {$} +// */ +// +// function absolute(path, $) { +// var parts = url.parse(path); +// var remote = parts.protocol + '//' + parts.hostname; +// $('a[href]').each(abs); +// +// function abs(i, el) { +// var $el = $(el); +// var key = null; +// var src = null; +// +// if (src = $el.attr('href')) { +// key = 'href'; +// } else if (src = $el.attr('src')) { +// key = 'src'; +// } else { +// return; +// } +// +// src = src.trim(); +// +// if (~src.indexOf('://')) { +// return; +// } else if (src[0] == '/') { +// src = remote + src; +// } else { +// src = remote + '/' + parts.pathname.replace(/^\//, '') + '/' + src +// } +// +// $el.attr(key, src); +// } +// } +// +// /** +// * Remove any