最近一直在做数据采集的事情,目的是使用java开发一套采集引擎:解析指定的采集规则,模拟用户动作做数据提取。
因此定义了一套动作脚本:open、click、get、list、opentab、closetab……由java解析脚本,调用phantomjs做数据提取,生成数据json文件,并对外提供数据接口。采集引擎终于写得差不多了,虽然还有很多问题需要修改,但是终于不用加班了,嘿嘿嘿。
------- jstarseven
码字挺累的,转载请注明出处:http://www.cnblogs.com/jstarseven/p/6278197.html
言归正传,由于一直在搞这些东西,突然想着拿js去写个采集玩一玩,就用tampermonkey,毕竟好久没玩了。
简介:针对一些网站的数据列表,定义采集脚本,模拟用户操作,做列表数据提取,生成json数据并格式化展示。
json采集脚本定义:
{
    "type": "list",
    "selector": "",//列表选择器
    "max_page": 1,//采集页数
    "page_selector": "",//翻页选择器
    "iframe_selector": "",//iframe 选择器
    "datas": [//采集字段定义
        {
            "selector": " ",//字段选择器 <此处为针对列表的子选择器>
            "column": "title",//字段名称
            "from": "text",//采集类型
            "iframe_selector": "",//iframe选择器 防止一些网站怪异 一般不需要
            "open_tab": [//当前字段开新标签做采集
                {
                    "selector": " ",//新标签字段选择器
                    "column": " ",
                    "from": "text",
                    "iframe_selector": ""
                },
                {
                    "selector": " ",
                    "column": " ",
                    "from": "text",
                    "iframe_selector": ""
                },
                {
                    "selector": " ",
                    "column": " ",
                    "from": "text",
                    "iframe_selector": ""
                }
            ]
        },
        {
            "selector": " ",//字段选择器
            "column": " ",
            "from": "text",
            "iframe_selector": ""
        },
        {
            "selector": " ",//字段选择器
            "column": " ",
            "from": "text",
            "iframe_selector": ""
        }
    ]
}
脚本定义好了,剩下的就是写js代码解析脚本,做数据采集,数据合并了。
那么怎么去解析实现呢?针对新开标签页的数据采集,怎样和之前的列表项数据做合并,保证数据的完整性呢?
1. 因为数据需要做存储,首先想到的是sessionStorage,但是sessionStorage在新开标签页的时候数据不能共享,所以改用localStorage。localStorage的容量上限一般在5MB左右,足以存储一般列表十几页的数据。
2. 详情页面的数据要和列表项数据合并:既然用了localStorage,那么就在localStorage里面放入一个指定的map来存放列表数据,针对列表的每一项生成一个key;新开标签页的时候传递这个key,提取详情页面的数据后,将其放入map中该key对应的数据里。
js实现map,方便数据存储:
/**
 * Minimal insertion-ordered key/value map backed by an array of
 * {key, value} entries stored in `this.elements`.
 *
 * Only `elements` is data; every method is a function property, so
 * JSON.stringify(map) serialises just the entries and a map can be rebuilt
 * by assigning a parsed `elements` array onto a fresh instance.
 *
 * Keys are matched with loose equality (==), exactly as before, so that
 * callers mixing numeric and string keys keep working.
 *
 * Interface:
 *   size()                 number of entries
 *   isEmpty()              true when there are no entries
 *   clear()                remove every entry
 *   put(key, value)        insert, or overwrite the value of an existing key
 *   remove(key)            delete the entry; true on success, false if absent
 *   get(key)               value for key, or null if absent
 *   element(index)         the {key, value} entry at index, or null if out of range
 *   containsKey(key)       true when some entry has this key
 *   containsValue(value)   true when some entry has this value (loose equality)
 *   values()               array of all values, in insertion order
 *   keys()                 array of all keys, in insertion order
 */
function Map() {
    this.elements = [];

    // Index of the first entry whose `field` loosely equals `wanted`, or -1.
    // Takes the list explicitly because callers may replace `this.elements`
    // after construction.
    function indexOfEntry(list, field, wanted) {
        for (var i = 0; i < list.length; i++) {
            if (list[i][field] == wanted) {
                return i;
            }
        }
        return -1;
    }

    // Number of entries in the map.
    this.size = function () {
        return this.elements.length;
    };

    // True when the map holds no entries.
    this.isEmpty = function () {
        return (this.elements.length < 1);
    };

    // Drop every entry.
    this.clear = function () {
        this.elements = [];
    };

    // Insert (key, value); an existing key keeps its position and gets the new value.
    this.put = function (_key, _value) {
        var at = indexOfEntry(this.elements, "key", _key);
        if (at !== -1) {
            this.elements[at].value = _value;
            return;
        }
        this.elements.push({
            key: _key,
            value: _value
        });
    };

    // Delete the entry for the key. Returns true if one was removed, false otherwise.
    this.remove = function (_key) {
        var at = indexOfEntry(this.elements, "key", _key);
        if (at === -1) {
            return false;
        }
        this.elements.splice(at, 1);
        return true;
    };

    // Value stored under the key, or null when the key is absent.
    // (Previously a miss fell off the end of the function and yielded undefined.)
    this.get = function (_key) {
        var at = indexOfEntry(this.elements, "key", _key);
        return at === -1 ? null : this.elements[at].value;
    };

    // The {key, value} entry at the given index, or null when out of range.
    this.element = function (_index) {
        if (_index < 0 || _index >= this.elements.length) {
            return null;
        }
        return this.elements[_index];
    };

    // True when some entry has this key.
    this.containsKey = function (_key) {
        return indexOfEntry(this.elements, "key", _key) !== -1;
    };

    // True when some entry has this value.
    this.containsValue = function (_value) {
        return indexOfEntry(this.elements, "value", _value) !== -1;
    };

    // All values, in insertion order.
    this.values = function () {
        var arr = [];
        for (var i = 0; i < this.elements.length; i++) {
            arr.push(this.elements[i].value);
        }
        return arr;
    };

    // All keys, in insertion order.
    this.keys = function () {
        var arr = [];
        for (var i = 0; i < this.elements.length; i++) {
            arr.push(this.elements[i].key);
        }
        return arr;
    };
}
js实现操作localStorage:
/**
 * Load the task data map persisted in localStorage under "data_maps".
 *
 * Always returns a Map instance. An empty map is returned when nothing is
 * stored, when the stored text is not valid JSON, or when the parsed payload
 * has no `elements` array (previously those last two cases threw here or on
 * the next put()).
 *
 * NOTE(review): isNullParam is defined outside this snippet; it is assumed to
 * report null/undefined/"" as missing - confirm against its definition.
 *
 * @returns {Map} map rebuilt from the stored entries
 */
function getTaskDataMap() {
    var datas = new Map();
    var stored = localStorage.getItem("data_maps");
    if (isNullParam(stored)) {
        return datas;
    }
    try {
        var parsed = JSON.parse(stored);
        // Only the entries survive serialisation; graft them onto a fresh
        // instance so the map's methods are available again.
        if (parsed && Array.isArray(parsed.elements)) {
            datas.elements = parsed.elements;
        }
    } catch (e) {
        console.warn("data_maps in localStorage is not valid JSON, starting with an empty map", e);
    }
    return datas;
}

/**
 * Reset the persisted task data map by storing an empty string under
 * "data_maps".
 */
function clearTaskDataMap() {
    localStorage.setItem("data_maps", "");
}

/**
 * Store one entry in the persisted task data map and write the map back to
 * localStorage. Does nothing when either argument is reported missing by
 * isNullParam.
 *
 * @param key    key identifying the entry
 * @param values data to store under the key
 */
function addTaskDataMap(key, values) {
    if (isNullParam(key) || isNullParam(values)) {
        return;
    }
    var data_maps = getTaskDataMap();
    data_maps.put(key, values);
    localStorage.setItem("data_maps", JSON.stringify(data_maps));
}
采用jquery.simulate.js实现点击
/*!
 * jQuery Simulate v@VERSION - simulate browser mouse and keyboard events
 * https://github.com/jquery/jquery-simulate
 *
 * Copyright jQuery Foundation and other contributors
 * Released under the MIT license.
 * http://jquery.org/license
 *
 * Date: @DATE
 */

;(function ($, undefined) {

    // Event-type classifiers used by createEvent to pick a factory.
    var rkeyEvent = /^key/,
        rmouseEvent = /^(?:mouse|contextmenu)|click/;

    /**
     * jQuery plugin entry point: simulate `type` on every matched element.
     * @param {string} type event name, e.g. "click", "keydown", "drag", "focus"
     * @param {Object} [options] event properties, merged over per-type defaults
     * @returns {jQuery} the same set, for chaining
     */
    $.fn.simulate = function (type, options) {
        return this.each(function () {
            new $.simulate(this, type, options);
        });
    };

    /**
     * Simulate one event on one element. If the prototype has a method named
     * "simulate" + CamelCased type (simulateFocus, simulateBlur, simulateDrag)
     * it is used; otherwise a native event is created and dispatched.
     */
    $.simulate = function (elem, type, options) {
        var method = $.camelCase("simulate-" + type);

        this.target = elem;
        this.options = options;

        if (this[method]) {
            this[method]();
        } else {
            this.simulateEvent(elem, type, options);
        }
    };

    $.extend($.simulate, {

        // Named key codes for use in keyboard event options.
        keyCode: {
            BACKSPACE: 8,
            COMMA: 188,
            DELETE: 46,
            DOWN: 40,
            END: 35,
            ENTER: 13,
            ESCAPE: 27,
            HOME: 36,
            LEFT: 37,
            NUMPAD_ADD: 107,
            NUMPAD_DECIMAL: 110,
            NUMPAD_DIVIDE: 111,
            NUMPAD_ENTER: 108,
            NUMPAD_MULTIPLY: 106,
            NUMPAD_SUBTRACT: 109,
            PAGE_DOWN: 34,
            PAGE_UP: 33,
            PERIOD: 190,
            RIGHT: 39,
            SPACE: 32,
            TAB: 9,
            UP: 38
        },

        // Named mouse button codes for use in mouse event options.
        buttonCode: {
            LEFT: 0,
            MIDDLE: 1,
            RIGHT: 2
        }
    });

    $.extend($.simulate.prototype, {

        // Build the event for `type` and dispatch it on `elem`.
        simulateEvent: function (elem, type, options) {
            var event = this.createEvent(type, options);
            this.dispatchEvent(elem, type, event, options);
        },

        // Pick the key or mouse factory by event name. Types matching neither
        // pattern fall through and yield undefined.
        createEvent: function (type, options) {
            if (rkeyEvent.test(type)) {
                return this.keyEvent(type, options);
            }

            if (rmouseEvent.test(type)) {
                return this.mouseEvent(type, options);
            }
        },

        // Create a mouse event of the given type; `options` override the defaults below.
        mouseEvent: function (type, options) {
            var event, eventDoc, doc, body;
            options = $.extend({
                bubbles: true,
                cancelable: (type !== "mousemove"),
                view: window,
                detail: 0,
                screenX: 0,
                screenY: 0,
                clientX: 1,
                clientY: 1,
                ctrlKey: false,
                altKey: false,
                shiftKey: false,
                metaKey: false,
                button: 0,
                relatedTarget: undefined
            }, options);

            if (document.createEvent) {
                event = document.createEvent("MouseEvents");
                event.initMouseEvent(type, options.bubbles, options.cancelable,
                    options.view, options.detail,
                    options.screenX, options.screenY, options.clientX, options.clientY,
                    options.ctrlKey, options.altKey, options.shiftKey, options.metaKey,
                    options.button, options.relatedTarget || document.body.parentNode);

                // IE 9+ creates events with pageX and pageY set to 0.
                // Trying to modify the properties throws an error,
                // so we define getters to return the correct values.
                if (event.pageX === 0 && event.pageY === 0 && Object.defineProperty) {
                    eventDoc = event.relatedTarget.ownerDocument || document;
                    doc = eventDoc.documentElement;
                    body = eventDoc.body;

                    Object.defineProperty(event, "pageX", {
                        get: function () {
                            return options.clientX +
                                ( doc && doc.scrollLeft || body && body.scrollLeft || 0 ) -
                                ( doc && doc.clientLeft || body && body.clientLeft || 0 );
                        }
                    });
                    Object.defineProperty(event, "pageY", {
                        get: function () {
                            return options.clientY +
                                ( doc && doc.scrollTop || body && body.scrollTop || 0 ) -
                                ( doc && doc.clientTop || body && body.clientTop || 0 );
                        }
                    });
                }
            } else if (document.createEventObject) {
                event = document.createEventObject();
                $.extend(event, options);
                // standards event.button uses constants defined here: http://msdn.microsoft.com/en-us/library/ie/ff974877(v=vs.85).aspx
                // old IE event.button uses constants defined here: http://msdn.microsoft.com/en-us/library/ie/ms533544(v=vs.85).aspx
                // so we actually need to map the standard back to oldIE
                event.button = {
                    0: 1,
                    1: 4,
                    2: 2
                }[event.button] || ( event.button === -1 ? 0 : event.button );
            }

            return event;
        },

        // Create a keyboard event of the given type; `options` override the defaults below.
        keyEvent: function (type, options) {
            var event;
            options = $.extend({
                bubbles: true,
                cancelable: true,
                view: window,
                ctrlKey: false,
                altKey: false,
                shiftKey: false,
                metaKey: false,
                keyCode: 0,
                charCode: undefined
            }, options);

            if (document.createEvent) {
                try {
                    event = document.createEvent("KeyEvents");
                    event.initKeyEvent(type, options.bubbles, options.cancelable, options.view,
                        options.ctrlKey, options.altKey, options.shiftKey, options.metaKey,
                        options.keyCode, options.charCode);
                    // initKeyEvent throws an exception in WebKit
                    // see: http://stackoverflow.com/questions/6406784/initkeyevent-keypress-only-works-in-firefox-need-a-cross-browser-solution
                    // and also https://bugs.webkit.org/show_bug.cgi?id=13368
                    // fall back to a generic event until we decide to implement initKeyboardEvent
                } catch (err) {
                    event = document.createEvent("Events");
                    event.initEvent(type, options.bubbles, options.cancelable);
                    $.extend(event, {
                        view: options.view,
                        ctrlKey: options.ctrlKey,
                        altKey: options.altKey,
                        shiftKey: options.shiftKey,
                        metaKey: options.metaKey,
                        keyCode: options.keyCode,
                        charCode: options.charCode
                    });
                }
            } else if (document.createEventObject) {
                event = document.createEventObject();
                $.extend(event, options);
            }

            // For user agents identifying as MSIE or Opera, report the character
            // through keyCode and clear charCode.
            if (!!/msie [\w.]+/.exec(navigator.userAgent.toLowerCase()) || (({}).toString.call(window.opera) === "[object Opera]")) {
                event.keyCode = (options.charCode > 0) ? options.charCode : options.keyCode;
                event.charCode = undefined;
            }

            return event;
        },

        // Deliver the event: standard dispatchEvent first, then native click()
        // for inputs, then the legacy fireEvent API.
        dispatchEvent: function (elem, type, event) {
            if (elem.dispatchEvent) {
                elem.dispatchEvent(event);
            } else if (type === "click" && elem.click && elem.nodeName.toLowerCase() === "input") {
                elem.click();
            } else if (elem.fireEvent) {
                elem.fireEvent("on" + type, event);
            }
        },

        // Focus the target; if no focus handler fired, trigger focusin and the
        // focus handlers manually.
        simulateFocus: function () {
            var focusinEvent,
                triggered = false,
                element = $(this.target);

            function trigger() {
                triggered = true;
            }

            element.bind("focus", trigger);
            element[0].focus();

            if (!triggered) {
                focusinEvent = $.Event("focusin");
                focusinEvent.preventDefault();
                element.trigger(focusinEvent);
                element.triggerHandler("focus");
            }
            element.unbind("focus", trigger);
        },

        // Blur the target; if no blur handler fired, trigger focusout and the
        // blur handlers manually.
        simulateBlur: function () {
            var focusoutEvent,
                triggered = false,
                element = $(this.target);

            function trigger() {
                triggered = true;
            }

            element.bind("blur", trigger);
            element[0].blur();

            // blur events are async in IE
            setTimeout(function () {
                // IE won't let the blur occur if the window is inactive
                if (element[0].ownerDocument.activeElement === element[0]) {
                    element[0].ownerDocument.body.focus();
                }

                // Firefox won't trigger events if the window is inactive
                // IE doesn't trigger events if we had to manually focus the body
                if (!triggered) {
                    focusoutEvent = $.Event("focusout");
                    focusoutEvent.preventDefault();
                    element.trigger(focusoutEvent);
                    element.triggerHandler("blur");
                }
                element.unbind("blur", trigger);
            }, 1);
        }
    });


    /** complex events **/

    // Centre of the element, relative to the scrolled position of its document.
    function findCenter(elem) {
        var offset,
            document = $(elem.ownerDocument);
        elem = $(elem);
        offset = elem.offset();

        return {
            x: offset.left + elem.outerWidth() / 2 - document.scrollLeft(),
            y: offset.top + elem.outerHeight() / 2 - document.scrollTop()
        };
    }

    // Top-left corner of the element, relative to the scrolled position of its document.
    function findCorner(elem) {
        var offset,
            document = $(elem.ownerDocument);
        elem = $(elem);
        offset = elem.offset();

        return {
            x: offset.left - document.scrollLeft(),
            y: offset.top - document.scrollTop()
        };
    }

    $.extend($.simulate.prototype, {
        // Simulate a drag: mousedown on the target, `moves` mousemove events on
        // its document (3 when options.moves is not set), then mouseup (plus
        // click when the target is still in the document).
        // Reads options.handle, dx/dy or absolute x/y, and moves.
        simulateDrag: function () {
            var i = 0,
                target = this.target,
                eventDoc = target.ownerDocument,
                options = this.options,
                center = options.handle === "corner" ? findCorner(target) : findCenter(target),
                x = Math.floor(center.x),
                y = Math.floor(center.y),
                coord = {clientX: x, clientY: y},
                dx = options.dx || ( options.x !== undefined ? options.x - x : 0 ),
                dy = options.dy || ( options.y !== undefined ? options.y - y : 0 ),
                moves = options.moves || 3;

            this.simulateEvent(target, "mousedown", coord);

            for (; i < moves; i++) {
                x += dx / moves;
                y += dy / moves;

                coord = {
                    clientX: Math.round(x),
                    clientY: Math.round(y)
                };

                this.simulateEvent(eventDoc, "mousemove", coord);
            }

            if ($.contains(eventDoc, target)) {
                this.simulateEvent(target, "mouseup", coord);
                this.simulateEvent(target, "click", coord);
            } else {
                this.simulateEvent(eventDoc, "mouseup", coord);
            }
        }
    });

})(jQuery);
格式化json数据,高亮显示
/**
 * Turn a JSON value into HTML markup for syntax-highlighted display.
 *
 * Non-string input is first pretty-printed with JSON.stringify (2-space
 * indent). The text is then HTML-escaped (&, <, >) and every JSON token is
 * wrapped in <span class="..."> with one of the classes: key, string,
 * number, boolean, null.
 *
 * @param {*} json a JSON string, or any value serialisable by JSON.stringify
 * @returns {string} HTML markup; "" when the value cannot be serialised
 *                   (JSON.stringify returns undefined, e.g. for undefined or a function)
 */
function jsonSyntaxHighLight(json) {
    if (typeof json != 'string') {
        json = JSON.stringify(json, undefined, 2);
    }
    if (typeof json != 'string') {
        return '';
    }
    // Escape before adding markup so characters inside the data cannot be
    // interpreted as HTML; '&' must be handled first.
    json = json.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
    return json.replace(/("(\\u[a-zA-Z0-9]{4}|\\[^u]|[^\\"])*"(\s*:)?|\b(true|false|null)\b|-?\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?)/g, function (match) {
        var cls = 'number';
        if (/^"/.test(match)) {
            // A quoted token followed by ':' is an object key, otherwise a string value.
            if (/:$/.test(match)) {
                cls = 'key';
            } else {
                cls = 'string';
            }
        } else if (/true|false/.test(match)) {
            cls = 'boolean';
        } else if (/null/.test(match)) {
            cls = 'null';
        }
        return '<span class="' + cls + '">' + match + '</span>';
    });
}
操作:
(以懒财网公告为例进行测试;目前已经测试过懒财网、cnblogs等网站。)
1. 首先安装tampermonkey插件,下载地址:http://tampermonkey.net/
2. 新建脚本,复制web-extract-list.js的内容粘贴进去,按ctrl+s保存。
3. 新建脚本,复制web-extract-detail.js的内容粘贴进去,按ctrl+s保存。
4. 打开 https://www.lancai.cn/about/notice.html 查看执行效果。
采集结束之后,json页面:
注意:根据采集的网站不同需要变更js文件里面的// @match 处匹配的url, 以及task_json的脚本配置信息
项目代码github地址:https://github.com/jstarseven/web-list-extract
码字挺累的,转载请注明出处:http://www.cnblogs.com/jstarseven/p/6278197.html