tampermonkey,采用js解析自定义脚本,实现网页列表数据采集分析

2017-01-12 19:08:32来源:作者:人点击

最近一直在做数据采集的事情,目的是使用java开发一套分析指定采集规则,模拟用户动作做数据提取。
因此定义了一套动作脚本,open,click,get,list,opentab,closetab。。。
java解析脚本,调用phantomjs做数据提取,生成数据json文件,对外提供数据接口。
采集引擎终于写的差不多了,虽然还有很多问题需要修改,但是终于不用加班了,嘿嘿嘿。-------jstarseven

码字挺累的,转载请注明出处:http://www.cnblogs.com/jstarseven/p/6278197.html
言归正传,由于一直搞这些东西,突然想着拿js去写个采集玩一玩,就用tampermonkey,毕竟好久没玩了。

简介:针对一些网站的数据列表,定义采集脚本,模拟用户操作,做列表数据提取,生成json数据格式化展示。

json采集脚本定义:

{
    "type": "list",
    "selector": "",//列表选择器
    "max_page": 1,//采集页数
    "page_selector": "",//翻页选择器
    "iframe_selector": "",//iframe 选择器
    "datas": [//采集字段定义
        {
            "selector": " ",//字段选择器<此处为针对列表的子选择器>
            "column": "title",//字段名称
            "from": "text",//采集类型
            "iframe_selector": "",//iframe选择器 防止一些网站怪异 一般不需要
            "open_tab": [//当前字段开新标签做采集
                {
                    "selector": " ",//新标签字段选择器
                    "column": " ",
                    "from": "text",
                    "iframe_selector": ""
                },
                {
                    "selector": " ",
                    "column": " ",
                    "from": "text",
                    "iframe_selector": ""
                },
                {
                    "selector": " ",
                    "column": " ",
                    "from": "text",
                    "iframe_selector": ""
                }
            ]
        },
        {
            "selector": " ",//字段选择器
            "column": " ",
            "from": "text",
            "iframe_selector": ""
        },
        {
            "selector": " ",//字段选择器
            "column": " ",
            "from": "text",
            "iframe_selector": ""
        }
    ]
}




脚本定义好了,剩下的就是写js代码解析脚本,做数据采集,数据合并了。
那么怎么去解析实现呢,针对新开标签页的数据采集,怎么样要和之前的列表项数据做合并,保证数据的完整性呢?
1.数据需要存储,那么存在哪里呢?首先想到sessionStorage,但新开标签页时sessionStorage中的数据无法共享,
因此改用localStorage。localStorage的容量上限一般在5MB左右,足以存储一般列表的十几页数据。
2.详情页面的数据和列表项数据合并,既然上面说到localStorage,那么就在localStorage里面放入一个指定的map,存放列表数据
针对列表的每一项做一个key,然后再新开标签的时候传递key,提取详情的数据,将详情页面数据,放入map中指定key的数据中。

js实现map方便数据存储:



/*
 * Map object: a small key/value store backed by an array of {key, value}
 * entries, kept in insertion order.
 *
 * NOTE: this constructor intentionally shadows the built-in ES2015 Map inside
 * this script. Its state lives in the plain `elements` array, so an instance
 * can be rebuilt after JSON serialisation by assigning `elements` back.
 *
 * Keys are compared with loose equality (==), so 1 and "1" address the same
 * entry. This is kept on purpose for backward compatibility with callers that
 * may store a key as a number and look it up as a string.
 *
 * Interface:
 * size()               number of entries
 * isEmpty()            true when there are no entries
 * clear()              remove every entry
 * put(key, value)      add an entry, or overwrite the value of an existing key
 * remove(key)          delete the entry for key; true on success, false otherwise
 * get(key)             value stored for key, or null when absent
 * element(index)       the {key, value} entry at index, or null when out of range
 * containsKey(key)     true when an entry with this key exists
 * containsValue(value) true when an entry with this value exists
 * values()             array of all values
 * keys()               array of all keys
 */
function Map() {
    this.elements = [];

    // Number of entries in the map.
    this.size = function () {
        return this.elements.length;
    };

    // True when the map holds no entries.
    this.isEmpty = function () {
        return (this.elements.length < 1);
    };

    // Drop every entry.
    this.clear = function () {
        this.elements = [];
    };

    // Insert (key, value); an existing key keeps its position and gets the new value.
    this.put = function (_key, _value) {
        for (var i = 0; i < this.elements.length; i++) {
            if (this.elements[i].key == _key) {
                this.elements[i].value = _value;
                return;
            }
        }
        this.elements.push({
            key: _key,
            value: _value
        });
    };

    // Delete the entry for _key. Returns true on success, false when the key is
    // absent or `elements` is unusable (best-effort, never throws).
    this.remove = function (_key) {
        try {
            for (var i = 0; i < this.elements.length; i++) {
                if (this.elements[i].key == _key) {
                    this.elements.splice(i, 1);
                    return true;
                }
            }
        } catch (e) {
            return false;
        }
        return false;
    };

    // Value stored for _key, or null when the key is absent or lookup fails.
    this.get = function (_key) {
        try {
            for (var i = 0; i < this.elements.length; i++) {
                if (this.elements[i].key == _key) {
                    return this.elements[i].value;
                }
            }
        } catch (e) {
            return null;
        }
        // A missing key must yield null (not undefined), as the interface promises.
        return null;
    };

    // The {key, value} entry at _index, or null when the index is out of range.
    this.element = function (_index) {
        if (_index < 0 || _index >= this.elements.length) {
            return null;
        }
        return this.elements[_index];
    };

    // True when an entry with _key exists (best-effort, never throws).
    this.containsKey = function (_key) {
        try {
            for (var i = 0; i < this.elements.length; i++) {
                if (this.elements[i].key == _key) {
                    return true;
                }
            }
        } catch (e) {
            return false;
        }
        return false;
    };

    // True when an entry with _value exists (best-effort, never throws).
    this.containsValue = function (_value) {
        try {
            for (var i = 0; i < this.elements.length; i++) {
                if (this.elements[i].value == _value) {
                    return true;
                }
            }
        } catch (e) {
            return false;
        }
        return false;
    };

    // All values, in insertion order.
    this.values = function () {
        var arr = [];
        for (var i = 0; i < this.elements.length; i++) {
            arr.push(this.elements[i].value);
        }
        return arr;
    };

    // All keys, in insertion order.
    this.keys = function () {
        var arr = [];
        for (var i = 0; i < this.elements.length; i++) {
            arr.push(this.elements[i].key);
        }
        return arr;
    };
}




js实现操作localStorage:



/**
 * Load the collected-data map persisted in localStorage under "data_maps".
 *
 * The stored value is the JSON form of this script's custom Map, i.e. an
 * object with an `elements` array. A fresh Map is rebuilt around that array
 * so the Map methods are available again.
 *
 * A missing, empty, corrupt or unexpectedly shaped stored value yields an
 * empty Map instead of throwing, so one bad write cannot abort collection.
 *
 * @returns {Map} the persisted map, or an empty one
 */
function getTaskDataMap() {
    var datas = new Map();
    var stored = localStorage.getItem("data_maps");
    // isNullParam is defined outside this listing; presumably it reports
    // null/undefined/empty values as "missing" - TODO confirm.
    if (isNullParam(stored)) {
        return datas;
    }
    try {
        var parsed = JSON.parse(stored);
        if (parsed && Array.isArray(parsed.elements)) {
            datas.elements = parsed.elements;
        }
    } catch (e) {
        // Stored text was not valid JSON: report it and start from an empty map.
        console.warn("data_maps in localStorage is not valid JSON, ignoring it", e);
    }
    return datas;
}

/**
 * Reset the persisted data map by overwriting "data_maps" with an empty string.
 */
function clearTaskDataMap() {
    localStorage.setItem("data_maps", "");
}

/**
 * Store one entry in the persisted data map and write the map back.
 * Does nothing when either argument is reported missing by isNullParam.
 *
 * @param key    entry key
 * @param values data stored under that key
 */
function addTaskDataMap(key, values) {
    if (isNullParam(key) || isNullParam(values)) {
        return;
    }
    var data_maps = getTaskDataMap();
    data_maps.put(key, values);
    // JSON.stringify drops the Map's function members, leaving {"elements": [...]}.
    localStorage.setItem("data_maps", JSON.stringify(data_maps));
}




采用jquery.simulate.js实现点击



/*!
 * jQuery Simulate v@VERSION - simulate browser mouse and keyboard events
 * https://github.com/jquery/jquery-simulate
 *
 * Copyright jQuery Foundation and other contributors
 * Released under the MIT license.
 * http://jquery.org/license
 *
 * Date: @DATE
 */

;(function ($, undefined) {

    var rkeyEvent = /^key/,
        rmouseEvent = /^(?:mouse|contextmenu)|click/;

    // jQuery plugin entry point: $(selector).simulate(type, options)
    // runs one simulation per matched element.
    $.fn.simulate = function (type, options) {
        return this.each(function () {
            new $.simulate(this, type, options);
        });
    };

    // Constructor: dispatches to a dedicated simulateXxx method when one exists
    // for the event type (focus, blur, drag), otherwise builds and fires a
    // generic key/mouse event.
    $.simulate = function (elem, type, options) {
        var method = $.camelCase("simulate-" + type);

        this.target = elem;
        this.options = options;

        if (this[method]) {
            this[method]();
        } else {
            this.simulateEvent(elem, type, options);
        }
    };

    $.extend($.simulate, {

        keyCode: {
            BACKSPACE: 8,
            COMMA: 188,
            DELETE: 46,
            DOWN: 40,
            END: 35,
            ENTER: 13,
            ESCAPE: 27,
            HOME: 36,
            LEFT: 37,
            NUMPAD_ADD: 107,
            NUMPAD_DECIMAL: 110,
            NUMPAD_DIVIDE: 111,
            NUMPAD_ENTER: 108,
            NUMPAD_MULTIPLY: 106,
            NUMPAD_SUBTRACT: 109,
            PAGE_DOWN: 34,
            PAGE_UP: 33,
            PERIOD: 190,
            RIGHT: 39,
            SPACE: 32,
            TAB: 9,
            UP: 38
        },

        buttonCode: {
            LEFT: 0,
            MIDDLE: 1,
            RIGHT: 2
        }
    });

    $.extend($.simulate.prototype, {

        // Build the event object for `type` and dispatch it on `elem`.
        simulateEvent: function (elem, type, options) {
            var event = this.createEvent(type, options);
            this.dispatchEvent(elem, type, event, options);
        },

        // Returns a key or mouse event depending on the type name;
        // returns undefined for any other type.
        createEvent: function (type, options) {
            if (rkeyEvent.test(type)) {
                return this.keyEvent(type, options);
            }

            if (rmouseEvent.test(type)) {
                return this.mouseEvent(type, options);
            }
        },

        mouseEvent: function (type, options) {
            var event, eventDoc, doc, body;
            options = $.extend({
                bubbles: true,
                cancelable: (type !== "mousemove"),
                view: window,
                detail: 0,
                screenX: 0,
                screenY: 0,
                clientX: 1,
                clientY: 1,
                ctrlKey: false,
                altKey: false,
                shiftKey: false,
                metaKey: false,
                button: 0,
                relatedTarget: undefined
            }, options);

            if (document.createEvent) {
                event = document.createEvent("MouseEvents");
                event.initMouseEvent(type, options.bubbles, options.cancelable,
                    options.view, options.detail,
                    options.screenX, options.screenY, options.clientX, options.clientY,
                    options.ctrlKey, options.altKey, options.shiftKey, options.metaKey,
                    options.button, options.relatedTarget || document.body.parentNode);

                // IE 9+ creates events with pageX and pageY set to 0.
                // Trying to modify the properties throws an error,
                // so we define getters to return the correct values.
                if (event.pageX === 0 && event.pageY === 0 && Object.defineProperty) {
                    eventDoc = event.relatedTarget.ownerDocument || document;
                    doc = eventDoc.documentElement;
                    body = eventDoc.body;

                    Object.defineProperty(event, "pageX", {
                        get: function () {
                            return options.clientX +
                                ( doc && doc.scrollLeft || body && body.scrollLeft || 0 ) -
                                ( doc && doc.clientLeft || body && body.clientLeft || 0 );
                        }
                    });
                    Object.defineProperty(event, "pageY", {
                        get: function () {
                            return options.clientY +
                                ( doc && doc.scrollTop || body && body.scrollTop || 0 ) -
                                ( doc && doc.clientTop || body && body.clientTop || 0 );
                        }
                    });
                }
            } else if (document.createEventObject) {
                event = document.createEventObject();
                $.extend(event, options);
                // standards event.button uses constants defined here: http://msdn.microsoft.com/en-us/library/ie/ff974877(v=vs.85).aspx
                // old IE event.button uses constants defined here: http://msdn.microsoft.com/en-us/library/ie/ms533544(v=vs.85).aspx
                // so we actually need to map the standard back to oldIE
                event.button = {
                    0: 1,
                    1: 4,
                    2: 2
                }[event.button] || ( event.button === -1 ? 0 : event.button );
            }

            return event;
        },

        keyEvent: function (type, options) {
            var event;
            options = $.extend({
                bubbles: true,
                cancelable: true,
                view: window,
                ctrlKey: false,
                altKey: false,
                shiftKey: false,
                metaKey: false,
                keyCode: 0,
                charCode: undefined
            }, options);

            if (document.createEvent) {
                try {
                    event = document.createEvent("KeyEvents");
                    event.initKeyEvent(type, options.bubbles, options.cancelable, options.view,
                        options.ctrlKey, options.altKey, options.shiftKey, options.metaKey,
                        options.keyCode, options.charCode);
                    // initKeyEvent throws an exception in WebKit
                    // see: http://stackoverflow.com/questions/6406784/initkeyevent-keypress-only-works-in-firefox-need-a-cross-browser-solution
                    // and also https://bugs.webkit.org/show_bug.cgi?id=13368
                    // fall back to a generic event until we decide to implement initKeyboardEvent
                } catch (err) {
                    event = document.createEvent("Events");
                    event.initEvent(type, options.bubbles, options.cancelable);
                    $.extend(event, {
                        view: options.view,
                        ctrlKey: options.ctrlKey,
                        altKey: options.altKey,
                        shiftKey: options.shiftKey,
                        metaKey: options.metaKey,
                        keyCode: options.keyCode,
                        charCode: options.charCode
                    });
                }
            } else if (document.createEventObject) {
                event = document.createEventObject();
                $.extend(event, options);
            }

            // Old IE and Opera report the character in keyCode and leave charCode unset.
            // The class must be [\w.] (word characters and dots, i.e. a version string);
            // the published listing had the backslash turned into a slash.
            if (!!/msie [\w.]+/.exec(navigator.userAgent.toLowerCase()) || (({}).toString.call(window.opera) === "[object Opera]")) {
                event.keyCode = (options.charCode > 0) ? options.charCode : options.keyCode;
                event.charCode = undefined;
            }

            return event;
        },

        // Fire the event using whichever dispatch mechanism the element supports.
        dispatchEvent: function (elem, type, event) {
            if (elem.dispatchEvent) {
                elem.dispatchEvent(event);
            } else if (type === "click" && elem.click && elem.nodeName.toLowerCase() === "input") {
                elem.click();
            } else if (elem.fireEvent) {
                elem.fireEvent("on" + type, event);
            }
        },

        simulateFocus: function () {
            var focusinEvent,
                triggered = false,
                element = $(this.target);

            function trigger() {
                triggered = true;
            }

            element.bind("focus", trigger);
            element[0].focus();

            // If the native focus() did not fire the handler, trigger
            // focusin and the focus handlers manually.
            if (!triggered) {
                focusinEvent = $.Event("focusin");
                focusinEvent.preventDefault();
                element.trigger(focusinEvent);
                element.triggerHandler("focus");
            }
            element.unbind("focus", trigger);
        },

        simulateBlur: function () {
            var focusoutEvent,
                triggered = false,
                element = $(this.target);

            function trigger() {
                triggered = true;
            }

            element.bind("blur", trigger);
            element[0].blur();

            // blur events are async in IE
            setTimeout(function () {
                // IE won't let the blur occur if the window is inactive
                if (element[0].ownerDocument.activeElement === element[0]) {
                    element[0].ownerDocument.body.focus();
                }

                // Firefox won't trigger events if the window is inactive
                // IE doesn't trigger events if we had to manually focus the body
                if (!triggered) {
                    focusoutEvent = $.Event("focusout");
                    focusoutEvent.preventDefault();
                    element.trigger(focusoutEvent);
                    element.triggerHandler("blur");
                }
                element.unbind("blur", trigger);
            }, 1);
        }
    });


    /** complex events **/

    // Viewport coordinates of the element's centre.
    function findCenter(elem) {
        var offset,
            document = $(elem.ownerDocument);
        elem = $(elem);
        offset = elem.offset();

        return {
            x: offset.left + elem.outerWidth() / 2 - document.scrollLeft(),
            y: offset.top + elem.outerHeight() / 2 - document.scrollTop()
        };
    }

    // Viewport coordinates of the element's top-left corner.
    function findCorner(elem) {
        var offset,
            document = $(elem.ownerDocument);
        elem = $(elem);
        offset = elem.offset();

        return {
            x: offset.left - document.scrollLeft(),
            y: offset.top - document.scrollTop()
        };
    }

    $.extend($.simulate.prototype, {
        // mousedown on the target, `moves` mousemove events on the document
        // covering (dx, dy), then mouseup (plus click when the target is still
        // attached to the document).
        simulateDrag: function () {
            var i = 0,
                target = this.target,
                eventDoc = target.ownerDocument,
                options = this.options,
                center = options.handle === "corner" ? findCorner(target) : findCenter(target),
                x = Math.floor(center.x),
                y = Math.floor(center.y),
                coord = {clientX: x, clientY: y},
                dx = options.dx || ( options.x !== undefined ? options.x - x : 0 ),
                dy = options.dy || ( options.y !== undefined ? options.y - y : 0 ),
                moves = options.moves || 3;

            this.simulateEvent(target, "mousedown", coord);

            for (; i < moves; i++) {
                x += dx / moves;
                y += dy / moves;

                coord = {
                    clientX: Math.round(x),
                    clientY: Math.round(y)
                };

                this.simulateEvent(eventDoc, "mousemove", coord);
            }

            if ($.contains(eventDoc, target)) {
                this.simulateEvent(target, "mouseup", coord);
                this.simulateEvent(target, "click", coord);
            } else {
                this.simulateEvent(eventDoc, "mouseup", coord);
            }
        }
    });

})(jQuery);


View Code


格式化json数据,高亮显示



/**
 * Format a value as JSON and wrap every token in a <span> whose class names
 * its kind ("key", "string", "number", "boolean" or "null"), for CSS-based
 * syntax highlighting.
 *
 * @param json a JSON string, or any value that JSON.stringify can serialise
 *             (non-strings are pretty-printed with a 2-space indent)
 * @returns {string} HTML markup with the JSON text escaped and tokens wrapped
 */
function jsonSyntaxHighLight(json) {
    if (typeof json != 'string') {
        json = JSON.stringify(json, undefined, 2);
    }
    // Escape HTML metacharacters first so collected text cannot inject markup.
    // '&' must be handled before '<' and '>' to avoid double-escaping.
    json = json.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
    // Token alternatives, in order: a string literal (optionally followed by
    // ':' which makes it a key), the literals true/false/null, or a number.
    return json.replace(/("(\\u[a-zA-Z0-9]{4}|\\[^u]|[^\\"])*"(\s*:)?|\b(true|false|null)\b|-?\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?)/g, function (match) {
        var cls = 'number';
        if (/^"/.test(match)) {
            if (/:$/.test(match)) {
                cls = 'key';
            } else {
                cls = 'string';
            }
        } else if (/true|false/.test(match)) {
            cls = 'boolean';
        } else if (/null/.test(match)) {
            cls = 'null';
        }
        return '<span class="' + cls + '">' + match + '</span>';
    });
}




操作:
(以懒财网公告为例,测试)目前已经测试懒财,cnblog。。。
1.首先安装tampermonkey插件,下载地址: http://tampermonkey.net/
2.新建脚本,复制web-extract-list.js内容粘贴,按ctrl+s保存
3.新建脚本,复制web-extract-detail.js 内容粘贴 ctrl+s
4.打开https://www.lancai.cn/about/notice.html 看执行效果

采集结束之后,json页面:



注意:根据采集的网站不同需要变更js文件里面的// @match 处匹配的url, 以及task_json的脚本配置信息

项目代码github地址:https://github.com/jstarseven/web-list-extract

码字挺累的,转载请注明出处:http://www.cnblogs.com/jstarseven/p/6278197.html

最新文章

123

最新摄影

微信扫一扫

第七城市微信公众平台