|
|
|
## 1.Source data_parser_plugin
|
|
|
|
1. 从数据源获取数据,然后对其进行解析,提取所需的数据字段
|
|
|
|
|
|
|
|
## 2.支持插件
|
|
|
|
Plugins | 说明 | 文档链接
|
|
|
|
---|---|---
|
|
|
|
json | 解析JSON格式的数据
|
|
|
|
xml | 使用XMLReader拉取式(pull)解析器获取要迁移的XML数据
|
|
|
|
simple_xml | 使用SimpleXML API获取要迁移的XML数据
|
|
|
|
soap | 获取用于迁移的SOAP数据
|
|
|
|
dom_crawler | 使用DomCrawler组件解析HTML数据
|
|
|
|
|
|
|
|
|
|
|
|
## 3. JSON配置结构
|
|
|
|
1. ==备注: 数据源格式参考下面的「5.演示数据源」==
|
|
|
|
2. 示例数据源参考 #5.1
|
|
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
data_parser_plugin: json,
|
|
|
|
item_selector: entities // 选择一个数组key
|
|
|
|
fields:
|
|
|
|
-
|
|
|
|
"name":"roles",
|
|
|
|
"label":"roles",
|
|
|
|
"selector":"post/name" //获取entites.post.name的值
|
|
|
|
-
|
|
|
|
"name":"id",
|
|
|
|
"label":"roles",
|
|
|
|
"selector":"post/id"
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
### 3.1.字段说明
|
|
|
|
变量 | 值 | 说明
|
|
|
|
---|---|---
|
|
|
|
item_selector | 数据源key | 获取entities数组全部数据
|
|
|
|
name | 自定义字段名称|
|
|
|
|
label | 说明|
|
|
|
|
selector| {key}/{key} | 可以使用 / 分隔多级key逐层索引取值, post/name等同于 entities["post"]["name"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## 4. dom_crawler 配置结构
|
|
|
|
1. ==备注: 数据源参考下面==
|
|
|
|
2. 主要用于html数据采集,示例数据源参考 #5.2
|
|
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
data_parser_plugin: dom_crawler,
|
|
|
|
item_selector: ".winstyle55195 tr[height=\"30\"]" //选择html范围
|
|
|
|
fields:
|
|
|
|
-
|
|
|
|
"name":"link",
|
|
|
|
"label":"html",
|
|
|
|
"selector":"td > a" //选择 td a 里的数据
|
|
|
|
"attribute":"href" //选择a标签里href属性的值
|
|
|
|
-
|
|
|
|
"name":"title",
|
|
|
|
"label":"text",
|
|
|
|
"selector":"td > a"
|
|
|
|
"attribute":"title" //读取 a标签里的属性title值
|
|
|
|
-
|
|
|
|
"name":"a_info",
|
|
|
|
"label":"a_info",
|
|
|
|
"selector":"td > a"
|
|
|
|
"attribute": ["title","href"] //读取 a标签里的属性title和href
|
|
|
|
-
|
|
|
|
"name":"date",
|
|
|
|
"label":"date",
|
|
|
|
"selector":"td > span.timestyle55195"
|
|
|
|
"attribute":"_text" // 获取timestyle55195里所有的文本数据
|
|
|
|
|
|
|
|
```
|
|
|
|
### 4.1.字段说明
|
|
|
|
变量 | 值 | 说明
|
|
|
|
---|---|---
|
|
|
|
item_selector | css标识符 | 获取html结构范围
|
|
|
|
name | 自定义字段名称|
|
|
|
|
label | 说明|
|
|
|
|
selector| css标识符 | 选择具体css要提取的数据
|
|
|
|
attribute| href,title,_text,src等|提取标签里的属性,一次获取多个属性使用:["href","title"],_text表示获取纯文本数据
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## 5.演示数据源
|
|
|
|
#### 5.1 JSON数据源
|
|
|
|
|
|
|
|
```
|
|
|
|
{
|
|
|
|
"errno": 0,
|
|
|
|
"ecode": "SUCCEED",
|
|
|
|
"error": "Succeed.",
|
|
|
|
"entities": [
|
|
|
|
{
|
|
|
|
"post": {
|
|
|
|
"code": "QUYUXIAOSHOUJINGLI",
|
|
|
|
"formal": true,
|
|
|
|
"id": "060be905-4eb2-11e8-877e-00163e051882",
|
|
|
|
"name": "区域销售经理"
|
|
|
|
},
|
|
|
|
"dept": {
|
|
|
|
"code": "2060006",
|
|
|
|
"parent": "20600",
|
|
|
|
"independent": false,
|
|
|
|
"id": "6cdd2f9a-4651-11e9-af16-00163e051882",
|
|
|
|
"name": "华东南京营销"
|
|
|
|
},
|
|
|
|
"code": "200003020",
|
|
|
|
"source": "PULL"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"post": {
|
|
|
|
"code": "YUANGONG",
|
|
|
|
"formal": true,
|
|
|
|
"id": "060a0b5c-4eb2-11e8-877e-00163e051882",
|
|
|
|
"name": "员工"
|
|
|
|
},
|
|
|
|
"dept": {
|
|
|
|
"code": "2060006",
|
|
|
|
"parent": "20600",
|
|
|
|
"independent": false,
|
|
|
|
"id": "6cdd2f9a-4651-11e9-af16-00163e051882",
|
|
|
|
"name": "华东南京营销"
|
|
|
|
},
|
|
|
|
"code": "201803020",
|
|
|
|
"source": "PULL"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"post": {
|
|
|
|
"code": "FACULTY",
|
|
|
|
"formal": false,
|
|
|
|
"id": "9231b576-e0e4-11e5-aac5-00163e0226a1",
|
|
|
|
"name": "Faculty"
|
|
|
|
},
|
|
|
|
"dept": {
|
|
|
|
"code": "10800",
|
|
|
|
"parent": "0",
|
|
|
|
"independent": true,
|
|
|
|
"id": "05f36f2e-4eb2-11e8-877e-00163e051882",
|
|
|
|
"name": "营销部"
|
|
|
|
},
|
|
|
|
"code": "",
|
|
|
|
"source": "BUILTIN"
|
|
|
|
}
|
|
|
|
]
|
|
|
|
}
|
|
|
|
```
|
|
|
|
#### 5.2 html数据源
|
|
|
|
备注:来源 http://www.dgpt.edu.cn/index/tzgg.htm
|
|
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
<table width="100%" class="winstyle55195">
|
|
|
|
|
|
|
|
<tbody><tr id="line55195_0" height="30">
|
|
|
|
<td width="1" nowrap=""><span class="leaderfont55195">· </span></td>
|
|
|
|
<td width="100%" style="font-size:9pt">
|
|
|
|
|
|
|
|
|
|
|
|
<a class="c55195" href="../info/1009/9946.htm" target="_blank" title="东莞职业技术学院2020届高校(东莞)毕业生春季网络招聘会邀请函">东莞职业技术学院2020届高校(东莞)毕业生春季网络招聘会邀请函
|
|
|
|
</a>
|
|
|
|
|
|
|
|
|
|
|
|
</td>
|
|
|
|
<td width="1%" nowrap=""><span class="timestyle55195">2020-03-28 </span></td>
|
|
|
|
<td width="1%" nowrap=""></td>
|
|
|
|
</tr>
|
|
|
|
|
|
|
|
|
|
|
|
<tr id="line55195_1" height="30">
|
|
|
|
<td width="1" nowrap=""><span class="leaderfont55195">· </span></td>
|
|
|
|
<td width="100%" style="font-size:9pt">
|
|
|
|
|
|
|
|
|
|
|
|
<a class="c55195" href="../info/1009/9855.htm" target="_blank" title="关于对2020年广东省科技创新战略专项资金项目(攀登计划专项)拟推报项目的公示">关于对2020年广东省科技创新战略专项资金项目(攀登计划专项)拟推报项目的公...
|
|
|
|
</a>
|
|
|
|
|
|
|
|
|
|
|
|
</td>
|
|
|
|
<td width="1%" nowrap=""><span class="timestyle55195">2020-01-03 </span></td>
|
|
|
|
<td width="1%" nowrap=""></td>
|
|
|
|
</tr>
|
|
|
|
|
|
|
|
<tr><td colspan="3" align="left">
|
|
|
|
<table cellpadding="0" cellspacing="0" border="0">
|
|
|
|
<tbody><tr><td colspan="0"><table cellspacing="0" class="headStyle1h43iuqoza" width="100%" cellpadding="1"><tbody><tr valign="middle"><td nowrap="" align="left" width="1%" id="fanye55195">共38条 1/2 </td><td nowrap="" align="left"><div><span class="PrevDisabled">首页</span><span class="PrevDisabled">上页</span><a href="tzgg/1.htm" class="Next">下页</a><a href="tzgg/1.htm" class="Next">尾页</a> <input align="absmiddle" type="button" class="defaultButtonStyle" id="gotopagebut" name="a55195Find" value="转到" onclick="javascript:a55195_gopage_fun()"><input size="2" align="absmiddle" class="defaultInputStyle" name="a55195GOPAGE" id="a55195GOPAGE" value="" style="margin-left:1px;margin-right:1px">页</div></td></tr></tbody></table>
|
|
|
|
</td></tr></tbody></table>
|
|
|
|
</td></tr>
|
|
|
|
</tbody></table>
|
|
|
|
```