|
|
|
## 1. URL endpoint
|
|
|
|
|
|
|
|
- ==备注:配置使用yml文件说明,所有参数都可以在后台接口管理界面里使用Form可视化配置.==
|
|
|
|
- 演示1: 通过oauth2 授权获取 token,然后通过token获取infoplus岗位信息
|
|
|
|
- 演示2: 抓取html,然后提取相关信息,进行转换处理
|
|
|
|
- 门户已经集成了常用的第三方数据接口配置,可在后台查看具体配置参数
|
|
|
|
- Infoplus / oauth2
|
|
|
|
- 网易邮箱 / RSA加密
|
|
|
|
- QQ 邮箱 / oauth2
|
|
|
|
- 迪塔维 / http
|
|
|
|
- 致远OA / token
|
|
|
|
- 希嘉 / oauth2 token
|
|
|
|
- 抓取Html
|
|
|
|
### 2. 通过 oauth2 授权获取 infoplus岗位信息
|
|
|
|
#### 2.1. 获取 infoplus token的配置
|
|
|
|
```
|
|
|
|
settings:
|
|
|
|
api:
|
|
|
|
etl_config:
|
|
|
|
source:
|
|
|
|
data_fetcher_plugin: guzzle_http
|
|
|
|
http:
|
|
|
|
method: GET
|
|
|
|
timeout: 15
|
|
|
|
authentication:
|
|
|
|
plugin: oauth2_client
|
|
|
|
oauth2:
|
|
|
|
grant_type: client_credentials
|
|
|
|
urlAccessToken: http://sandbox.qtgl.com.cn/infoplus/oauth2/token
|
|
|
|
clientId: demo
|
|
|
|
clientSecret: demosecret
|
|
|
|
scopes: sys_profile
|
|
|
|
timeout: '15'
|
|
|
|
plugin: url
|
|
|
|
ids: []
|
|
|
|
api_id: '3'
|
|
|
|
process: []
|
|
|
|
destination:
|
|
|
|
plugin: api_response_data
|
|
|
|
id: api_id_3
|
|
|
|
label: 获infoplus的token
|
|
|
|
migration_group: API
|
|
|
|
id: '3'
|
|
|
|
type: url
|
|
|
|
status: 1
|
|
|
|
title: 获infoplus的token
|
|
|
|
url_alias: "/demo/infoplus/accesstoken"
|
|
|
|
access_roles:
|
|
|
|
- authenticated
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
#### 2.2. 通过token获取positions接口的信息,然后处理,配置如下
|
|
|
|
|
|
|
|
```
|
|
|
|
settings:
|
|
|
|
api:
|
|
|
|
etl_config:
|
|
|
|
source:
|
|
|
|
urls:
|
|
|
|
- http://sandbox.qtgl.com.cn/infoplus/apis/v2/user/[current-user:name]/positions
|
|
|
|
data_fetcher_plugin: guzzle_http
|
|
|
|
http:
|
|
|
|
method: GET
|
|
|
|
query:
|
|
|
|
access_token: "[api:svc:pm.parser::getApiResponseData(3,access_token)]"
|
|
|
|
timeout: 15
|
|
|
|
data_parser_plugin: json
|
|
|
|
item_selector: entities
|
|
|
|
fields:
|
|
|
|
- name: roles
|
|
|
|
label: roles
|
|
|
|
selector: "/post/name"
|
|
|
|
- name: dept
|
|
|
|
label: dept
|
|
|
|
selector: dept/name
|
|
|
|
plugin: url
|
|
|
|
ids: []
|
|
|
|
api_id: '6'
|
|
|
|
process:
|
|
|
|
roles:
|
|
|
|
- plugin: get
|
|
|
|
source: roles
|
|
|
|
rids:
|
|
|
|
- plugin: entity_lookup
|
|
|
|
source: roles
|
|
|
|
entity_type: user_role
|
|
|
|
value_key: label
|
|
|
|
dept:
|
|
|
|
- plugin: entity_lookup
|
|
|
|
source: dept
|
|
|
|
entity_type: taxonomy_term
|
|
|
|
bundle_key: vid
|
|
|
|
bundle: department
|
|
|
|
ignore_case: 'true'
|
|
|
|
value_key: name
|
|
|
|
dept_name:
|
|
|
|
- plugin: get
|
|
|
|
source: dept
|
|
|
|
destination:
|
|
|
|
plugin: api_response_data
|
|
|
|
id: api_id_6
|
|
|
|
label: 获取岗位
|
|
|
|
migration_group: API
|
|
|
|
id: '6'
|
|
|
|
type: url
|
|
|
|
status: 1
|
|
|
|
title: 获取岗位
|
|
|
|
url_alias: "/demo/infoplus/me/positions"
|
|
|
|
access_roles:
|
|
|
|
- authenticated
|
|
|
|
```
|
|
|
|
|
|
|
|
#### 2.2.1. 返回最终数据
|
|
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
[
|
|
|
|
[
|
|
|
|
{
|
|
|
|
"roles": "区域销售经理",
|
|
|
|
"dept": "16",
|
|
|
|
"dept_name": "华东南京营销"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
[
|
|
|
|
{
|
|
|
|
"roles": "员工",
|
|
|
|
"rids": "yuangong",
|
|
|
|
"dept": "16",
|
|
|
|
"dept_name": "华东南京营销"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
[
|
|
|
|
{
|
|
|
|
"roles": "Faculty",
|
|
|
|
"dept_name": "营销部"
|
|
|
|
}
|
|
|
|
]
|
|
|
|
]
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#### 2.3. 结构说明
|
|
|
|
|
|
|
|
变量 | 值 |说明
|
|
|
|
---|---|---
|
|
|
|
source | url | 定义source插件
|
|
|
|
urls | [] | 请求的地址,支持多个,比如用于分页请求,数据源结构必须相同
|
|
|
|
data_fetcher_plugin | guzzle_http | 一款处理http请求的插件
|
|
|
|
http|{"method" ...}| 根据guzzle插件参数定义,具体可以[参考手册](https://github.com/guzzle/guzzle)
|
|
|
|
authentication|{"plugin":"oauth2_client"}| 标准的oauth2协议支持
|
|
|
|
data_parser_plugin | json \| xml \| dom_crawler \| simple_xml \| soap | 对各种数据类型进行解析
|
|
|
|
item_selector|{entities}| 根据接口返回的数据结构,选择想要读取的key下面的数据。infoplus返回的结构是{"entities":{}},所以我们想取entities下面的数据。
|
|
|
|
fields|name,selector| name:数据的名称,selector选择哪个字段,多维数组可以使用 / 分开
|
|
|
|
ids|{"ids":{"<field_name>":{"type":"<string>"}}}|唯一标识符,主要为了映射到目标,比如数据库里的唯一主键。支持多值
|
|
|
|
process|{field_name}:["plugin":{get}],"source":{source_field_name}|自定义的field name,然后通过各种插件转换source对应的字段. [查询相关process里面提供的plugins](https://note.youdao.com/)
|
|
|
|
destination|api_response_data |把结果生成json格式数据
|
|
|
|
|
|
|
|
### 3. 抓取html,然后对其提取、转换
|
|
|
|
|
|
|
|
#### 3.1 抓取 www.dgpt.edu.cn/index/tzgg.htm
|
|
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
settings:
|
|
|
|
api:
|
|
|
|
etl_config:
|
|
|
|
source:
|
|
|
|
urls:
|
|
|
|
- http://www.dgpt.edu.cn/index/tzgg.htm
|
|
|
|
data_fetcher_plugin: guzzle_http
|
|
|
|
http:
|
|
|
|
method: GET
|
|
|
|
timeout: 15
|
|
|
|
data_parser_plugin: dom_crawler
|
|
|
|
item_selector: .winstyle55195 tr[height="30"]
|
|
|
|
fields:
|
|
|
|
- name: link
|
|
|
|
label: html
|
|
|
|
selector: td > a
|
|
|
|
attribute: href
|
|
|
|
- name: title
|
|
|
|
label: text
|
|
|
|
selector: td > a
|
|
|
|
attribute: title
|
|
|
|
- name: date
|
|
|
|
label: date
|
|
|
|
selector: td > span.timestyle55195
|
|
|
|
attribute: _text
|
|
|
|
ids:
|
|
|
|
title:
|
|
|
|
type: string
|
|
|
|
plugin: url
|
|
|
|
api_id: '11'
|
|
|
|
process:
|
|
|
|
title:
|
|
|
|
- plugin: get
|
|
|
|
source: title
|
|
|
|
url_redirection:
|
|
|
|
- plugin: get
|
|
|
|
source: link
|
|
|
|
- plugin: str_replace
|
|
|
|
source: link
|
|
|
|
search: "../info"
|
|
|
|
replace: http://www.dgpt.edu.cn/info
|
|
|
|
published_date:
|
|
|
|
- plugin: get
|
|
|
|
source: date
|
|
|
|
- plugin: str_replace
|
|
|
|
search: " "
|
|
|
|
replace: " 12:00"
|
|
|
|
- plugin: callback
|
|
|
|
callable: strtotime
|
|
|
|
category:
|
|
|
|
- plugin: default_value
|
|
|
|
default_value: "[13,14]"
|
|
|
|
destination:
|
|
|
|
default_bundle: news
|
|
|
|
plugin: api_response_data
|
|
|
|
id: api_id_11
|
|
|
|
label: 数据采集html实例
|
|
|
|
migration_group: API
|
|
|
|
id: '11'
|
|
|
|
type: url
|
|
|
|
status: 1
|
|
|
|
title: 数据采集html实例
|
|
|
|
url_alias: "/portal/api/v2/news/note/"
|
|
|
|
access_roles:
|
|
|
|
- authenticated
|
|
|
|
```
|
|
|
|
#### 3.2 抓取结果
|
|
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
[
|
|
|
|
[
|
|
|
|
{
|
|
|
|
"title":"东莞职业技术学院2020届高校(东莞)毕业生春季网络招聘会邀请函",
|
|
|
|
"url_redirection":"http://www.dgpt.edu.cn/info/1009/9946.htm",
|
|
|
|
"published_date":1585368000,
|
|
|
|
"category":[
|
|
|
|
13,
|
|
|
|
14
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
[
|
|
|
|
{
|
|
|
|
"title":"关于对2020年广东省科技创新战略专项资金项目(攀登计划专项)拟推报项目的公示",
|
|
|
|
"url_redirection":"http://www.dgpt.edu.cn/info/1009/9855.htm",
|
|
|
|
"published_date":1578024000,
|
|
|
|
"category":[
|
|
|
|
13,
|
|
|
|
14
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
[
|
|
|
|
{
|
|
|
|
"title":"东莞职业技术学院关于设置2019年成人高等教育校外教学点的通告",
|
|
|
|
"url_redirection":"http://www.dgpt.edu.cn/info/1009/9823.htm",
|
|
|
|
"published_date":1577073600,
|
|
|
|
"category":[
|
|
|
|
13,
|
|
|
|
14
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
[
|
|
|
|
{
|
|
|
|
"title":"东莞职业技术学院2020届高校(东莞)毕业生供需见面会参会企业展位-30日",
|
|
|
|
"url_redirection":"http://www.dgpt.edu.cn/info/1009/9578.htm",
|
|
|
|
"published_date":1572494400,
|
|
|
|
"category":[
|
|
|
|
13,
|
|
|
|
14
|
|
|
|
]
|
|
|
|
}
|
|
|
|
]
|
|
|
|
]
|
|
|
|
```
|
|
|
|
|