Go开发的分布式爬虫框架 yispider

释放双眼,带上耳机,听听看~!

yispider一款分布式爬虫平台,帮助你更好的管理和开发爬虫。
内置一套爬虫定义规则(模版),可使用模版快速定义爬虫,也可当作框架手动开发爬虫 .
.
码云地址:https://gitee.com/bilibala/YiSpider
github地址:https://github.com/2young2simple/yispider

架构

目前框架分为2个部分:

1.爬虫部分(spider节点):

内部结构参考python scrapy框架,主要由 schedule,page process,pipline 4个部分组成,单个爬虫单独调度器,单独上下文管理,目前内置2中pipline的方式,控制台和文件,节点信息注册在etcd上用于manage节点发现。

  • core:负责爬虫生命周期、上下文的管理,负责爬虫的运行。

  • schedule:负责爬虫请求的调度。(目前只有一种基于channel的调度器,无法单个爬虫多worker运行,可自行实现基于redis,或者mq服务的调度器即可实现)

  • process (page process):负责请求结果的处理。

  • pipline: 结果的输出输出到不同渠道,如控制台,文件,消息队列,数据库等等

  • register:负责服务的注册(目前只支持etcd)

  • http: 提供一些http接口

2.管理部分(manage节点):

负责spider节点的管理,用etcd进行spider节点的发现。通过http与spider节点通讯。

开始使用

1. Json模版

http接口调用
curl -d '{"id":"douban-movie","Name":"douban-movie","request":[{"url":"https://movie.douban.com/j/new_search_subjects?sort=T/u0026range=0,10/u0026tags=/u0026start={0-100,20}","method":"get","type":"","data":null,"header":null,"cookies":{"url":"","data":""},"process_name":"movie"}],"process":[{"name":"movie","reg_url":null,"type":"json","template_rule":{"Rule":null},"json_rule":{"Rule":{"casts":"casts","cover":"cover","id":"id","node":"array|data","rate":"rate","star":"star","title":"title","url":"url"}},"add_queue":null}],"pipline":"file","depth":0,"end_count":0}' "http://127.0.0.1:7774/task/addAndRun"

豆瓣电影模版

 {    "id": "douban-movie",    "Name": "douban-movie",    "request": [
        {            "url": "https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start={0-10,20}",            "method": "get",            "process_name": "movie"
        }
    ],    "process": [
        {            "name": "movie",            "type": "json",            "json_rule": {                "Rule": {                    "casts": "casts",                    "cover": "cover",                    "id": "id",                    "node": "array|data",                    "rate": "rate",                    "star": "star",                    "title": "title",                    "url": "url"
                }
            },            "add_queue": null
        }
    ],    "pipline": "file",    "depth": 0,    "end_count": 0}

dilidili模版

   {    "id": "dilidili",    "Name": "dilidili",    "request": [
        {            "url": "http://www.dilidili.wang/{gaoxiao|kehuan|yundong|danmei|zhiyuxi|luoli|zhenren|zhuangbi|youxi|tuili|qingchun|kongbu|jizhan|rexue|qingxiaoshuo|maoxian|hougong|qihuan|tongnian|lianai|meishaonv|lizhi|baihe|paomianfan|yinv}/",            "method": "get",            "process_name": "animelist"
        }
    ],    "process": [
        {            "name": "animelist",            "type": "template",            "template_rule": {                "Rule": {                    "content": "text|dd div",                    "desc": "text|dd p",                    "href": "attr.href|dt a",                    "img": "attr.src|dt a img",                    "node": "array|.anime_list dl",                    "title": "text|dd h3 a"
                }
            },            "add_queue": [
                {                    "url": "http://www.dilidili.wang{href}",                    "method": "get",                    "process_name": "animeinfo"
                }
            ]
        },
        {            "name": "animeinfo",            "type": "template",            "template_rule": {                "Rule": {                    "episode": "texts|.time_con .swiper-slide .clear li a em",                    "episode-link": "attrs.href|.time_con .swiper-slide .clear li a",                    "title": "text|.detail dl dd h1"
                }
            },            "add_queue": [
                {                    "url": "{episode-link}",                    "method": "get",                    "process_name": "episodeinfo"
                }
            ]
        },
        {            "name": "episodeinfo",            "reg_url": null,            "type": "template",            "template_rule": {                "Rule": {                    "player": "attr.src|.player_main iframe",                    "title": "text|#intro2 h1",                    "url": "attr.href|link[rel=/"canonical/"]"
                }
            },            "add_queue": null
        }
    ],    "pipline": "file",    "depth": 0,    "end_count": 0}

2. 代码模版 编写

豆瓣电影

package mainimport (    "YiSpider/spider/model"
    "YiSpider/spider"
    spider2 "YiSpider/spider/spider")func main(){

    task := &model.Task{
        Id:"douban-movie",
        Name:"douban-movie",
        Request:[]*model.Request{
            {
                Method:"get",
                Url:"https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start={0-10000,20}",
                ProcessName:"movie",
            },
        },
        Process: []model.Process{
            {
                Name:"movie",
                Type:"json",
                JsonRule:model.JsonRule{
                    Rule:map[string]string{                        "node":"array|data",                        "rate":"rate",                        "star":"star",                        "id":"id",                        "url":"url",                        "title":"title",                        "cover":"cover",                        "casts":"casts",
                    },
                },
            },
        },
        Pipline:"file",
    }

    app := spider.New()
    app.AddSpider(spider2.InitWithTask(task))
    app.Run()
}

dilidili番剧

package mainimport (    "YiSpider/spider/model"
    "YiSpider/spider"
    spider2 "YiSpider/spider/spider")func main(){

    task := &model.Task{
        Id:"dilidili",
        Name:"dilidili",
        Request:[]*model.Request{
            {
                Method:"get",
                Url:"http://www.dilidili.wang/{gaoxiao|kehuan|yundong|danmei|zhiyuxi|luoli|zhenren|zhuangbi|youxi|tuili|qingchun|kongbu|jizhan|rexue|qingxiaoshuo|maoxian|hougong|qihuan|tongnian|lianai|meishaonv|lizhi|baihe|paomianfan|yinv}/",
                ProcessName:"animelist",
            },
        },
        Process: []model.Process{
            {
                Name:"animelist",
                Type:"template",
                TemplateRule:model.TemplateRule{
                    Rule:map[string]string{                        "node":"array|.anime_list dl",                        "img":"attr.src|dt a img",                        "title":"text|dd h3 a",                        "href":"attr.href|dt a",                        "content":"text|dd div",                        "desc":"text|dd p",
                    },
                },
                AddQueue:[]*model.Request{
                    {
                        Method:      "get",
                        Url:         "http://www.dilidili.wang{href}",
                        ProcessName: "animeinfo",
                    },
                },
            },
            {
                Name:"animeinfo",
                Type:"template",
                TemplateRule:model.TemplateRule{
                    Rule:map[string]string{                        "episode":"texts|.time_con .swiper-slide .clear li a em",                        "title":"text|.detail dl dd h1",                        "episode-link":"attrs.href|.time_con .swiper-slide .clear li a",
                    },
                },
                AddQueue:[]*model.Request{
                    {
                        Method:      "get",
                        Url:         "{episode-link}",
                        ProcessName: "episodeinfo",
                    },
                },
            },
            {
                Name:"episodeinfo",
                Type:"template",
                TemplateRule:model.TemplateRule{
                    Rule:map[string]string{                        "url":"attr.href|link[rel=/"canonical/"]",                        "title":"text|#intro2 h1",                        "player":"attr.src|.player_main iframe",
                    },
                },
            },
        },

        Pipline:"file",
    }


    app := spider.New()
    app.AddSpider(spider2.InitWithTask(task))
    app.Run()

}
  1. 纯代码编写


码云地址:https://gitee.com/bilibala/YiSpider
github地址:https://github.com/2young2simple/yispider

作者:fimos
链接:https://www.jianshu.com/p/33fd62ca09da

【转自慕课】https://www.imooc.com

Go

go语言之http

2022-3-3 7:33:25

Go

Golang 用 flag 来操作控制台参数

2022-3-3 7:35:48

搜索