Go Crawlers


The First Crawler

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

func fetch(url string) string {
	c := &http.Client{}
	// Build a request for the page
	req, _ := http.NewRequest("GET", url, nil)
	// In the browser, open any page, right-click -> Inspect -> Network -> find the matching request, then copy its cookie and user-agent into the headers to impersonate a real user
	req.Header.Set("user-agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36")
	req.Header.Add("cookie","MUID=2D67B23EE27866FA03DFA328E31E6749; MUIDB=2D67B23EE27866FA03DFA328E31E6749; _EDGE_V=1; SRCHD=AF=NOFORM; SRCHUID=V=2&GUID=0E0B1A0A134648729F811B8DA4A9D938&dmnchg=1; CSRFCookie=438937a0-ddf8-4fa9-b883-86d0c9891593; ANON=A=D75DA0F1D1CB4ADA171BD1FFFFFFFFFF&E=1a5a&W=1; PPLState=1; MUIDV=NU=1; _tarLang=default=zh-Hans; _TTSS_IN=hist=WyJlbiIsImF1dG8tZGV0ZWN0Il0=; _TTSS_OUT=hist=WyJ6aC1IYW5zIl0=; _EDGE_S=F=1&SID=07B549486E05610018F1585E6F636013&ui=zh-cn&mkt=zh-cn; _SS=SID=07B549486E05610018F1585E6F636013&PC=u477&OCID=wispr; SRCHS=PC=u477; ZHSEARCHCHATSTATUS=STATUS=0; NAP=V=1.9&E=1a5e&C=wn42YlN2eNH7FaA0XdaMtL8Dqf2nQ6kHkObNtEZhNdc2R7CKGxovkQ&W=1; _UR=QS=0&TQS=0; _HPVN=CS=eyJQbiI6eyJDbiI6MywiU3QiOjAsIlFzIjowLCJQcm9kIjoiUCJ9LCJTYyI6eyJDbiI6MywiU3QiOjAsIlFzIjowLCJQcm9kIjoiSCJ9LCJReiI6eyJDbiI6MywiU3QiOjAsIlFzIjowLCJQcm9kIjoiVCJ9LCJBcCI6dHJ1ZSwiTXV0ZSI6dHJ1ZSwiTGFkIjoiMjAyMi0wNi0wNVQwMDowMDowMFoiLCJJb3RkIjowLCJHd2IiOjAsIkRmdCI6bnVsbCwiTXZzIjowLCJGbHQiOjAsIkltcCI6MTB9; imgv=flts=20220607; KievRPSSecAuth=FABqBBRaTOJILtFsMkpLVWSG6AN6C/svRwNmAAAEgAAACE9nnYPK6qlsKATVzAL4Vt+YOKgqyIQRzcHsasM3PnMU1D3tuxWa2IetF8taeKVGkBxm713P6Dm41JlkiWJTy1jQsbgG4gz/FQLSqaZiSqMsJ9ZAJZNP0N1Vg0t+m0x1ExgIVVb5Ccdvi7PxshX6nB0J28aoMDQrdZl5A7dsaV1H2eDGS8XrUwpM05V4C9DEJemE6+ffQoUGtxz+6VKPlb+2rMOGUU7r9ljDa2G6hnSVXw2YJZyprug2Yy1iGjtFeSxyPng5oT5XyqchJvm9aIfFGYB0Dp6xO6OMcOjzG77zNZGYya3q42Hj1iX6MCdEbnKrV5Uzf9IuAVsT5UtMtbI27DvjXyGg39P3JkzttTcsPDUff7muO5YzWnrJE/7OBeozyOIokfnpBnhh8dH+4e5vKogN5mzw1lfsow993utFLcwUbGSppet/9c42hrOzvnkglZwrDHYCHeGVsscnk9DuXJYoX9GIxQr/2OVvCJ5PD7xfdlgq9ibl0V5N7QOCNSKaIPVDSaBvt8PMIKmIRQ0jqmlmGHIt3mVuWDNJqmcMYcnJefW2KMyEnvqaJzw5Znvb4Qpm5qZyKT7DPn8xrGiKWbDLkQh7I24kW0PlqWzQ+VYDCFKklTv7zXvdSIV0muVB5LQTscyPR/Am3x9ScL1ZpF51TXchtvJorpDIJvKkiLcGPyV1DoeuiHr3xHEEYc4MC8FqEltIQydn7Q8Tbf5wR2lt1GQFW4sWDqHBPj1V58vnZ2S2ufJKqLvZK4YcNOw36Y7XNv2wWMt543IrgNMj/rnOcvu+vbjza9IilNnzvxzaxDh02u8mPoiCevN5Cof90Q7Vu1A/Z25dYMk6B9eGCpsHnn5cHcUw0rNmSJgy7O/5Olx4H/rQ6pymKArOb9HP4OdTeM7gD9aGqwwZZU4vJAcKGkbDKoWiISW/ATshihntgDfaYc2Xy546C+PWyyIlF+sUiEKyqT//8zMtOPwYe6vj3Xw/IC+B+uhPqdBc0HZqPW0zgQrVe4IzAxVHX47InF6FUAUdftARR4cOQWTjfxpZ1JU5m2X7njuNVYKRUY4xBTJX9BPystlUBJx8fM/G4tBKIdlatkn/BZzQ7WrZV/PsitPBXft1NhH3+807xqhe1+ngPHG5LnjL73A4V55n221rE8AUcHCfwCMai06cj3pOLRBe1ezzlTipKj+anv/AzxvOiMZ/FuQVDw+xC3UfG0j7ji3qTH/OuCE4fHQ+D+SLlF4tNUmH4/SlYbSnjM/m5MuBCozk5nIwGO2IWlmiQQ9vjcWsis+bZmU0SWcGd5mSU4HCutmcTQ7iXAz63l/cgcKX2L78OBWO4rGh1KIRRKUhyOeqBhvDQmUnBXJsUQjA9EVy/sb4mZkes4s9CC/xSCwxF7wKOJkU2HFtQ9EQ4M8eKDDk6osXYwbM4HD2WhQAwmniVHZ15hCvgdyZvVJZnRq00IQ=; _U=1OLH3zVzPIaeCg4-meJyk8gAqofbsryswQGX7sDqjL7gJETLX_Pvj42je6c1iIkLGRmUyMJN2uSVG0pRrrwV3QcXaXjae8RDlff0m5UJGOS9JK50BCS2IriH3SdJcxLxVAA4Tg-M6qh5HQyGSK848Srh0g4iRbpzUUqJPYMbMSjRTv8u-Qb5x5-ymEFK8uf3PooZTwaTnB70QoRqLM4Kj6FLqnZylBTTcLtnuTyu-N9Q; WLS=C=e52034126effc4c3&N=祈祷中; WLID=gAp1meRVPFwos+mlyFgWkHh+yiv4BirIxXVzeydsJ7o2SBm5TfdUyMMsQBQjlQkrEXqpCIbBVJMryiQbvhIxbGPWEKddjXdHfeOh73bO22Y=; ZHCHATSTRONGATTRACT=TRUE; ABDEF=V=13&ABDV=11&MRNB=1655442740219&MRB=0; SUID=A; ZHCHATWEAKATTRACT=TRUE; SRCHUSR=DOB=20220421&T=1655538629000&TPC=1655533328000&POEX=W; ipv6=hit=1655542230902&t=4; SNRHOP=I=&TS=; SRCHHPGUSR=SRCHLANG=zh-Hans&BRW=HTP&BRH=M&CW=942&CH=852&SW=1920&SH=1080&DPR=1.100000023841858&UTC=480&DM=0&HV=1655540040&PV=7.0.0&BZA=0&WTS=63790878387&EXLTT=9")
	resp, err := c.Do(req)
	if err != nil {
		panic(err)
	}
	if resp.StatusCode != http.StatusOK {
		// resp.Status is already a string like "200 OK"
		panic("unexpected status: " + resp.Status)
	}
	defer resp.Body.Close()
	body,err := ioutil.ReadAll(resp.Body)
	if err!=nil{
		panic(err)
	}
	return string(body)
}

func main()  {
	fmt.Println(fetch("http://www.pingtaimeng.com/article/detail/id/815982"))
}

Another request style used with the ==goquery== package: a direct request, without setting any headers yourself.

func fetch(url string) string {
   resp, err := http.Get(url)
   if err != nil {
      panic(err)
   }
   if resp.StatusCode != http.StatusOK {
      // strconv.Itoa, not string(int): string(200) yields a rune, not "200"
      panic(errors.New(url + " StatusCode = " + strconv.Itoa(resp.StatusCode)))
   }
   defer resp.Body.Close()
   if resp.Request == nil {
      panic(errors.New("Response.Request is nil"))
   }
   body,err := ioutil.ReadAll(resp.Body)
   if err!=nil{
      panic(err)
   }
   return string(body)
}
// Like method 2, except http.Get() is split into its parts
func fetch(url string) string {
   fmt.Println("Fetch Url", url)
   // Build the request
   req, _ := http.NewRequest("GET", url, nil)
   // Create an HTTP client
   client := &http.Client{}
   // Send the request
   resp, err := client.Do(req)
   if err != nil {
      fmt.Println("Http get err:", err)
      return ""
   }
   if resp.StatusCode != 200 {
      fmt.Println("Http status code:", resp.StatusCode)
      return ""
   }
   // Read the HTTP response body
   defer resp.Body.Close()
   body, err := ioutil.ReadAll(resp.Body)
   if err != nil {
      fmt.Println("Read error", err)
      return ""
   }
   return string(body)
}

Saving the crawled content

package main

import (
   "fmt"
   _ "github.com/go-sql-driver/mysql"
   "io"
   "io/ioutil"
   "net/http"
   "os"
   "regexp"
   "strings"
   "time"
   "xorm.io/xorm"
)

var engine *xorm.Engine

// fetch reads the page
func fetch(url string) string {
   c := &http.Client{}
   // Build a request for the page
   req, _ := http.NewRequest("GET", url, nil)
   // In the browser: right-click -> Inspect -> Network -> find the matching request, then copy its cookie and user-agent into the headers to impersonate a real user
   req.Header.Set("user-agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36")
   req.Header.Add("cookie","MUID=2D67B23EE27866FA03DFA328E31E6749; MUIDB=2D67B23EE27866FA03DFA328E31E6749; _EDGE_V=1; SRCHD=AF=NOFORM; SRCHUID=V=2&GUID=0E0B1A0A134648729F811B8DA4A9D938&dmnchg=1; CSRFCookie=438937a0-ddf8-4fa9-b883-86d0c9891593; PPLState=1; MUIDV=NU=1; _tarLang=default=zh-Hans; _TTSS_IN=hist=WyJlbiIsImF1dG8tZGV0ZWN0Il0=; _TTSS_OUT=hist=WyJ6aC1IYW5zIl0=; _EDGE_S=F=1&SID=07B549486E05610018F1585E6F636013&ui=zh-cn&mkt=zh-cn; _SS=SID=07B549486E05610018F1585E6F636013&PC=u477&OCID=wispr; SRCHS=PC=u477; ZHSEARCHCHATSTATUS=STATUS=0; _UR=QS=0&TQS=0; imgv=flts=20220607; ENSEARCH=BENVER=0; _HPVN=CS=eyJQbiI6eyJDbiI6MSwiU3QiOjAsIlFzIjowLCJQcm9kIjoiUCJ9LCJTYyI6eyJDbiI6MSwiU3QiOjAsIlFzIjowLCJQcm9kIjoiSCJ9LCJReiI6eyJDbiI6MSwiU3QiOjAsIlFzIjowLCJQcm9kIjoiVCJ9LCJBcCI6dHJ1ZSwiTXV0ZSI6dHJ1ZSwiTGFkIjoiMjAyMi0wNi0yNlQwMDowMDowMCIsIklvdGQiOjAsIkd3YiI6MCwiRGZ0IjpudWxsLCJNdnMiOjAsIkZsdCI6MCwiSW1wIjozOX0=; ANON=A=D75DA0F1D1CB4ADA171BD1FFFFFFFFFF&E=1b0f&W=1; NAP=V=1.9&E=1abc&C=p-IomcnELRRFsYW7fLLtN_iPRJfgnoR66Av6HuopxlJM1_zl88CMEg&W=1; KievRPSSecAuth=FABiBBRaTOJILtFsMkpLVWSG6AN6C/svRwNmAAAEgAAACIDcTOtdy0lsIAQGqqPejTWdYKVemzkE8/qYqjIgrlEQ06O5x7bYTGyYAc70pjKpLczHb6EsYXY4PV2qstpg5FCwEmWqg9Wst5jSjOY6jyM06hS7hwY7G8lXKodJ6dpF7uo2J5Azq+sLc0H7AgM5t+osVjjCcfR06hbevunUrmhmhlb3KQGSzjn88+bcd/qZYlR25DcCYApy9Rvrl3woowd5ZbQVNN6KqfzY0YH+h/AceqLpNexKTYSWPn9ExsZ4Ug5vgJRNyVT/7wVNpcOf7GlBR8trvw5MbbyYjclvmJvQe7C27bQ0HS5puWWjZnPUkcymdDE8wYj4umynAlEO0sQo/l1Wk97ARYRdTrSQO3oh/3CK1XfMOs9XvxkR0zHqYVDCrq6Vbhzkm2/3CvGUm95diff/fxIOHRxLYd0Vx7f/rxnkpP2jXKGXZX8BXlrds9iLEX+FR5Lu2pNzPU5ukg45rKpV1+keqoliG58pE7eElbNH3f6sBFF1hiJKvitr5mADhMUw62ikZeE1nsx2SnNkX0jd1A4Q7qPHGkEj0NAPXg2LEukDeh5gAstf6SG4WsZ+D6MjKjqUN1Ep4jTV5qaImcwCQ9tvMcqSR4HAU8SezRhG7MtkL0HNPMMG37Wf7Gwb2bR7WrB4BIiX/13UEUnTb1rEj4eHCXZ1A55M+NpOWlVZ8bqeWlQt9CvOmnmCoTJ6JnqmcSD2874C5EwDvKGHnGmQTGObPL5EHaOMcY8El31HY2mTXmoDcrzkLlrRtro4mjdZeYvaYJZyIB+jw4gzOHpDwmS7nIeEtb1XM7p96aUJkrXF7mJU1s89HeT+hpYyFmkl/FUX/x/nSIXSfeGdqU7i6JmRJQ1WkxrCAlp9YSPA0f5crIohX5xEt2NnhdmHSiOXSDzTwtCpoxBFt5F5FRc1gYDnKOJwu3TjijU9hEHVAVPY1eZFF8KvTvAaKd0NqkDN+ccFo/iLQ3o7TOQ7G0KSa5D66FqBCct9VPe5P9KJs/yDb2BdfXZIJ/E6u/12Y+6eEGHEAIXhyZCoGV+IeP0/mde+ZvmVfLUWGzv48yU5kiSZT+C8f9FQpPTtzrzWhtUOHWDzaolzmq6dt73khSQ9/mKbvzVcbedGmhNciIglyK0kATrASR/o7p3tu9WAClwMP3O8c85AM9u/Zafe0TFxw1x0J/t3yrET+1tLLwv/jN0yJRRJq6zlkgGtdBl+omJQtvlNX6TRNhaG/IwlietBxeB6gVAg/8CoHPLgQdHEMh0vd7cS9S+uTHcCYSyccOHh52ZF98zETG3Po4/84/KQFqVKbgOoRepEuLFqNSisCTxcJjCwXK8dI1fql2CFK23Rf2Xo6EGZhEg6TxrT9bNEXAZdpROxe+fK8xQ3Mziohjik2HR62QCbCGSxv3gTm7BBOj6722kUAC1T4zWe78k5DbP+KLA4p3dcoTB8; _U=1xQBB0QnAKV9LyKeoF4-dqbkzcpUlrdd_xazfbujcv_pmSVIc8RJYA1fV7gJUd_kx-La7C62bksJMkS2OTRl31lrtyECaW_QjLTZBWVlYVo3no7iO804wobZJHqN1JNZzr9BLrcgWPUYj8wNRXSxhuoBB8fCc-PrbVbsYTf4PDDfpFnWSDcGERQ7hg8EWaC1p0_r9ARxy4sUlC2hgyEeUelo2XDCjrv5F-DEbviC0sD0; WLS=C=e52034126effc4c3&N=祈祷中; WLID=gAp1meRVPFwos+mlyFgWkHh+yiv4BirIxXVzeydsJ7o2SBm5TfdUyMMsQBQjlQkrEXqpCIbBVJMryiQbvhIxbGPWEKddjXdHfeOh73bO22Y=; ABDEF=V=13&ABDV=11&MRNB=1657271978237&MRB=0; ZHCHATSTRONGATTRACT=TRUE; SUID=A; SRCHUSR=DOB=20220421&T=1657591309000&TPC=1657591310000&POEX=W; ZHCHATWEAKATTRACT=TRUE; ipv6=hit=1657594911872&t=4; SNRHOP=TS=637931881239107729&I=1; SRCHHPGUSR=SRCHLANG=zh-Hans&BRW=S&BRH=M&CW=1166&CH=852&SW=1920&SH=1080&DPR=1.100000023841858&UTC=480&DM=0&HV=1657591310&PV=7.0.0&BZA=0&WTS=63792119231&EXLTT=9")
   resp, err := c.Do(req)
   if err != nil {
      panic(err)
   }
   if resp.StatusCode != http.StatusOK {
      panic("Status!OK:" + url)
   }
   defer func(Body io.ReadCloser) {
      if err := Body.Close(); err != nil {
         panic(err)
      }
   }(resp.Body)
   body,err := ioutil.ReadAll(resp.Body)
   if err!=nil{
      panic(err)
   }
   return string(body)
}

// parse parses the page
func parse(html string) {
   // Strip the \n characters; n == -1 means no limit, i.e. replace every \n
   html = strings.Replace(html, "\n", "", -1)

   // Grab everything inside the sidebar <aside> block
   sidebar := regexp.MustCompile(`<aside id="sidebar" role="navigation">(.*?)</aside>`).FindString(html)
   //fmt.Println(sidebar)
   links := regexp.MustCompile(`href="(.*?)"`).FindAllString(sidebar, -1)
   for _, v := range links {
      // v looks like href="...", so v[6:len(v)-1] is the link itself
      url := "https://gorm.io"
      if v[6] != '/' {
         url += "/zh_CN/docs/"
      }
      url += v[6 : len(v)-1]
      //fmt.Println(url)
      go parse2(fetch(url))
   }
}

func parse2(body string) {
   body = strings.Replace(body, "\n", "", -1)
   content := regexp.MustCompile(`<div class="article">(.*?)</div>`).FindString(body)
   title := regexp.MustCompile(`<header class="article-header"> (.*?)</h1>`).FindString(content)
   // Slice off the fixed-length header markup around the title (89 and 5 are specific to this page's html)
   title = title[89 : len(title)-5]
   fmt.Println(title)
   //fmt.Println(content)
   //save(title,content)
   saveToDB(title, content)
}
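
One caveat in parse: go parse2(fetch(url)) evaluates fetch(url) in the current goroutine (only parse2 runs concurrently), and main can return before those goroutines finish, silently dropping pages. Below is a sketch of a WaitGroup-based variant; parseWithWait and wg are my names, not part of the original, and "sync" would need to be added to the imports.

var wg sync.WaitGroup

// parseWithWait is parse with a WaitGroup so the caller blocks until every page is handled
func parseWithWait(html string) {
   html = strings.Replace(html, "\n", "", -1)
   sidebar := regexp.MustCompile(`<aside id="sidebar" role="navigation">(.*?)</aside>`).FindString(html)
   links := regexp.MustCompile(`href="(.*?)"`).FindAllString(sidebar, -1)
   for _, v := range links {
      url := "https://gorm.io"
      if v[6] != '/' {
         url += "/zh_CN/docs/"
      }
      url += v[6 : len(v)-1]
      wg.Add(1)
      go func(u string) {
         defer wg.Done()
         parse2(fetch(u))
      }(url)
   }
   wg.Wait() // block until every parse2 goroutine has finished
}
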
// save writes the page to local disk
func save(title, content string) {
   err := os.WriteFile("./pages/"+title+".html", []byte(content), 0644)
   if err != nil {
      panic(err)
   }
}
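
Note that os.WriteFile does not create the ./pages directory. A minimal fragment that makes sure it exists before the first save (assuming the same relative path as above):

// create ./pages (and any missing parents) before saving; 0755 = rwxr-xr-x
if err := os.MkdirAll("./pages", 0755); err != nil {
   panic(err)
}
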
// Saving to the database
// GormPage is the row type written to the database
type GormPage struct {
   Id      int64
   Title   string
   Content string    `xorm:"text"`
   Created time.Time `xorm:"created"`
   Updated time.Time `xorm:"updated"`
}

// init opens the database connection
func init() {
   var err error
   engine, err = xorm.NewEngine("mysql", "root:zyyxy123@tcp(127.0.0.1:3306)/bubble?charset=utf8mb4&parseTime=True&loc=Local")
   if err != nil {
      panic(err)
   }
   if err = engine.Ping(); err != nil {
      panic(err)
   }
   println("connected")
}

// saveToDB writes one page to the database
func saveToDB(title, content string) {
   // Sync creates or migrates the table to match the struct; its error is ignored here
   engine.Sync(new(GormPage))
   page := GormPage{
      Title:   title,
      Content: content,
   }
   affected, err := engine.Insert(&page)
   if err != nil {
      panic(err)
   }
   // affected is an int64 row count; string(affected) would produce a rune, so print the number directly
   fmt.Println("save", affected)
}

func main()  {
   url := "https://gorm.io/zh_CN/docs/index.html"
   //fmt.Println(fetch(url))
   parse(fetch(url))
}

The goquery library

go get github.com/PuerkitoBio/goquery

getDoc1/2/3 correspond to the three common ways of using it.

package main

import (
	"errors"
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"io/ioutil"
	"net/http"
	"strings"
)

// fetch fetches the html by hand
func fetch(url string) string {
	resp,err := http.Get(url)
	if err!=nil{
		panic(err)
	}
	if resp.StatusCode != http.StatusOK {
		// strconv.Itoa, not string(int): string(200) yields a rune, not "200"
		panic(errors.New(url + " StatusCode = " + strconv.Itoa(resp.StatusCode)))
	}
	defer resp.Body.Close()
	if resp.Request == nil {
		panic(errors.New("Response.Request is nil"))
	}
	body,err := ioutil.ReadAll(resp.Body)
	if err!=nil{
		panic(err)
	}
	return string(body)
}

// getDoc1 builds the dom straight from the url; NewDocument is deprecated
func getDoc1(url string) error {
	// NewDocument fetches the whole html page from the url
	d, err := goquery.NewDocument(url)

	if err != nil {
		return err
	}

	// In Find, '.' means class, so ".sidebar-link" matches class="sidebar-link"; without '.' it matches the tag name
	d.Find(".sidebar-link").Each(func(i int, s *goquery.Selection) {
		s2 := s.Text()

		href, _ := s.Attr("href")
		fmt.Printf("%v href: ", s2)
		fmt.Println(href)
	})
	return nil
}

// getDoc2 builds the dom from an http response; NewDocumentFromResponse is deprecated
func getDoc2(url string) error {
	client := http.Client{}
	req, _ := http.NewRequest("GET", url, nil)
	resp, err := client.Do(req)

	if err != nil {
		return err
	}

	d, err := goquery.NewDocumentFromResponse(resp)
	if err != nil {
		return err
	}
	fmt.Print(d.Text())
	return nil
}

// getDoc3 builds the dom from an html string
func getDoc3(url string) error {
	html := fetch(url)
	d, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return err
	}
	fmt.Println(d.Text())
	return nil
}

func main()  {
	url := "https://www.vilipix.com/"
	getDoc1(url)
	getDoc2(url)
	getDoc3(url)
}

Explanation of the example (using getDoc1)

func getDoc1(url string) error {
   // NewDocument fetches the whole html page from the url; err is skipped here since this is just for explanation
   d, _ := goquery.NewDocument(url)

   // Find selectors: '.xxx' matches class="xxx", '#xxx' matches id="xxx", a bare name matches the tag itself
   // Find("div[class]") matches divs that have a class attribute; likewise Find("div[class=name]"), or even Find("div[id][lang=zh]") to narrow further; the table below lists the other attribute forms
   // Find("parent>child") matches only the most direct (first-level) children of parent that satisfy child
   // Find("parent child") matches children at any depth, e.g. every div inside body
   // Find("div[lang=zh]+p") matches the adjacent tag, i.e. the one right next to div[lang=zh]
   // Find("div[lang=zh]~p") matches sibling tags at the same level as div[lang=zh], adjacency not required
   // Find("div,span") is the OR (|) selector, split by ',': matches both div and span

   // Find("div:contains(DIV2)") uses a content filter:
      // :contains(DIV2) keeps elements whose text contains "DIV2", e.g. <div>DIV2</div>
      // :has(selector) is like :contains, except it matches element nodes: Find("span:has(div)") finds span nodes that contain a div
      // :first-child keeps only the first child
      // :first-of-type keeps only the first node of its type
      // :last-child and :last-of-type mirror the first variants
      // :nth-child(n) and :nth-of-type(n) are like the first variants but pick the nth one; :nth-child(1) equals :first-child
      // :nth-last-child(n) and :nth-last-of-type(n) are the same but count from the end; :nth-last-child(1) equals :last-child
      // :only-child matches elements that have no sibling nodes
      // :only-of-type matches elements that have no siblings of the same type
   d.Find(".el-image").Each(func(i int, s *goquery.Selection) {
      // Attr reads the given attribute from the matched tag
      href, _ := s.Attr("src")
      fmt.Printf("%v src: ", s.Text())
      fmt.Println(href)
   })
   return nil
}
| Selector | Meaning |
| --- | --- |
| Find("div[lang]") | divs that have a lang attribute |
| Find("div[lang=zh]") | divs whose lang equals zh |
| Find("div[lang!=zh]") | divs whose lang is not zh |
| Find("div[lang\|=zh]") | divs whose lang is zh or starts with zh- |
| Find("div[lang*=zh]") | divs whose lang contains the string zh |
| Find("div[lang~=zh]") | divs whose lang contains the word zh (words separated by spaces) |
| Find("div[lang$=zh]") | divs whose lang ends with zh, case-sensitive |
| Find("div[lang^=zh]") | divs whose lang starts with zh, case-sensitive |
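
The selectors are easiest to verify against an in-memory document instead of a live site; here is a small self-contained sketch (the html and its contents are made up for illustration):

package main

import (
   "fmt"
   "strings"

   "github.com/PuerkitoBio/goquery"
)

func main() {
   // made-up html, just to exercise a few selectors from the table
   html := `<body>
      <div lang="zh">one</div>
      <div lang="zh-CN">two</div>
      <div lang="en">three</div>
      <span class="note">four</span>
   </body>`

   d, err := goquery.NewDocumentFromReader(strings.NewReader(html))
   if err != nil {
      panic(err)
   }

   // lang equals zh: prints "one"
   d.Find(`div[lang=zh]`).Each(func(i int, s *goquery.Selection) { fmt.Println(s.Text()) })
   // lang is zh or starts with zh-: prints "one", "two"
   d.Find(`div[lang|=zh]`).Each(func(i int, s *goquery.Selection) { fmt.Println(s.Text()) })
   // OR selector: every div plus the span
   d.Find(`div,span`).Each(func(i int, s *goquery.Selection) { fmt.Println(s.Text()) })
}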

==On garbled text==: goquery only supports utf-8; if the output is garbled, try converting the encoding.

// standard library net/url: decode url-encoded text like %E3%83%90%E3 back to utf-8
url.QueryUnescape(text)

If the page uses GBK or some other encoding, convert it to utf-8 before parsing.
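
A sketch of that conversion with golang.org/x/text; the decodeGBK helper is my own name, and it assumes the input really is GBK:

package main

import (
   "fmt"
   "io/ioutil"
   "strings"

   "golang.org/x/text/encoding/simplifiedchinese"
   "golang.org/x/text/transform"
)

// decodeGBK converts GBK-encoded html to utf-8 so goquery can parse it
func decodeGBK(gbkHTML string) (string, error) {
   reader := transform.NewReader(strings.NewReader(gbkHTML), simplifiedchinese.GBK.NewDecoder())
   utf8Bytes, err := ioutil.ReadAll(reader)
   if err != nil {
      return "", err
   }
   return string(utf8Bytes), nil
}

func main() {
   // "\xc4\xe3\xba\xc3" is the GBK encoding of 你好
   s, err := decodeGBK("\xc4\xe3\xba\xc3")
   if err != nil {
      panic(err)
   }
   fmt.Println(s) // 你好
}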

The colly framework

A framework built on top of goquery.

Events

These are the only ones:

package main

import (
   "fmt"
   "github.com/gocolly/colly"
)

var URL string = "https://gorm.io/zh_CN/docs/"

func main() {
   // Create the collector
   c := colly.NewCollector(
      // Domain whitelist: the collector visits nothing outside gorm.io; this takes ...string, so several domains can be allowed at once
      colly.AllowedDomains("gorm.io"),
   )

   // Runs once an html page arrives; here it looks for a tags that carry an href
   // Comparable to Find(".el-image").Each(func(i int, s *goquery.Selection)
   c.OnHTML("a[href]", func(e *colly.HTMLElement) {
      // Read the value of the href attribute
      link := e.Attr("href")

      fmt.Printf("Link found: %q -> %s\n", e.Text, link)

      // Have c visit link too
      //c.Visit(e.Request.AbsoluteURL(link))
   })

   // Runs when the request is sent
   c.OnRequest(func(r *colly.Request) {
      fmt.Println("OnRequest:", r.URL.String())
   })

   // Runs after the response comes back
   c.OnResponse(func(r *colly.Response) {
      fmt.Println("OnResponse:",r.Request.URL.String())
   })

   // Detaches a listener; the argument is a selector string, and it removes the OnHTML callback registered for that selector; there is an xml counterpart as well
   // c.OnHTMLDetach(URL)

   // Runs after the scrape completes, i.e. once all work is done
   c.OnScraped(func(r *colly.Response) {
      fmt.Println("OnScraped:",r.Request.URL.String())
   })

   // Runs on errors
   c.OnError(func(r *colly.Response, err error) {
      fmt.Println("OnError:",r.Request.URL,"error:",err)
   })

   // Visit URL to start the crawl
   c.Visit(URL)
}

Configuration

Options can be set right in the NewCollector call, or afterwards by assigning the corresponding setting on c directly:

c := colly.NewCollector(
    colly.UserAgent("xy"),
    colly.AllowURLRevisit(),
)
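
For the second style, the same two settings applied after creation through the collector's public fields (values mirror the block above):

c := colly.NewCollector()
// same settings as above, set via the collector's fields
c.UserAgent = "xy"
c.AllowURLRevisit = true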

They can also be set inside any callback; the example here picks a random User-Agent (passing a fixed string also works):

const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

// RandomString builds a 10-19 character random string (needs "math/rand"; seed it if each run should differ)
func RandomString() string {
    b := make([]byte, rand.Intn(10)+10)
    for i := range b {
        b[i] = letterBytes[rand.Intn(len(letterBytes))]
    }
    return string(b)
}

c := colly.NewCollector()

c.OnRequest(func(r *colly.Request) {
    r.Headers.Set("User-Agent", RandomString())
})

Supported settings

ALLOWED_DOMAINS (string slice) allowed domains, e.g. []string{"segmentfault.com", "zhihu.com"}
CACHE_DIR (string) cache directory
DETECT_CHARSET (y/n) whether to detect the response charset
DISABLE_COOKIES (y/n) disable cookies
DISALLOWED_DOMAINS (string slice) blocked domains, same type as ALLOWED_DOMAINS
IGNORE_ROBOTSTXT (y/n) whether to ignore the robots.txt protocol
MAX_BODY_SIZE (int) maximum response body size
MAX_DEPTH (int - 0 means infinite) crawl depth limit
PARSE_HTTP_ERROR_RESPONSE (y/n) parse HTTP error responses
USER_AGENT (string)
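
These read like colly's environment-variable settings; as far as I recall, colly picks them up from COLLY_-prefixed variables when the collector is created. Treat the prefix as an assumption and verify against the colly docs:

// assumption: colly applies COLLY_-prefixed environment variables inside NewCollector
os.Setenv("COLLY_ALLOWED_DOMAINS", "gorm.io")
os.Setenv("COLLY_MAX_DEPTH", "2")
c := colly.NewCollector() // picks the settings up during creation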

Dynamic pages
