最近在研究 Golang 多线程(指定并发数)爬虫,并亲手写了一个可以模拟 UA、模拟 Cookie、指定并发数量的 Go 语言版爬虫,以下是代码实例:
最近在研究 Golang 多线程(指定并发数)爬虫,并亲手写了一个可以模拟 UA、模拟 Cookie、指定并发数量的 Go 语言版爬虫,以下是代码实例:
// Command crawler repeatedly fetches randomly generated URLs with a fixed-size
// pool of worker goroutines, extracts the <meta name="keywords"> content from
// each page and appends it to baidu.txt.
package main

import (
	"fmt"
	"io/ioutil"
	"math/rand"
	"net/http"
	"os"
	"regexp"
	"sync"
	"time"
)

// cookie is the Cookie header value sent with every crawl request.
var cookie = "in_userid=1; in_username=xxxxxqq.com; in_userpassword=xxxxx; in_adminid=3; in_adminname=xxxxxqq.com; in_adminpassword=xxxxx; in_permission=1%2C2%2C3%2C4%2C5%2C6"

// userAgentList holds the User-Agent strings a request picks from at random.
var userAgentList = []string{
	"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
	"Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
	"Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
}

// count is the number of tasks completed so far; countMu guards it because
// every worker goroutine increments it.
var (
	countMu sync.Mutex
	count   int
)

// keywordsRe matches the keywords meta tag; compiled once instead of per page.
var keywordsRe = regexp.MustCompile(`<meta name="keywords" content="(.*?)">`)

// httpClient is shared by all requests so connections can be reused, and has
// a timeout so a stalled server cannot block a worker forever.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// rng is the process-wide random source, seeded once. *rand.Rand is not safe
// for concurrent use, so rngMu serialises access.
var (
	rngMu sync.Mutex
	rng   = rand.New(rand.NewSource(time.Now().UnixNano()))
)

// randIntn returns a random int in [0, n) from the shared source.
func randIntn(n int) int {
	rngMu.Lock()
	defer rngMu.Unlock()
	return rng.Intn(n)
}

// Task wraps a unit of work to be executed by the pool.
type Task struct {
	myfun func() error
}

// NewTask returns a Task that runs fun when executed.
func NewTask(fun func() error) *Task {
	return &Task{
		myfun: fun,
	}
}

// Excute runs the task's function. A nil task or nil function is a no-op;
// a returned error is reported on stderr instead of being dropped.
func (m *Task) Excute() {
	if m == nil || m.myfun == nil {
		return
	}
	if err := m.myfun(); err != nil {
		fmt.Fprintln(os.Stderr, "task err", err)
	}
}

// GroutinePool is a fixed-size worker pool. Tasks are submitted on EntryTask
// and forwarded by Run to the workers through WorkTask.
type GroutinePool struct {
	Sum       int
	EntryTask chan *Task
	WorkTask  chan *Task
}

// NewGroutinePool returns a pool that will run sum workers.
func NewGroutinePool(sum int) *GroutinePool {
	return &GroutinePool{
		Sum:       sum,
		EntryTask: make(chan *Task),
		WorkTask:  make(chan *Task),
	}
}

// Work executes tasks from WorkTask until that channel is closed, printing
// the worker id and the running completion count after each task.
func (pool *GroutinePool) Work(workid int) {
	for task := range pool.WorkTask {
		task.Excute()

		countMu.Lock()
		count++
		done := count
		countMu.Unlock()

		fmt.Println("第几个线程执行完成", workid, "执行次数", done)
	}
}

// Run starts the workers and forwards tasks from EntryTask to them. It blocks
// until EntryTask is closed, then closes WorkTask and waits for every worker
// to finish, so all submitted tasks have completed when Run returns.
func (pool *GroutinePool) Run() {
	workers := pool.Sum
	if workers < 1 {
		// With no workers the first forwarded task would block forever.
		workers = 1
	}

	var wg sync.WaitGroup
	wg.Add(workers)
	for i := 0; i < workers; i++ {
		// Pass i as an argument: before Go 1.22 a closure would share the
		// loop variable and workers would report the wrong id.
		go func(id int) {
			defer wg.Done()
			pool.Work(id)
		}(i)
	}

	for task := range pool.EntryTask {
		pool.WorkTask <- task
	}
	close(pool.WorkTask)
	wg.Wait()
}

// GetRandomUserAgent returns a random entry of userAgentList.
func GetRandomUserAgent() string {
	return userAgentList[randIntn(len(userAgentList))]
}

// getUrlRespHtml fetches a randomly generated URL and returns the response
// body, or the string "错误" if the request cannot be built, sent or read.
func getUrlRespHtml() string {
	url := "http://gyb.chixnaxixnge.cn/mryens/" + randomString(5) + "/"
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "错误"
	}
	req.Header.Add("User-Agent", GetRandomUserAgent())
	req.Header.Add("Cookie", cookie)

	resp, err := httpClient.Do(req)
	if err != nil {
		return "错误"
	}
	// Registered before reading so the body is closed even if ReadAll fails.
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "错误"
	}
	return string(body)
}

// Initialize fetches one page, extracts every keywords meta value, appends
// each one (terminated by "\r\n") to baidu.txt and prints it.
func Initialize() {
	heads := keywordsRe.FindAllStringSubmatch(getUrlRespHtml(), -1)
	if len(heads) == 0 {
		return
	}

	// O_WRONLY is required for writes to succeed (O_APPEND alone opens the
	// file read-only); O_CREATE makes the first run work without the file.
	fo, err := os.OpenFile("baidu.txt", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		fmt.Println("os.OpenFile err", err)
		return
	}
	defer func() {
		if err := fo.Close(); err != nil {
			fmt.Println("fo.Close err", err)
		}
	}()

	for _, v := range heads {
		if _, err := fo.Write([]byte(v[1] + "\r\n")); err != nil {
			fmt.Println("fo.Write err", err)
			return
		}
		fmt.Println(v[1])
	}
}

// randomInt returns an int >= min and < max. If max <= min it returns min.
func randomInt(min, max int) int {
	if max <= min {
		return min
	}
	return min + randIntn(max-min)
}

// randomString returns a random string of n characters drawn from A-Z.
// It returns "" when n <= 0.
func randomString(n int) string {
	if n <= 0 {
		return ""
	}
	buf := make([]byte, n)
	for i := range buf {
		// Upper bound is exclusive, so 'Z'+1 is needed to include 'Z'.
		buf[i] = byte(randomInt('A', 'Z'+1))
	}
	return string(buf)
}

// testwork is the task body submitted to the pool: one crawl iteration.
func testwork() error {
	Initialize()
	return nil
}

// main runs the crawler with 100 workers, feeding tasks forever.
func main() {
	p := NewGroutinePool(100)
	go func() {
		for {
			p.EntryTask <- NewTask(testwork)
		}
	}()
	p.Run()
}
main函数里的
p:=NewGroutinePool(100)
表示开启 100 个工作协程(goroutine)并发爬取数据
爬虫的主业务函数是
Initialize()
,其他的是线程池实现代码和随机字符函数
最后来一张运行golang多线程爬虫过程截图:
转载请注明来自本站(66娱乐网)
66优乐网 » golang多线程爬虫实例
66优乐网 » golang多线程爬虫实例