Python urllib2 Multithreading

This article shows how to use urllib2 together with the threading module to make your crawler fetch pages with multiple threads.

#!/usr/bin/env python
# encoding: utf-8
import urllib2
from threading import Thread, Lock
from Queue import Queue
import time


class Fetcher:
    def __init__(self, threads):
        self.opener = urllib2.build_opener(urllib2.HTTPHandler)
        self.lock = Lock()    # protects self.running
        self.q_req = Queue()  # pending URLs
        self.q_ans = Queue()  # finished (url, content) pairs
        self.threads = threads
        self.running = 0      # requests currently in flight
        for i in range(threads):
            t = Thread(target=self.get_thread)
            t.setDaemon(True)  # daemon threads won't block interpreter exit
            t.start()

    def __del__(self):  # on teardown, wait for both queues to drain
        time.sleep(0.5)
        self.q_req.join()
        self.q_ans.join()

    def get_task(self):
        # work left = queued + in flight + finished but not yet popped
        return self.q_req.qsize() + self.q_ans.qsize() + self.running

    def push(self, req):
        self.q_req.put(req)

    def pop(self):
        ans = self.q_ans.get()
        self.q_ans.task_done()  # without this, q_ans.join() blocks forever
        return ans

    def get_thread(self):
        while True:
            req = self.q_req.get()
            with self.lock:  # counter update must be atomic
                self.running += 1
            try:
                ans = self.opener.open(req).read()
            except Exception as what:
                ans = ''
                print what
            self.q_ans.put((req, ans))
            with self.lock:
                self.running -= 1
            self.q_req.task_done()
            time.sleep(0.1)  # small pause so we don't hammer the server


if __name__ == "__main__":
    links = ['http://www.verycd.com/topics/%d/' % i for i in range(5420, 5430)]
    f = Fetcher(threads=10)
    for url in links:
        f.push(url)
    while f.get_task():  # loop until every pushed URL has been handled
        url, content = f.pop()
        print url, len(content)
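
Note that urllib2 and Queue are Python 2 modules; on Python 3 they were renamed to urllib.request and queue, print became a function, and Thread(..., daemon=True) replaces setDaemon(True). A minimal sketch of the renamed pieces (my own port, not part of the original post; the Fetcher logic itself is otherwise unchanged):

# Python 3 equivalents of the imports used above (a sketch, not from the
# original post); the producer/consumer pattern stays the same.
import urllib.request            # replaces urllib2
from queue import Queue          # replaces Queue
from threading import Thread, Lock
import time

opener = urllib.request.build_opener(urllib.request.HTTPHandler)
content = opener.open('http://www.verycd.com/topics/5420/').read()  # bytes, not str
print(len(content))              # print is a function on Python 3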

For more crawler tips, follow the portal link.
