1. Python多线程爬虫
在批量去爬取数据的时候,往往效率会很低,这个时候我们可以用到多线程的技术。
python是支持多线程的, 主要是通过threading模块(以及底层的_thread模块, Python 2中名为thread)来实现的。
单线程爬虫效率相对来说会低很多,例如:
import requests
from bs4 import BeautifulSoup
import time

start_time = time.time()


def main():
    """Sequentially crawl 5 pages of CSDN search results for "python".

    For every link inside a 'limit_width' div, print the link text and
    href, skipping entries whose text contains "CSDN" (site chrome).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }
    # Create ONE session and reuse it for all pages: the original built a
    # new requests.session() on every iteration, losing connection reuse.
    s = requests.session()
    for page in range(1, 6):
        url = "https://so.csdn.net/so/search/s.do?p=" + str(page) + "&q=python"
        html = s.get(url, headers=headers)
        html.encoding = "utf-8"
        soup = BeautifulSoup(html.text, "html.parser")
        # The original re-parsed the same 'limit_width' divs through
        # BeautifulSoup three times (soup -> soup1 -> soup2); a single
        # find_all plus a nested search for anchors is equivalent.
        for div in soup.find_all('div', class_='limit_width'):
            for a in div.find_all('a'):
                text = a.get_text()
                # .get avoids a KeyError on anchors without an href
                # (the original indexed a["href"] unconditionally).
                href = a.get("href", "")
                if "CSDN" not in text:
                    print(text)
                    print(href)


main()
end = time.time()
print(end - start_time)
#运行结果:
#......
#Time-Cost:2.061112642288208
然后我们尝试用多线程的方法,执行同样的爬取内容,如下所示:
# coding=utf-8
import threading
import queue
import time
import urllib

from bs4 import BeautifulSoup
from urllib import request  # kept from original; not used below

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}

baseUrl = "https://so.csdn.net/so/search/s.do?p="

# Fill a thread-safe queue with the 5 search-result page URLs.
urlQueue = queue.Queue()
for i in range(1, 6):
    urlQueue.put(baseUrl + str(i) + "&q=python")


def fetchUrl(urlQueue):
    """Worker: drain `urlQueue`, fetching and printing each page's links.

    Pulls URLs until the queue is empty, then returns.  For every link
    inside a 'limit_width' div, prints text and href, skipping entries
    whose text contains "CSDN".
    """
    # One session per worker thread; the original created a new session
    # for every URL.  (requests.Session is not shared across threads.)
    session = requests.session()
    while True:
        try:
            # Non-blocking read of queue data; raises queue.Empty when
            # exhausted (the original caught a too-broad Exception here).
            url = urlQueue.get_nowait()
        except queue.Empty:
            break
        try:
            html = session.get(url, headers=headers)
            html.encoding = "utf-8"
            soup = BeautifulSoup(html.text, "html.parser")
            # One parse is enough: the original re-parsed the same divs
            # through BeautifulSoup three times.
            for div in soup.find_all('div', class_='limit_width'):
                for a in div.find_all('a'):
                    text = a.get_text()
                    href = a.get("href", "")  # tolerate <a> without href
                    if "CSDN" not in text:
                        print(text)
                        print(href)
            print("已爬取完毕!")
        except Exception as e:
            # Best-effort like the original, but report instead of the
            # silent `except: pass` that hid every failure.
            print("fetch failed:", url, e)


if __name__ == '__main__':
    startTime = time.time()
    print("这是主线程:", threading.current_thread().name)
    threads = []
    # 可以调节线程数, 进而控制抓取速度
    threadNum = 5
    for i in range(threadNum):
        # 创建一个线程
        t = threading.Thread(target=fetchUrl, args=(urlQueue,))
        threads.append(t)
    print(threads)
    for t in threads:
        t.start()
    for t in threads:
        # Join every worker so the main thread exits last and no thread
        # blocks another (joins only order the main thread's wait).
        t.join()
    endTime = time.time()
    print("主线程结束:", threading.current_thread().name)
    print('Done, Time cost: %s ' % (endTime - startTime))
#运行结果:
#这是主线程: MainThread
#Python游戏开发入门
#https://edu.csdn.net/course/detail/5690
#Python, Python, Python
#https://blog.csdn.net/ww_great/article/details/3057071
#......
#已爬取完毕!
#主线程结束: MainThread
#Time cost: 0.7241780757904053
设置threadNum = 2的话,也就是将线程数设置为2(队列中的URL数量不变),那么速度会大大降低。
我们运行一下,发现 Time cost: 1.3654978275299072