Python 爬取豆瓣电影:最简单、最暴力的做法——直接调用 API
首先是 API 地址(去官网溜达一圈,很容易就能找到):
# Fill the tag and the page size into the search URL, then issue the GET request.
url = 'https://movie.douban.com/j/search_subjects?type=movie&tag={}&sort=recommend&page_limit={}&page_start=0'.format(tag, page)
requests.get(url)
使用 requests 发送 GET 请求即可拿到 JSON 数据(一次请求可以抓取很多条,所以没必要循环抓取,User-Agent 也只需准备一个即可)。然后导入 json 包解析返回的 JSON 数据;这里需要先将响应内容的编码改为 utf-8,否则中文会乱码。返回的数据示例如下:
{
"subjects"
:
[
{
"rate"
:
"8.7"
,
"cover_x"
:
1500
,
"title"
:
"寄生虫"
,
"url"
:
"https:\/\/movie.douban.com\/subject\/27010768\/"
,
"playable"
:
false
,
"cover"
:
"https://img3.doubanio.com\/view\/photo\/s_ratio_poster\/public\/p2561439800.webp"
,
"id"
:
"27010768"
,
"cover_y"
:
2138
,
"is_new"
:
false
}
,
{
"rate"
:
"7.7"
,
"cover_x"
:
1000
,
"title"
:
"极限逃生"
,
"url"
:
"https:\/\/movie.douban.com\/subject\/30210691\/"
,
"playable"
:
false
,
"cover"
:
"https://img3.doubanio.com\/view\/photo\/s_ratio_poster\/public\/p2563546656.webp"
,
"id"
:
"30210691"
,
"cover_y"
:
1425
,
"is_new"
:
false
}
,
{
"rate"
:
"7.5"
,
"cover_x"
:
1080
,
"title"
:
"爱哭鬼上学记"
,
"url"
:
"https:\/\/movie.douban.com\/subject\/34781114\/"
,
"playable"
:
false
,
"cover"
:
"https://img1.doubanio.com\/view\/photo\/s_ratio_poster\/public\/p2564498289.webp"
,
"id"
:
"34781114"
,
"cover_y"
:
1599
,
"is_new"
:
true
}
,
{
"rate"
:
"6.2"
,
"cover_x"
:
2000
,
"title"
:
"大地震"
,
"url"
:
"https:\/\/movie.douban.com\/subject\/34800551\/"
,
"playable"
:
true
,
"cover"
:
"https://img3.doubanio.com\/view\/photo\/s_ratio_poster\/public\/p2568281066.webp"
,
"id"
:
"34800551"
,
"cover_y"
:
2667
,
"is_new"
:
true
}
,
{
"rate"
:
"7.9"
,
"cover_x"
:
3043
,
"title"
:
"骡子"
,
"url"
:
"https:\/\/movie.douban.com\/subject\/30135113\/"
,
"playable"
:
false
,
"cover"
:
"https://img1.doubanio.com\/view\/photo\/s_ratio_poster\/public\/p2563626309.webp"
,
"id"
:
"30135113"
,
"cover_y"
:
4500
,
"is_new"
:
false
}
,
{
"rate"
:
"5.9"
,
"cover_x"
:
4000
,
"title"
:
"X战警:黑凤凰"
,
"url"
:
"https:\/\/movie.douban.com\/subject\/26667010\/"
,
"playable"
:
false
,
"cover"
:
"https://img3.doubanio.com\/view\/photo\/s_ratio_poster\/public\/p2555886490.webp"
,
"id"
:
"26667010"
,
"cover_y"
:
5915
,
"is_new"
:
false
}
,
{
"rate"
:
"7.9"
,
"cover_x"
:
3600
,
"title"
:
"疾速备战"
,
"url"
:
"https:\/\/movie.douban.com\/subject\/26909790\/"
,
"playable"
:
false
,
"cover"
:
"https://img3.doubanio.com\/view\/photo\/s_ratio_poster\/public\/p2551393832.webp"
,
"id"
:
"26909790"
,
"cover_y"
:
5550
,
"is_new"
:
false
}
,
{
"rate"
:
"7.5"
,
"cover_x"
:
1872
,
"title"
:
"安娜"
,
"url"
:
"https:\/\/movie.douban.com\/subject\/27166976\/"
,
"playable"
:
false
,
"cover"
:
"https://img3.doubanio.com\/view\/photo\/s_ratio_poster\/public\/p2560205995.webp"
,
"id"
:
"27166976"
,
"cover_y"
:
2808
,
"is_new"
:
false
}
,
{
"rate"
:
"7.7"
,
"cover_x"
:
1500
,
"title"
:
"恶人传"
,
"url"
:
"https:\/\/movie.douban.com\/subject\/30211551\/"
,
"playable"
:
false
,
"cover"
:
"https://img3.doubanio.com\/view\/photo\/s_ratio_poster\/public\/p2555084871.webp"
,
"id"
:
"30211551"
,
"cover_y"
:
2145
,
"is_new"
:
false
}
,
{
"rate"
:
"6.0"
,
"cover_x"
:
1020
,
"title"
:
"扫毒2天地对决"
,
"url"
:
"https:\/\/movie.douban.com\/subject\/30171425\/"
,
"playable"
:
true
,
"cover"
:
"https://img3.doubanio.com\/view\/photo\/s_ratio_poster\/public\/p2561172733.webp"
,
"id"
:
"30171425"
,
"cover_y"
:
1428
,
"is_new"
:
false
}
]
}
最后将数据放入数组(列表)中,通过 pyecharts 实现数据可视化,生成 html 文件。当然,生成的图表可能不是很好看,自己可以再调整,比如居中之类的(我这里是手动改了生成之后的 html 部分代码),如图:
下面贴上完整代码:
import
json
import
requests
from
example
.
commons
import
Faker
from
pyecharts
import
options
as
opts
from
pyecharts
.
charts
import
Bar
def conn(page, tag):
    """Fetch Douban movie search results and chart their ratings.

    Requests the Douban ``search_subjects`` JSON endpoint for ``tag``, prints
    the raw response body and every movie's rating and title, then renders a
    bar chart of the ratings (axes swapped via ``reversal_axis``) to
    ``douban.html``.

    Args:
        page: Number of movies to request in one call; sent as ``page_limit``
            while ``page_start`` is always 0.
        tag: Douban category tag to search for, e.g. ``'最新'``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.get(
        'https://movie.douban.com/j/search_subjects',
        # Let requests build and percent-encode the query string so a tag
        # containing reserved characters cannot corrupt the URL.
        params={
            'type': 'movie',
            'tag': tag,
            'sort': 'recommend',
            'page_limit': page,
            'page_start': 0,
        },
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        },
        # Without a timeout a stalled connection would block forever.
        timeout=10,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    # Decode explicitly as UTF-8 (once) so Chinese titles are not garbled.
    body = response.content.decode('utf-8')
    print(body)

    # A response without "subjects" yields an empty chart rather than a crash.
    subjects = json.loads(body).get('subjects') or []

    fenshu = []  # ratings, in response order
    name = []  # movie titles, in response order
    for movie in subjects:
        rate = movie.get('rate')
        print(rate)
        fenshu.append(rate)
        title = movie.get('title')
        print(title)
        name.append(title)

    bar = Bar()
    bar.add_xaxis(name)
    bar.add_yaxis('分数', fenshu, stack="stack1", color=Faker.rand_color())
    # Swap the axes so the titles run along the vertical axis.
    bar.reversal_axis()
    bar.set_global_opts(
        title_opts=opts.TitleOpts(title="影视评分"),
        datazoom_opts=opts.DataZoomOpts(orient="vertical"),
    )
    bar.set_series_opts(
        label_opts=opts.LabelOpts(is_show=False),
        markpoint_opts=opts.MarkPointOpts(
            data=[
                opts.MarkPointItem(type_="max", name="最大值"),
                opts.MarkPointItem(type_="min", name="最小值"),
                opts.MarkPointItem(type_="average", name="平均值"),
            ]
        ),
    )
    bar.render('douban.html')
if __name__ == '__main__':
    # The first argument is how many movies to fetch in one request (large
    # values work; the author tried several thousand), always starting from
    # offset 0. The second argument is the category to fetch; `names` below
    # lists the categories that can be fetched -- just substitute one.
    conn(100, '最新')

# Categories that can be passed as the second argument of conn().
names = [
    '热门',
    '最新',
    '经典',
    '可播放',
    '豆瓣高分',
    '冷门佳片',
    '华语',
    '欧美',
    '日本',
    '动作',
    '喜剧',
    '爱情',
    '科幻',
    '悬疑',
    '恐怖',
    '成长',
]