# 声明:本文仅供学习爱好者参考,请勿用于商业用途或恶意攻击源网站,本文的最终解释权归作者所有。
# 本文没有使用爬虫框架,仅用了三个 Python 常用库(requests、lxml、os)。
# 本文适合新手参考,代码中有大量注释,便于理解。
# 爬喜马拉雅
import requests
from lxml import etree
import os
# Request headers sent with every HTTP call: a desktop Chrome User-Agent plus
# a Referer pointing at the site's home page.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    ),
    "Referer": "https://www.ximalaya.com/",
}

# Start URL: page one of the category listing (per the original author's note,
# the site omits the "p1" segment by default).
# Example track URL, laid out as /<category>/<album id>/<track id>:
#   https://www.ximalaya.com/youshengshu/14968275/83332135
# To crawl a different category, change the URL below.
url = "https://www.ximalaya.com/youshengshu/p1/"
# Album page handler: downloads every track listed on an album page.
def _safe_filename(name):
    """Return ``name`` with characters that are illegal in file names replaced by "_".

    Album and track titles are taken verbatim from the page, so they may
    contain path separators or other characters that file systems reject.
    """
    return "".join("_" if ch in '\\/:*?"<>|' else ch for ch in name)


def listing(url):
    """Download every track of an album, page by page.

    Each album page is fetched and parsed. Every track link found on it is
    converted into an external-link URL of the form
    ``https://link.hhtjim.com/ximalaya/<track id>.mp3``, downloaded through
    ``linking`` and written to ``./<album title>/<track title>.mp3``.
    The "next page" link is then followed until the page has none.

    Args:
        url: URL of the first album page to process.
    """
    site_root = "https://www.ximalaya.com"
    # Iterate over the pages instead of recursing once per page, so earlier
    # pages' parsed trees are not kept alive on the call stack.
    while True:
        page = etree.HTML(requests.get(url=url, headers=headers).text)
        # Track page links of the current page.
        track_hrefs = page.xpath('//div[@class="sound-list _yo5_"]/ul/li/div[@class="text _yo5_"]/a/@href')
        # Track titles of the current page.
        track_names = page.xpath('//div[@id="anchor_sound_list"]/div/ul/li/div[@class="text _yo5_"]/a/span/text()')
        # Album title; used as the name of the download directory.
        titles = page.xpath('//div[@class="info _J460"]/h1/text()')
        if not titles:
            # No title means no target directory; skip this album instead of
            # failing with IndexError.
            print("未找到专辑名称,跳过:%s" % url)
            return
        all_name = titles[0]

        album_dir = os.path.join(".", _safe_filename(all_name))
        os.makedirs(album_dir, exist_ok=True)

        # Links and titles are matched by position; zip stops at the shorter list.
        for href, track_name in zip(track_hrefs, track_names):
            # The last path segment of the track link is the track id.
            song_id = href.split("/")[-1]
            # Tracks are fetched from the external-link site, not the detail page.
            content = linking("https://link.hhtjim.com/ximalaya/" + song_id + ".mp3")
            target = os.path.join(album_dir, _safe_filename(track_name) + ".mp3")
            with open(target, "wb") as f:
                f.write(content)
            print("%s,%s下载成功" % (all_name, track_name))

        # Follow the "next page" link if the pager offers one.
        next_href = page.xpath('//div[@class="pagination _yo5_"]/nav/ul/li[@class="page-next page-item _dN2"]/a/@href')
        if not next_href:
            break
        url = site_root + next_href[0]
# External-link page handler
def linking(url):
    """Fetch ``url`` with the shared request headers and return the raw response body (bytes)."""
    return requests.get(url=url, headers=headers).content
# Category index page handler
def category_index(url):
    """Walk a category index and pass every album found on it to ``listing``.

    Starting at ``url``, each index page is fetched and parsed, the album
    links on it are printed and then processed one by one, and the
    "next page" link is followed until the pager has none.

    Args:
        url: URL of the first category index page (e.g. ``/youshengshu/p1/``).
    """
    site_root = "https://www.ximalaya.com"
    while True:
        html = requests.get(url=url, headers=headers).text
        tree = etree.HTML(html)

        # Relative album links on this index page, e.g. /youshengshu/25407248/
        album_hrefs = tree.xpath('//div[@class="content"]/ul/li/div/a[@class="album-title line-1 lg bold _qie"]/@href')
        print(album_hrefs)

        # Turn each relative link into an absolute album URL and crawl it.
        for href in album_hrefs:
            listing(site_root + href)

        # Relative link to the next index page, e.g. /youshengshu/p2/
        pager = tree.xpath('//div[@class="pagination-wrap"]/nav/ul/li[@class="page-next page-item _dN2"]/a/@href')
        if not pager:
            break
        url = site_root + pager[0]
# Entry point: start the crawl from the category index page configured in `url`.
category_index(url)