0x00: Parsing web pages with XPath
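Before the full script, here is a minimal sketch of the core pattern it relies on: build an element tree with lxml's etree.HTML and pull out attributes and text with XPath expressions. The class name and id below are the ones the script queries, but the HTML snippet itself is a made-up placeholder, not markup from the real site.

# Minimal, self-contained sketch of the lxml + XPath pattern used in the script below.
# The HTML snippet is an illustrative placeholder only.
from lxml import etree

sample_html = '''
<div class="main-image"><a href="/p/2"><img src="https://example.com/a/01.jpg"/></a></div>
<ul id="pins">
  <li><span><a href="https://example.com/12345">Some title</a></span></li>
</ul>
'''

html = etree.HTML(sample_html)                                      # parse the fragment
img_src = html.xpath('//div[@class="main-image"]//img/@src')[0]    # attribute value
links = html.xpath('//ul[@id="pins"]/li/span/a')                   # element nodes
for a in links:
    print(a.xpath('./@href')[0], a.xpath('text()')[0])             # href + link text
print(img_src)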
# coding: utf-8
import requests
import os
import re
from lxml import etree
import time

def get_title(title):  # create the storage folder for a gallery title and return its path
    path = r"./Pic/" + title
    if os.path.exists(path):  # folder already exists, just return it
        return path
    else:
        os.makedirs(path)  # create an empty folder
        return path
def pic_get(info):  # download every image of one gallery
    url = info['url']
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0",
        "Referer": url  # the site rejects requests that do not carry a matching Referer
    }
    store_path = info['path']
    total = info['all']
    print("Fetching " + store_path + " ******************")
    for i in range(1, total + 1):
        i_str = str(i)
        final_url = url + "/" + i_str  # page i of the gallery
        response = requests.get(final_url, headers=header)
        data = response.content.decode('utf-8')
        try:
            html = etree.HTML(data)
            img_url = html.xpath("//div[@class=\"main-image\"]//img")[0].xpath("./@src")[0]
            response = requests.get(img_url, headers=header)
            if response.status_code == 200:
                with open(store_path + "/" + i_str + '.jpg', "wb") as fp:
                    fp.write(response.content)
                print(img_url)
            time.sleep(0.5)
        except Exception:
            pass  # skip pages whose image cannot be parsed or downloaded
    return
def url_create(url_path, type):
    # generate the list of listing-page URLs from the saved front page
    if type == 'main_url':
        print("Collecting all reachable listing pages....")
        parser = etree.HTMLParser(encoding="utf-8")
        html = etree.parse(url_path, parser)  # parse the local copy saved by main()
        num = html.xpath("//div[@class=\"nav-links\"]/a[4]")[0].xpath('text()')[0]
        main_url = []
        for i in range(1, int(num) - 10):
            tmp_url = "https://www.xxxx.com/tag/xxx/page/" + str(i)
            main_url.append(tmp_url)
        return main_url
    # collect the gallery links listed on one page
    if type == 'pic_url':
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
        }
        response = requests.get(url_path + "/", headers=header)
        data = response.content.decode("utf-8")
        html = etree.HTML(data)
        lis = html.xpath("//ul[@id=\"pins\"]/li/span/a")
        pic_info = []
        for li in lis:
            tmp_url = li.xpath("./@href")[0]
            title = li.xpath("text()")[0]
            pre_rule = r"[:,.<>'\":]"
            title = re.sub(pre_rule, '-', title)  # replace characters that are illegal in folder names
            path = get_title(title)  # create the storage folder
            info = {
                "path": path,
                "url": tmp_url
            }
            pic_info.append(info)
        return pic_info
    # find out how many images each gallery offers
    if type == 'title_url':
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
        }
        response = requests.get(url_path + "/1", headers=header)
        data = response.content.decode("utf-8")
        html = etree.HTML(data)
        total = html.xpath("//div[@class=\"pagenavi\"]/a/span")[4].xpath("text()")[0]
        return int(total)
def main():
    # fetch the front page first and save it locally for parsing
    url = "https://www.xxxxxx.com/tag/xxxxxx/"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
    }
    response = requests.get(url, headers=header)
    data = response.content.decode("utf-8")
    with open(r"./1.html", "w", encoding="utf-8") as fp:
        fp.write(data)
    # build the list of listing pages we are allowed to visit
    url_path = r"./1.html"
    main_url = url_create(url_path, 'main_url')  # all reachable listing pages
    time.sleep(1)
    # visit each listing page and collect every gallery link on it
    pic_url = []
    for page_url in main_url:
        tmp_url = url_create(page_url, 'pic_url')
        pic_url.append(tmp_url)
        # print(pic_url)
        time.sleep(1)  # slow down so the server does not refuse to respond
    # look up how many images each gallery contains
    for first in pic_url:
        for second in first:
            total = url_create(second['url'], "title_url")
            second['all'] = total
            time.sleep(0.5)
    print("All information collected, starting the downloads!!!!\n")
    print(pic_url)
    for first in pic_url:
        for second in first:
            pic_get(second)
            time.sleep(0.5)

if __name__ == '__main__':
    main()
0x01: Extracting page data with regular expressions
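Again, a minimal sketch of the core idea first: re.findall with a capture group pulls the interesting pieces straight out of the raw HTML string. The snippet and the tag/class names below are placeholders for illustration, not the real markup of the target site.

# Minimal sketch of extracting data from raw HTML with re.findall.
# The HTML snippet and tag/class names are illustrative placeholders only.
import re

sample_html = '''
<h2 class="main-title">Some gallery title</h2>
<img src="https://example.com/a/01.jpg" alt="" />
<a href="#"><span>42</span></a>
'''

title = re.findall(r'<h2 class="main-title">(.*?)</h2>', sample_html)
img = re.findall(r'<img src="(.*?)"', sample_html, re.DOTALL)
count = re.findall(r'<span>(\d+)</span>', sample_html)
print(title[0], img[0], count[0])  # -> Some gallery title https://example.com/a/01.jpg 42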
# -*- coding: utf-8 -*-
import re
import requests
from multiprocessing import Pool
import time
import os

def get_Pic(url):  # download every image of one gallery
    print(url)
    header = {
        "Referer": url,  # the site rejects requests that do not carry a matching Referer
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    response = requests.get(url, headers=header)
    data = response.content.decode()
    # Note: the HTML tags inside these three patterns were swallowed when the post was
    # rendered, so only the fragments below survive. The original patterns matched the
    # gallery title, the first image URL, and the page-count element respectively.
    title = re.findall(r'(.*?)', data)
    Pic_url = re.findall(r'<img\ssrc=" (.*?)".*?="">', data, re.DOTALL)
    max = re.findall(r'(.*?)', data, re.DOTALL)  # the "…" part of this pattern was lost
    # create the storage folder
    path = "./Pic/" + title[0]
    if os.path.exists(path):
        print("Images will be stored in: " + path)
    else:
        print("Created storage folder " + path)
        os.makedirs(path)
    #############
    # start downloading the images
    for i in range(1, int(max[0]) + 1):
        if i < 10:
            i_str = "0" + str(i)  # image names are zero-padded, e.g. 01.jpg
        else:
            i_str = str(i)
        pic_url = Pic_url[0][:-6] + i_str + ".jpg"  # swap the trailing "NN.jpg" for the current index
        print("Downloading " + pic_url)
        try:
            response = requests.get(pic_url, headers=header)
            store_path = path + "/" + i_str + ".jpg"
            with open(store_path, "wb") as fp:
                fp.write(response.content)
            time.sleep(0.5)
        except Exception:
            print(pic_url + " failed, moving on to the next image")
    return
def get_Url(url):  # collect every gallery link on one listing page
    header = {
        "Referer": url,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    response = requests.get(url, headers=header)
    data = response.content.decode()
    all_url = re.findall(r"https://www.xxxxxx.com/\d{4,6}", data)
    return list(set(all_url))  # deduplicate before returning
def get_wight():  # ask the user for the page range to crawl
    print("Example range: 4-10 crawls page four through page ten.")
    in_ = input("Enter the page range to crawl (too many pages may get the service blocked, 10 pages at most): ")
    # the original pattern r".*(\d{1,2}).(\d{1,2}).*" kept only the last digit of the first
    # number, so two-digit start pages were parsed wrong; this pattern keeps both digits
    wight = re.findall(r"(\d{1,2})\D+(\d{1,2})", in_)
    if wight == []:
        print("Invalid crawl range!")
        exit(0)
    else:
        (start, end) = wight[0]
        start = int(start)
        end = int(end)
        if start <= 0 or start > end:
            print("Please enter the range again.")
            exit(0)
        elif end > 230:
            print("The upper bound exceeds the maximum page count.")
            exit(0)
        elif end - start > 10:
            print("The range is too wide, please enter it again.")
            exit(0)
    return (start, end)
def main():
    (start, end) = get_wight()
    urls = []
    for i in range(start, end + 1):
        i_str = str(i)
        url = "https://www.xxxxx.com/page/%s/" % i_str
        # print(url)
        url_list = get_Url(url)
        time.sleep(1)  # sleep one second so we do not hit the server too fast
        urls.append(url_list)
    pool = Pool(15)  # create the process pool
    for url_list in urls:
        for url in url_list:
            next_one = pool.apply_async(get_Pic, args=(url,))
            time.sleep(0.5)
    next_one.wait()  # waits only for the last submitted task; close()/join() below wait for the rest
    print("Waiting for all child processes to finish")
    pool.close()
    pool.join()
    print("Image download complete")

if __name__ == '__main__':
    main()
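One design note on the process pool: apply_async returns an AsyncResult per task, and waiting on only the last one is not enough by itself; it is close() followed by join() that actually blocks until every worker has finished. A minimal sketch of that pattern (the work() function here is a stand-in, not the scraper above):

# Minimal sketch of the apply_async / close / join pattern used in main().
# work() is a placeholder task, not part of the scraper.
from multiprocessing import Pool
import time

def work(n):
    time.sleep(0.1)          # pretend to download something
    return n * n

if __name__ == '__main__':
    pool = Pool(4)
    results = [pool.apply_async(work, args=(i,)) for i in range(10)]
    pool.close()             # no more tasks will be submitted
    pool.join()              # block until every worker has finished
    print([r.get() for r in results])   # collect the individual results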
end:
I had seen plenty of people scrape this site before, and since I had just learned web crawling I gave it a try myself. I hit a few pitfalls along the way, but grew a lot from them. Onward!