Learning Python crawlers: scraping an image site


0x00: Parsing the pages with XPath
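The idea in this version: request each gallery page, build an element tree with lxml's etree.HTML, and pull the image address out with an XPath query. A minimal sketch of that single step, shown before the full script (the URL below is a placeholder; the XPath expression is the same one the script uses):

# minimal sketch: grab one image URL from a gallery page with XPath
import requests
from lxml import etree

page_url = "https://www.example.com/some-gallery/1"   # placeholder URL
header = {"User-Agent": "Mozilla/5.0", "Referer": page_url}

data = requests.get(page_url, headers=header).content.decode("utf-8")
html = etree.HTML(data)
img_urls = html.xpath('//div[@class="main-image"]//img/@src')
if img_urls:
    print(img_urls[0])   # direct link to the picture on this page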

            
#coding: utf-8
import requests
import os
import re
from lxml import etree
import time

def get_title(title):  # build the storage folder for one gallery title
    path=r"./Pic/"+title

    if os.path.exists(path):   # folder already exists, return it
        return path
    else:
        os.makedirs(path)   # create an empty folder
        return path

def pic_get(info):  # download every picture of one gallery
    url = info['url']
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0",
        "Referer": url
    }
    store_path=info['path']
    all=info['all']
    print("Fetching "+store_path+"******************")
    for i in range (1,all+1):
        i_str=str(i)
        finall_url=url+"/"+i_str
        response=requests.get(finall_url,headers=header)
        data=response.content.decode('utf-8')
        try:
            html=etree.HTML(data)
            img_url=html.xpath("//div[@class=\"main-image\"]//img")[0].xpath("./@src")[0]
            response=requests.get(img_url,headers=header)
            if response.status_code==200:
                data=response.content
                with open(store_path+"/"+i_str+'.jpg',"wb+") as fp:
                    fp.write(data)   # the with-block closes the file itself
                print(img_url)
            time.sleep(0.5)
        except Exception as e:   # skip a broken page instead of aborting the whole gallery
            print("page %s failed: %s" % (i_str, e))
    return

def url_create(url_path,type):
    # generate the list of listing-page URLs
    if type=='main_url':
        print("Collecting all reachable listing pages....")
        parser=etree.HTMLParser(encoding="utf-8")
        html=etree.parse(url_path,parser)
        num=html.xpath("//div[@class=\"nav-links\"]/a[4]")[0].xpath('text()')[0]

        main_url=[]
        for i in range(1,int(num)-10):
            tmp_url="https://www.xxxx.com/tag/xxx/page/"+str(i)
            main_url.append(tmp_url)
        return main_url


    # collect the gallery links on one listing page
    if type=='pic_url':
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
        }
        response=requests.get(url_path+"/",headers=header)
        data=response.content.decode("utf-8")
        html=etree.HTML(data)
        lis=html.xpath("//ul[@id=\"pins\"]/li/span/a")

        pic_info=[]
        for li in lis:
            tmp_url=li.xpath("./@href")[0]
            title=li.xpath("text()")[0]
            pre_rul=r"[:,.<>'\":]"   # characters that are not allowed in a folder name
            title=re.sub(pre_rul,'-',title)
            path=get_title(title)  # create the folder for this gallery
            info={
                "path":path,
                "url":tmp_url
            }
            pic_info.append(info)
        return pic_info


    # check how many pictures each gallery offers for download
    if type=='title_url':
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
        }
        response = requests.get(url_path+"/1", headers=header)
        data = response.content.decode("utf-8")
        html = etree.HTML(data)
        all = html.xpath("//div[@class=\"pagenavi\"]/a/span")[4].xpath("text()")[0]
        return int(all)


def main():
    # first visit the landing page and save it, to get the basic parameters
    url="https://www.xxxxxx.com/tag/xxxxxx/"
    header={
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
    }
    response=requests.get(url,headers=header)
    data=response.content.decode("utf-8")
    with open(r"./1.html","w+",encoding="utf-8") as fp:
        fp.write(data)   # the with-block closes the file itself

    # generate the list of listing-page URLs from the saved landing page
    url_path=r"./1.html"
    main_url=url_create(url_path,'main_url')   # every listing page that can be visited
    time.sleep(1)

    # visit each listing page and collect every gallery link on it
    pic_url=[]
    for page_url in main_url:
        tmp_url=url_create(page_url,'pic_url')
        pic_url.append(tmp_url)
    #print(pic_url)
    time.sleep(1)   # slow down so the server does not refuse to respond

    # for every gallery, look up how many pictures it holds
    for first in pic_url:
        for seconde in first:
            all=url_create(seconde['url'],"title_url")
            seconde['all']=all
            time.sleep(0.5)
    print("All information collected, starting to download pictures!\n")
    print(pic_url)

    for first in pic_url:
        for seconde in first:
            pic_get(seconde)
            time.sleep(0.5)

if __name__ == '__main__':
    main()
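Two notes on the script above: the landing page is written to 1.html and then read back with etree.parse, so the link-generation step can be re-run without hitting the site again; and a failed download simply skips to the next page. A small retry helper is one way to make failures visible, sketched here (the function name and parameters are my own, not part of the original script):

# hypothetical helper: retry a download a few times instead of silently dropping it
import time
import requests

def fetch_with_retry(url, header, retries=3, delay=1.0):
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=header, timeout=10)
            if response.status_code == 200:
                return response.content
        except requests.RequestException as e:
            print("attempt %d failed for %s: %s" % (attempt, url, e))
        time.sleep(delay)
    return None   # the caller decides what to do after all retries fail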

            
          

0x01: Extracting the page data with regular expressions:
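Same crawl, but this time the page source is treated as plain text and the pieces are pulled out with re.findall instead of XPath. A minimal sketch of the technique (the pattern below is only illustrative; the real one has to match the site's markup):

# minimal sketch: pull image URLs out of raw HTML with a regular expression
import re
import requests

page_url = "https://www.example.com/some-gallery/1"   # placeholder URL
header = {"User-Agent": "Mozilla/5.0", "Referer": page_url}

data = requests.get(page_url, headers=header).content.decode("utf-8")
img_urls = re.findall(r'<img[^>]+src="([^"]+\.jpg)"', data, re.DOTALL)
print(img_urls[:5])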

            
#-*-coding:utf-8 -*-
import re
import requests
from multiprocessing import Pool
import time
import os

def get_Pic(url):
    print(url)
    header = {
        "Referer": url,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    response = requests.get(url, headers=header)
    data = response.content.decode()
    # pull out the gallery title, the first image URL and the total page count;
    # the exact patterns have to match the site's markup
    title=re.findall(r'<h1[^>]*>(.*?)</h1>',data)
    Pic_url=re.findall(r'<div class="main-image">.*?<img src="(.*?)"',data,re.DOTALL)
    max=re.findall(r'<div class="pagenavi">.*<span>(\d+)</span>',data,re.DOTALL)

    # create the storage folder
    path="./Pic/"+title[0]
    if os.path.exists(path):
        print("Pictures will be stored in "+path)
        pass
    else:
        print("Created storage folder "+path)
        os.makedirs(path)

    #############
    # start downloading the pictures
    for i in range(1,int(max[0])+1):
        if i<10:
            i_str="0"+str(i)
        else:
            i_str=str(i)
        pic_url=Pic_url[0][:-6]+i_str+".jpg"   # swap the page number into the image URL
        print("Downloading "+pic_url)
        try:
            response=requests.get(pic_url,headers=header)
            store_path=path+"/"+i_str+".jpg"
            with open(store_path,"wb+") as fp:
                fp.write(response.content)   # the with-block closes the file itself
            time.sleep(0.5)
        except:
            print(pic_url+" failed, moving on to the next picture")
            pass
    return

def get_Url(url):
    header={
        "Referer": url,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    response=requests.get(url,headers=header)
    data=response.content.decode()
    all_url=re.findall(r"https://www.xxxxxx.com/\d{4,6}",data)
    return list(set(all_url))   # deduplicate before returning

def get_wight():
    print("Example of a page range: 4-10 crawls page four through page ten.")
    in_ = input("Enter the page range to crawl (too many pages may stall the service, at most 10): ")
    wight = re.findall(r".*(\d{1,2}).(\d{1,2}).*", in_, re.DOTALL)
    if wight == []:
        print("Invalid page range!")
        exit(0)
    else:
        (start, end) = wight[0]
        start = int(start)
        end = int(end)
        if start <= 0 or start > end:
            print("Please enter the range again.")
            exit(0)
        elif end > 230:
            print("The end of the range is beyond the last page.")
            exit(0)
        elif end - start > 10:
            print("The range is too wide, please enter it again.")
            exit(0)
    return (start,end)

def main():
    (start,end)=get_wight()
    urls=[]
    for i in range(start,end+1):
        i_str=str(i)
        url="https://www.xxxxx.com/page/%s/"% i_str
        #print(url)
        url_list=get_Url(url)
        time.sleep(1)   # sleep one second so the requests do not come too fast
        urls.append(url_list)

    pool=Pool(15)   # create the process pool
    for url_list in urls:
        for url in url_list:
            next_one=pool.apply_async(get_Pic,args=(url,))
            time.sleep(0.5)
    next_one.wait()
    print("Waiting for all child processes to finish")
    pool.close()
    pool.join()
    print("All pictures downloaded")

if __name__ == '__main__':
    main()
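One note on the process pool: apply_async returns an AsyncResult immediately, so the single next_one.wait() above only blocks on the last task that was submitted; it is the pool.close() / pool.join() pair that actually waits for everything. If each download needs to be checked individually, collect the results as they are submitted. A sketch of that pattern (the names and URLs are illustrative):

# sketch: keep every AsyncResult so each task can be checked on its own
from multiprocessing import Pool

def download(url):
    # stands in for the real per-gallery download function
    return url

if __name__ == '__main__':
    urls = ["https://www.example.com/1", "https://www.example.com/2"]   # placeholders
    with Pool(4) as pool:
        results = [pool.apply_async(download, args=(u,)) for u in urls]
        for r in results:
            print(r.get())   # .get() re-raises any exception from the worker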

end:

A lot of people have already scraped this site, and since I had just learned some crawling I used it to get my hands dirty. I hit a few pitfalls along the way and learned a lot from them. Keep it up!

