Qt与Python脚本实战之一(爬虫)
-
Python环境安装
1.Python版本选择(2.7 or 3.6.x) 版本不同区别蛮大
2.安装pip 一个Python包管理工具 类似nodejs的npm(都是提供了海量第三方包)
-
编写python代码实现爬取.
1.需要用到的库有: Requests、lxml、python-pptx(代码中 import pptx 需要),如果没有安装请自行安装(pip install xxx)
2.IDE : pycharm or Qtcreator
3.python 版本: 3.6
4.代码实现的是多线程下载
-
实现功能
-
爬取指定网站的图片
-
按分类写入本地目录
-
按分类将本地图片写成ppt
-
QML界面展示爬取的图片内容
-
代码展示
# This Python file uses the following encoding: utf-8
# if __name__ == "__main__":
# pass
import sys
import requests
import os
import pathlib
import pptx
from pptx.util import Inches
from lxml import etree
from threading import *
from time import sleep
nMaxThread = 3 # Number of download threads allowed to run at the same time
# Semaphore with nMaxThread slots: startSplider acquires one slot before it
# starts a JinTu thread, and JinTu.run releases it when the thread finishes.
ThreadLock = BoundedSemaphore(nMaxThread)
# Headers shared by every HTTP request in this script; the per-request
# "Referer" dictionaries are merged with this one via heads.update(gHeads).
gHeads = {
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}
#开始将该目录下的图片进行插入到ppt操作
def writeppt(subDir):
    """Build a PowerPoint file from the JPEG images found in ``subDir``.

    Every file whose name ends in ``.jpg`` or ``.JPG`` becomes one slide
    (slide layout index 1).  The first placeholder of the slide is set to
    the file name without its extension and the picture is added at the
    top-left corner with a size of 10 x 7.5 inches.  The presentation is
    saved as ``<subDir>.pptx``, i.e. next to the directory itself.

    Args:
        subDir: Path of the directory that holds the images.
    """
    print('subDir: ', subDir)
    pptFile = pptx.Presentation()

    def stem(name):
        # File name without the final extension.
        return name[:name.rindex('.')]

    picFiles = [fn for fn in os.listdir(subDir) if fn.endswith(('.JPG', '.jpg'))]

    # Insert the pictures ordered by their name (string comparison).
    for fn in sorted(picFiles, key=stem):
        slide = pptFile.slides.add_slide(pptFile.slide_layouts[1])
        print('fn: ', fn)
        # Title text of the slide; not essential for the picture itself.
        slide.shapes.placeholders[0].text = stem(fn)
        # os.path.join instead of a hard-coded '\\' so the path is also
        # valid on non-Windows systems.
        fullfn = os.path.join(subDir, fn)
        # Position and size of the picture can be adjusted here.
        slide.shapes.add_picture(fullfn, Inches(0), Inches(0), Inches(10), Inches(7.5))

    pptFile.save("%s.pptx" % (subDir))
#开始轮询某个目录下的子目录
def lookupRootDir(root):
    """Call ``writeppt`` for every immediate sub-directory of ``root``.

    Plain files directly inside ``root`` are ignored; sub-directories are
    not searched recursively.

    Args:
        root: Directory whose sub-directories each hold one set of images.
    """
    dirs = os.listdir(root)
    print('dirs: ', dirs, root)
    for entry in dirs:
        # os.path.join instead of a hard-coded '\\': with a backslash the
        # joined path is not a valid path on non-Windows systems and
        # is_dir() would always be False there.
        full = os.path.join(root, entry)
        path = pathlib.Path(full)
        print('file: ', path.is_dir())
        if path.is_dir():
            writeppt(full)
#开始将该网站下的图片进行分析 下载等操作的类
class JinTu(Thread):
    """Worker thread that downloads one image into ``./photo/<dir>/``.

    ``run`` releases one slot of the module-level ``ThreadLock`` semaphore
    when it finishes, so the code that starts the thread must acquire a
    slot beforehand.
    """

    def __init__(self, mainReferer, url, title):
        """Store the download parameters.

        Args:
            mainReferer: URL of the page the image was found on; used as
                ``Referer`` header by ``GetPhotoUrl``.
            url: Absolute URL of the image to download.
            title: Raw attribute value the URL was derived from.  The
                first 20 characters are dropped to form the file name
                (presumably the ``../Style/PPT/Images/`` prefix -- TODO
                confirm against the real page content).
        """
        Thread.__init__(self)
        self.MainReferer = mainReferer
        self.url = url
        # File name: the raw value without its first 20 characters.
        self.title = title[20:]
        # Directory name: same start, additionally without the last 8
        # characters of the raw value.
        self.dir = title[20:-8]
        print('dir: ', self.dir)

    def run(self):
        """Download ``self.url`` and always give the semaphore slot back."""
        try:
            urlList = [self.url]
            if urlList:
                self.SavePath(urlList)
        finally:
            ThreadLock.release()

    def GetPhotoUrl(self):
        """Fetch ``self.url`` and collect the ``datasrc`` values of its divs.

        Returns:
            The list produced by the XPath query ``//div/@datasrc`` when the
            server answers with status 200, otherwise ``None``.
        """
        heads = {
            "Referer": self.MainReferer
        }
        heads.update(gHeads)
        html = requests.get(self.url, headers=heads)
        if html.status_code != 200:
            return None
        xmlContent = etree.HTML(html.text)
        urlList = xmlContent.xpath("//div/@datasrc")
        # urlList is a list, so it cannot be concatenated to a str directly.
        print('url list: ' + str(urlList))
        return urlList

    def SavePath(self, urlList):
        """Download every URL in ``urlList`` into ``./photo/<self.dir>/``.

        Each URL is requested up to 5 times while the server answers 404,
        with a 0.05 s pause between attempts.  A 200 response is written to
        the file ``self.title`` inside the target directory (every URL in
        the list is written to that same file name).  Any other status code
        aborts the whole method.

        Args:
            urlList: Iterable of image URLs.

        Returns:
            Always ``None``.
        """
        heads = {
            "Referer": self.url
        }
        heads.update(gHeads)
        savePath = "./photo/%s" % self.dir
        # exist_ok avoids the check-then-create race between worker threads
        # that target the same directory.
        os.makedirs(savePath, exist_ok=True)
        for photoUrl in urlList:
            attempts = 0
            while attempts < 5:
                print("Download Url: %s" % (photoUrl))
                html = requests.get(photoUrl, headers=heads)
                if html.status_code == 200:
                    with open(savePath + "/%s" % (self.title), "wb") as f:
                        f.write(html.content)
                    break
                elif html.status_code == 404:
                    attempts += 1
                    sleep(0.05)
                    continue
                else:
                    return None
#开始进行爬虫操作
def startSplider():
    """Crawl the PPT view pages and download every image they reference.

    Pages ``PPTView.Aspx?ID=00`` to ``ID=34`` are requested one after the
    other.  For each page answered with status 200, every ``datasrc``
    attribute of a ``div`` starts one ``JinTu`` download thread.  A slot of
    ``ThreadLock`` is acquired before each thread starts, which limits the
    number of threads running at once.

    The function returns only after all started threads have finished, so
    the downloaded files are complete when the caller goes on to process
    them.
    """
    nNum = 35
    workers = []
    for pageIndex in range(nNum):
        myid = str(pageIndex).zfill(2)
        url = "http://www.jinfutech.com/wx/pyitem/item/PPTView.Aspx?ID=%s" % (myid)
        html = requests.get(url, headers=gHeads)
        if html.status_code != 200:
            continue
        xmlContent = etree.HTML(html.content)
        # Sample attribute value:  ../Style/PPT/Images/Item01-003.JPG?id=1
        # Resulting absolute URL:  http://www.jinfutech.com/wx/pyitem/Style/PPT/Images/Item02-113.JPG?id=1
        hrefList = xmlContent.xpath("//div/@datasrc")
        for href in hrefList:
            # Blocks while the maximum number of threads is running; the
            # slot is released at the end of JinTu.run.
            ThreadLock.acquire()
            # Drop the first three characters ("../" in the sample above).
            partUrl = href[3:]
            t = JinTu(url, "http://www.jinfutech.com/wx/pyitem/" + partUrl, href)
            t.start()
            workers.append(t)
    # Wait for the downloads that are still in flight.
    for t in workers:
        t.join()
#主函数入口
if __name__ == '__main__':
    # Download the images first, then turn every image directory into a PPTX.
    startSplider()
    # os.path.join replaces the literal '\photo', which contains the invalid
    # escape sequence '\p' and a Windows-only separator.
    # NOTE(review): the downloads go to "./photo" relative to the current
    # working directory, so this matches only when the script is started from
    # its own directory -- confirm.
    lookupRootDir(os.path.join(sys.path[0], 'photo'))
import QtQuick 2.10
import QtQuick.Window 2.10
import io.thp.pyotherside 1.3 //导入qml插件 具体插件见底部说明
//http://www.jinfutech.com/wx/pyitem/item/PPTView.Aspx?ID=02
// Main window: imports the Python module "jintu" through PyOtherSide and
// starts the crawler once the module is available.
Window {
    visible: true
    width: 640
    height: 480
    title: qsTr("Hello World")

    Python {
        id: py // Python instance
        Component.onCompleted: {
            // Make Python modules located next to this QML file importable.
            addImportPath(Qt.resolvedUrl('.'));
            // 'jintu' is the Python file. Author's note: if the script is run
            // directly with python.exe, the main function in the py file has
            // to be removed.
            importModule('jintu', function (success) {
                console.log('module imported: ' + success);
                // Call into Python only after the module has been initialised.
                getCoinlist("startSplider", 0);
            });
        }
    }

    // Calls the function `functionName` of the jintu module without arguments.
    // `pageId` is currently not used.
    function getCoinlist(functionName, pageId) {
        var functionId = 'jintu.' + functionName;
        // The arguments are passed as a list ([] = no arguments), not as ''.
        py.call(functionId, [], function(result) {
            // Log the value returned by the Python function.
            console.log(functionId + ' finished: ' + result);
        });
    }
}
- 贡献
csdn-HarlanHong
寒山-居士
pyotherside-qml插件
- 愿景
- 实现国内针对QtQuick与Python结合的最新信息的更新和传播
- 扩大QtQuick在移动开发领域的知名度
- 兼容各个主流平台的开发
- 为自身的产品打下基石