The target site does not require login and has no anti-scraping measures, so the whole process is simple.
First install the required packages:
pip install requests
pip install beautifulsoup4
pip install xlwt
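Once the packages are installed, a minimal sketch like the one below (the URL is only a placeholder, not the actual target site) confirms that requests and BeautifulSoup work together before building the full crawler:
import requests
from bs4 import BeautifulSoup

resp = requests.get('https://example.com')  # placeholder URL, not the real target site
soup = BeautifulSoup(resp.text, 'html.parser')
print(resp.status_code)                     # 200 means the page was fetched
print(soup.title.text if soup.title else 'no title')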
The implementation class, GetInfo.py:
import requests
from bs4 import BeautifulSoup
import xlwt

# Product information entity class
class product_info(object):
    serios = ''              # product series
    productActualPrice = ''  # actual (deal) price
    productOldPrice = ''     # list (face) price
    detailString = ''        # product detail text
    productCategory = ''     # product category
    productName = ''         # product name
    productTypeNum = ''      # product model number
    productFactory = ''      # manufacturer
'''
The class that does the actual downloading
'''
class downloader(object):
    def __init__(self):
        self.server = ''
        self.target = ''
        self.pageUrls = []     # URLs of the individual listing pages
        self.productUrls = []  # URLs of the individual products
        self.productInfo = []  # product info list, used when saving to Excel
    '''
    Initialize serverUrl and targetUrl
    '''
    def init(self, serverUrl, targetUrl):
        self.server = serverUrl
        self.target = targetUrl
    '''
    Collect the URLs of all listing pages
    '''
    def get_page_urls(self):
        req = requests.get(url=self.target)
        self.pageUrls.append(self.target)
        html = req.text
        div_bf = BeautifulSoup(html, 'html.parser')
        a = div_bf.find_all('div', class_='m-pagination')[0].find_all('a')
        for each in a:
            # keep only the numbered page links; skip 首页/上一页/下一页/末页 (first/prev/next/last)
            if each.text != '下一页' and each.text != '末页' and each.text != '上一页' and each.text != '首页':
                self.pageUrls.append(self.server + each.get('href'))
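    # For reference, this method assumes a pagination block roughly like the following
    # (an assumed structure; the real markup of the target site is not shown in this post):
    #   <div class="m-pagination">
    #     <a href="/list?page=1">1</a> <a href="/list?page=2">2</a> <a href="...">下一页</a>
    #   </div>
    # Only the numbered links are appended to self.pageUrls.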
    '''
    Collect the URL of every product
    '''
    def get_prodect_urls(self):
        for item in self.pageUrls:
            req = requests.get(url=item)
            html = req.text
            bf = BeautifulSoup(html, 'html.parser')
            # the CSS class names are redacted as "。。。" in this post; fill in the real ones from the target site
            imageDivs = bf.find('div', id="goodsList", class_="。。。").find_all(class_="。。。")
            for div in imageDivs:
                temp = div.find_all('a')[0].get('href')
                self.productUrls.append(self.server + temp)
    '''
    Fetch the detail page of every product
    '''
    def get_contents(self, productList):
        print('productList length: %d' % len(productList))
        i = 0
        for targetUrl in self.productUrls:
            i += 1
            if i % 5 == 0:
                progress = i / len(self.productUrls) * 100
                print('crawl progress: %f%%' % progress)
            req = requests.get(url=targetUrl)
            html = req.text
            # soup object for the whole page
            soup = BeautifulSoup(html, 'html.parser')
            # header section of the page (class names redacted, as above)
            headInfo = soup.find('div', class_='。。。')
            productName = headInfo.find_all('h1', class_="。。。")[0]
            if productName is not None:
                productName = productName.text.strip().replace('\n', '')
            productActualPrice = headInfo.find('tr', class_="。。。").find(
                class_="。。。").find('span')
            if productActualPrice is not None:
                productActualPrice = productActualPrice.text.strip().replace('\n', '')
            productTypeNum = headInfo.find('table', class_="。。。").find_all('tr')[1].find('td')
            if productTypeNum is not None:
                productTypeNum = productTypeNum.text.strip().replace('\n', '')
            # productWeight = headInfo.find('table', class_="。。。").find_all('tr')[2].find('td').text.strip().replace('\n', '')
            # print(productTypeNum)
            # print(productWeight)
            # the parsing of the detail table, series and manufacturer is not shown in the original post,
            # so these fields are left empty here
            detailTable = []
            serios = ''
            detailString = ''
            productFactory = ''
            # store the scraped data
            good = product_info()
            good.serios = serios
            good.productActualPrice = productActualPrice
            good.detailString = detailString
            good.productCategory = '电气'  # category is hard-coded
            good.productName = productName
            good.productTypeNum = productTypeNum
            good.productFactory = productFactory
            # product_info does not define __eq__, so this only skips the very same object instance
            if good not in productList:
                productList.append(good)
    '''
    Save the results to an Excel file
    '''
    def writer(self, productList):
        print('start writing to Excel')
        workbook = xlwt.Workbook(encoding='utf-8')
        sheet = workbook.add_sheet('ProdectInfo')
        # header row: category, series, name, model, description, manufacturer, actual price
        head = ['产品类目', '系列', '产品名称', '型号', '产品描述', '厂家', '产品成交价']
        for h in range(len(head)):
            sheet.write(0, h, head[h])
        i = 1
        for product in productList:
            # column indexes line up with the header row above
            sheet.write(i, 0, product.productCategory)
            sheet.write(i, 1, product.serios)
            sheet.write(i, 2, product.productName)
            sheet.write(i, 3, product.productTypeNum)
            sheet.write(i, 4, product.detailString)
            sheet.write(i, 5, product.productFactory)
            sheet.write(i, 6, product.productActualPrice)
            i += 1
        workbook.save('C:/Users/Desktop/.....xls')  # the output path is truncated in the original post
        print('finished writing to Excel')
if __name__ == "__main__":
    # all scraped product info
    productList = []
    # all start URLs to crawl (truncated in the original post)
    urlList = []
    urlList.append('https://www....')
    urlList.append('https://www.....')
    # de-duplicate the URLs to avoid scraping the same data twice
    news_ids = []
    for item in urlList:
        if item not in news_ids:
            news_ids.append(item)
    i = 0
    for item in news_ids:
        dl = downloader()
        i += 1
        print('start crawling URL #%d' % i)
        dl.init('https://www....', item)
        dl.get_page_urls()
        dl.get_prodect_urls()
        dl.get_contents(productList)
        dl.writer(productList)
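Before running the script, fill in the CSS class names that are redacted as "。。。", the real site URLs, and the Excel output path, then run it with:
python GetInfo.py
If you want to sanity-check the generated .xls file afterwards, a minimal sketch using xlrd (an extra package, installed with pip install xlrd, not used by GetInfo.py itself) could look like this:
import xlrd

workbook = xlrd.open_workbook('C:/Users/Desktop/.....xls')  # same (truncated) path as in writer()
sheet = workbook.sheet_by_index(0)
print(sheet.nrows, 'rows written')
print(sheet.row_values(0))  # the header row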