使用 Python 编写爬虫时，借助 BeautifulSoup 可以轻松解析页面内容。下面的示例使用 BeautifulSoup 爬取 Boss 直聘页面上的职位信息，包括职位名称、薪资、工作地点、公司名称、公司融资情况等；通过这个示例可以直观地了解 BeautifulSoup 的使用方法。
1、爬取boss直聘的职位信息
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from middlewares import get_random_proxy,get_random_agent
class Boss_Spider(object):
    """Crawler that walks the zhipin.com job menu and prints each posting.

    ``Parse_pre`` reads the home page, collects every job link found under
    the ``menu-sub`` menus and hands the first ``page`` result pages of each
    link to ``Parse_index``, which prints one line per job posting.
    """

    BASE_URL = 'https://www.zhipin.com/'
    # Seconds to wait for a response; without a timeout a stalled
    # connection would block the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, page=3):
        """Create a spider.

        Args:
            page: Number of result pages to crawl for each job link.
        """
        self.proxies = []
        self.verify_pro = []
        self.page = page
        self.headers = {}

    def _fetch(self, url):
        """GET *url* with a random User-Agent and proxy and return the response."""
        headers = get_random_agent()
        # NOTE(review): get_random_proxy() is assumed to return a mapping
        # suitable for requests' ``proxies=`` argument -- confirm in middlewares.
        proxy = get_random_proxy()
        time.sleep(1)  # throttle so the site is not hit in a tight loop
        return requests.get(
            url,
            headers=headers,
            proxies=proxy,
            timeout=self.REQUEST_TIMEOUT,
        )

    def _page_urls(self, job_uri):
        """Return the URLs of result pages 1..``self.page`` for *job_uri*.

        ``urljoin`` is used so that a root-relative href such as ``/c1-p2/``
        does not produce a double slash after the host name.
        """
        listing_url = urljoin(self.BASE_URL, job_uri)
        return [
            listing_url + '?page=%d&ka=page-%d' % (number, number)
            for number in range(1, self.page + 1)
        ]

    @staticmethod
    def _job_fields(li):
        """Extract the printable fields of one job ``<li>``.

        Returns:
            A tuple ``(position, salary, add, need, company_name, tag)`` of
            strings, or ``None`` when *li* lacks the title, salary, info
            paragraph or company block and therefore is not a job entry.
        """
        title = li.find(class_='job-title')
        pay = li.find(class_='red')
        info = li.find('p')
        company = li.find(class_='company-text')
        if any(node is None for node in (title, pay, info, company)):
            return None

        separator = info.find('em')
        company_link = company.find('a')
        company_info = company.find('p')
        return (
            title.get_text(),
            pay.get_text(),
            info.get_text(),
            separator.get_text() if separator is not None else '',
            company_link.get_text() if company_link is not None else '',
            # Printed as text (not as a raw tag) so the line stays readable.
            company_info.get_text() if company_info is not None else '',
        )

    # Step 1: collect every job link from the home page
    def Parse_pre(self):
        """Read the home page and crawl the result pages of every job link."""
        resp = self._fetch(self.BASE_URL)
        if resp.status_code != 200:
            return

        soup = BeautifulSoup(resp.text, 'lxml')
        for job_menu in soup.find_all(class_='menu-sub'):
            for li in job_menu.find_all('li'):
                heading = li.find('h4')
                job_type = heading.get_text() if heading is not None else ''
                for link in li.find_all('a'):
                    job_uri = link.get('href')
                    if not job_uri:
                        continue  # anchor without a target: nothing to crawl
                    meta = {
                        'job_type': job_type,
                        'job_sub': link.get_text(),
                    }
                    # Each page is downloaded once, inside Parse_index.
                    for job_url in self._page_urls(job_uri):
                        self.Parse_index(meta=meta, url=job_url)

    # Step 2: scrape one result page
    def Parse_index(self, meta, url):
        """Print every job posting found on the result page at *url*.

        Args:
            meta: Dict with ``job_type`` and ``job_sub`` describing the
                category the page belongs to (currently not printed).
            url: Address of the result page to download.
        """
        resp = self._fetch(url)
        if resp.status_code != 200:
            return

        soup = BeautifulSoup(resp.text, 'lxml')
        job_list = soup.find(class_='job-list')
        if job_list is None:
            # The page has no job list; skip it instead of crashing.
            return

        for li in job_list.find_all('li'):
            fields = self._job_fields(li)
            if fields is None:
                continue
            position, salary, add, need, company_name, tag = fields
            print('###########')
            print(position, "$$$", salary, "$$$", add, "$$$", need, "$$$", company_name, "$$$", tag)
if __name__ == '__main__':
    # Script entry point: build a spider with the default settings and
    # start crawling from the home page.
    b = Boss_Spider()
    b.Parse_pre()
运行输出结果如下:
后端开发 $$$ 15-30K $$$ 北京 朝阳区 朝外3-5年本科 $$$ $$$ 米花互动 $$$ 游戏不需要融资20-99人
###########
后端开发工程师 $$$ 35-55K $$$ 北京 朝阳区 望京经验不限本科 $$$ $$$ 云账户 $$$ 移动互联网C轮100-499人
###########
2、爬取豆瓣网图书前250信息
import requests
from bs4 import BeautifulSoup
# Download a page and return its HTML source
def get_html(url):
    """Return the response body of a GET request to *url* as text.

    The request carries a desktop-browser User-Agent header so that it
    looks like an ordinary browser visit.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    }
    response = requests.get(url, headers=browser_headers)
    return response.text
def _node_text(node):
    """Return the text of a parsed node, or '' when the node is missing."""
    return node.get_text() if node is not None else ''


# Parse every listing page and write one record per book
def html_parse():
    """Scrape each page from ``all_page()`` and write its books to ``f``.

    For every book a running number, the title, the author line, the rating
    and the one-line summary are written to the module-level file object
    ``f`` (which must already be open for writing), followed by a separator
    line.

    The fields are looked up per book instead of zipping four page-wide
    lists: when a book lacks a field (for example it has no summary), it
    gets an empty value rather than shifting the data of every following
    book and silently dropping the last entries of the page.
    """
    index = 1
    for url in all_page():
        soup = BeautifulSoup(get_html(url), 'lxml')
        for title_div in soup.find_all('div', class_='pl2'):
            # NOTE(review): assumes a book's author, rating and summary sit
            # in the same table row as its div.pl2 -- confirm against the
            # live markup.
            row = title_div.find_parent('tr')
            if row is None:
                row = title_div.parent

            link = title_div.find('a')
            name = link.get('title', '') if link is not None else ''
            author = _node_text(row.find('p', class_='pl'))
            score = _node_text(row.find('span', class_='rating_nums'))
            summary = _node_text(row.find('span', class_='inq'))

            record = ''.join([
                str(index) + '\n',
                '书名:' + str(name) + '\n',
                '作者:' + str(author) + '\n',
                '评分:' + str(score) + '\n',
                '简介:' + str(summary) + '\n',
            ])
            index += 1
            # Save the record
            f.write(record + '=======================' + '\n')
# Build the list of all listing-page URLs
def all_page(total=250, page_size=25):
    """Return the URL of every Douban book Top-250 listing page, in rank order.

    The URLs are generated from the ``start`` offset instead of being
    scraped from the paginator of one page. This needs no extra request,
    cannot fail on a missing paginator, and never yields duplicate or
    out-of-order pages.

    NOTE(review): assumes the listing is paginated as ``?start=<offset>``
    in steps of ``page_size`` (25) -- confirm against the live site.

    Args:
        total: Number of ranked books to cover.
        page_size: Number of books shown on one listing page.

    Returns:
        List of page URLs, from the first page to the last.
    """
    base = 'https://book.douban.com/top250'
    urllist = ['%s?start=%d' % (base, offset) for offset in range(0, total, page_size)]
    print(urllist)
    return urllist
# Output file name
filename = '豆瓣图书Top250.txt'
# html_parse() writes through the module-level name `f`. The `with` block
# makes sure the file is flushed and closed even if scraping raises.
with open(filename, 'w', encoding='utf-8') as f:
    html_parse()
print('保存成功。')
3、