# 《Python3爬虫、数据清洗和可视化实战》
# 零一 韩要宾 黄园园 著
# 第十章 综合应用实例
# 实例:按性价比给用户推荐旅游产品
# 第一部分:数据采集
import csv
import json
import random
import time
import urllib.request

import requests
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def globalVals():
    """Create the two module-level Chrome browser sessions.

    ``driver`` is used for the route search-result list and ``driver_`` for
    the route detail pages, so that opening a detail page does not navigate
    away from the list that is still being iterated.
    """
    global driver
    global driver_
    driver = webdriver.Chrome()
    driver_ = webdriver.Chrome()
def init_csv():
    """Open the output CSV file and write its header row.

    Sets the module-level ``f`` (file handle) and ``writer`` (``csv.writer``)
    that the other functions in this script use. The file is truncated if it
    already exists.
    """
    global f
    global writer
    csvFile = "D:/qunar_routes.csv"
    # If the file shows garbled text when opened, change utf-8 to gb18030.
    f = open(csvFile, "w", newline="", encoding='utf-8')
    writer = csv.writer(f)
    # Header names must match the columns the analysis step reads
    # (路线信息 / 酒店信息).
    writer.writerow(["出发地", "目的地", "路线信息", "酒店信息"])
def close_csv():
    """Close the module-level CSV file handle ``f`` opened by ``init_csv``."""
    global f
    f.close()
def _open_route_detail(url):
    """Load a route detail page in ``driver_`` and scroll it down three times."""
    driver_.get(url)
    time.sleep(random.uniform(2, 3))
    if "fhtouch" in url:
        # Flight + hotel package page.
        source_css, target_css = "#main-page", "#allHotels"
    else:
        # Plain independent-travel page.
        source_css, target_css = ".flex.scrollable", ".m-ball.m-ball-back"
    # Wait for the page to finish refreshing before touching its elements.
    WebDriverWait(driver_, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, target_css))
    )
    source = driver_.find_element(By.CSS_SELECTOR, source_css)
    target = driver_.find_element(By.CSS_SELECTOR, target_css)
    # The detail page only reacts to [Page Down] after it has received focus,
    # which is obtained here through a drag-and-drop gesture.
    ActionChains(driver_).drag_and_drop(source, target).perform()
    for _ in range(3):
        # Simulate [Page Down] to scroll; the hotel elements can only be
        # located after scrolling.
        ActionChains(driver_).send_keys(Keys.PAGE_DOWN).perform()


def _read_hotel_info():
    """Return "<rating>\\n<hotel type>" read from the page loaded in ``driver_``."""
    WebDriverWait(driver_, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".tit .score"))
    )
    rating = driver_.find_element(By.CSS_SELECTOR, ".tit .score")
    hotel_type = driver_.find_element(
        By.CSS_SELECTOR, ".tit+ .tag-list > .g-tag.solid"
    )
    return '\n'.join([rating.text, hotel_type.text])


def dump_routes_csv(dep, arr):
    """Write one CSV row per route currently listed in ``driver``.

    For every route element on the search-result page, the route's detail
    page is opened in ``driver_``, the hotel rating and hotel type are read
    from it, and ``[dep, arr, route text, hotel info]`` is written through the
    module-level ``writer``. A route whose detail page cannot be processed is
    reported on stdout and skipped.

    Args:
        dep: Departure city, written as the first CSV column.
        arr: Destination, written as the second CSV column.
    """
    # Locate every route entry on the result list.
    routes = driver.find_elements(By.CSS_SELECTOR, ".item.g-flexbox.list-item")
    for route in routes:
        try:
            route_text = route.text
            print("\nroute info:%s" % route_text)
            # URL of the route's detail page.
            url = route.get_attribute("data-url")
            print("url:%s" % url)
            if not url:
                # No detail link on this entry; nothing to open.
                continue
            # Open the detail page in the second browser so the list page
            # (and the `routes` elements) stay valid.
            _open_route_detail(url)
            hotel = _read_hotel_info()
            print("hotel info:%s" % hotel)
            # Persist this route.
            writer.writerow([dep, arr, route_text, hotel])
        except Exception as e:
            # Best effort: report the failure and move on to the next route.
            print(str(e))
            continue
if __name__ == "__main__":
    # Script entry point: start the browsers, open the CSV, crawl the
    # independent-travel routes for each departure city, then release
    # everything.
    globalVals()
    init_csv()
    try:
        dep_cities = ["杭州"]
        for dep in dep_cities:
            # Ask the mobile API for the recommended destinations of `dep`.
            strhtml = requests.get(
                'https://m.dujia.qunar.com/golfz/sight/arriveRecommend?dep='
                + urllib.request.quote(dep)
                + '&exclude=&extensionImg=255,175'
            )
            arrive_dict = json.loads(strhtml.text)
            for arr_item in arrive_dict['data']:
                # Only domestic routes are crawled in this example; remove
                # the next two lines to include international routes too.
                if arr_item['title'] != "国内":
                    continue
                for arr_item_1 in arr_item['subModules']:
                    for query in arr_item_1['items']:
                        # Only the 杭州-丽江 routes are crawled in this
                        # example; remove the next two lines to crawl every
                        # destination.
                        if query['query'] != "丽江":
                            continue
                        # Open the mobile search-result page. The trailing
                        # percent-escapes are "自由行" appended to the query.
                        driver.get(
                            "https://touch.dujia.qunar.com/p/list?cfrom=zyx&dep="
                            + urllib.request.quote(dep)
                            + "&query="
                            + urllib.request.quote(query['query'])
                            + "%e8%87%aa%e7%94%b1%e8%a1%8c&it=n_index_free"
                        )
                        try:
                            # Wait for the result list to render. A CSS
                            # selector is used because By.CLASS_NAME does not
                            # accept several space-separated class names.
                            WebDriverWait(driver, 10).until(
                                EC.presence_of_element_located(
                                    (By.CSS_SELECTOR, ".item.g-flexbox.list-item")
                                )
                            )
                        except Exception as e:
                            print(str(e))
                            raise
                        print("dep:%s arr:%s" % (dep, query["query"]))
                        # Scroll down 50 times so more routes are loaded.
                        for i in range(50):
                            time.sleep(random.uniform(2, 3))
                            print("page %d" % (i + 1))
                            # Simulate [Page Down] to scroll the list.
                            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
                        # Write this departure-destination pair's routes to
                        # the CSV file.
                        dump_routes_csv(dep, query["query"])
    finally:
        # Release the file and both browser sessions even if crawling failed.
        close_csv()
        driver.quit()
        driver_.quit()
# 第二部分:数据清洗、建模
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm

# Load the route records written by the scraping step.
df = pd.read_csv("D:/qunar_routes.csv")
print(df.head())
df.info()  # info() prints its report itself and returns None

# Extract trip length (days) and price from the route description text.
# expand=False makes str.extract return a Series for a single capture group.
df["天数"] = pd.to_numeric(df["路线信息"].str.extract(r'(\d+)天\d+晚', expand=False))
df["价格"] = pd.to_numeric(df["路线信息"].str.extract(r'(\d+)起/人', expand=False))
# Extract hotel rating and hotel class from the hotel description text.
df["酒店评分"] = pd.to_numeric(df["酒店信息"].str.extract(r'(\d\.\d)分', expand=False))
df["酒店等级"] = df["酒店信息"].str.extract('\n(.*)', expand=False)
print(df.head())
df.info()

# Map the hotel class from text to an ordinal number.
class_map = {"其他": 0, "经济型": 1, "舒适型": 2, "高档型": 3, "豪华型": 4}
df["酒店等级"] = df["酒店等级"].map(class_map)

# Rows where any model variable could not be extracted or mapped cannot be
# fitted or scored, so they are left out of the analysis.
feature_cols = ["天数", "酒店评分", "酒店等级"]
df = df.dropna(subset=feature_cols + ["价格"]).copy()

# Plot histograms of the variables to check for outliers.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
df["酒店等级"].plot(ax=axes[0], kind='hist', title="酒店等级")
df["酒店评分"].plot(ax=axes[1], kind='hist', title="酒店评分")
df["价格"].plot(ax=axes[2], kind='hist', title="价格")

# Independent variables X and dependent variable y. Columns are selected by
# name so that price is the target, as the prediction step below requires.
X, y = df[feature_cols].values, df["价格"].values
# Fit an OLS linear regression model.
ols = sm.OLS(y, X)
result = ols.fit()
# Inspect the fit (the source text reports R=0.886 for its data set).
print(result.summary())
# Use the fitted model to predict each route's price.
y_pred = result.predict(X)
# Value for money is defined as predicted price divided by actual price.
ratio = y_pred / y
df["性价比"] = ratio
# List the routes from best to worst value for money.
print(df.sort_values("性价比", ascending=False))
# "Don't waste your time looking back, you're not going that way." --《Vikings》