Every month I read a book and write a review of it. The illustrations for those reviews used to be pictures found online, and a while ago I decided that was not flashy enough and wanted to build something cooler. My first idea was to extract the most frequent words from the novel itself and turn them into a word cloud, but in practice the result was disappointing: there were too few compelling words to bring out what makes the book distinctive. So I tried crawling the book's reviews instead and extracting keywords from those, and that experiment worked well.
With the idea settled, the next question was implementation. I write Java for a living, and Java has no good word-cloud tooling, so Python was the natural choice. I am not very familiar with Python and hit quite a few pitfalls finishing this little program, so I am posting the code here; if you run into similar problems, feel free to borrow the approach.
The result looks like this:
Technical points
- Basic Python syntax
- Web crawling
- Thread pools
- Image generation
- The wordcloud library (see the sketch after this list)
- Chinese word segmentation
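The last two points are the heart of the tool, so here is a minimal standalone sketch of how they combine: jieba segments the Chinese text, and wordcloud renders the space-separated tokens. The sample text and output filename are placeholders, and the font path simply reuses the macOS PingFang font from the full script below; substitute your own.

# Minimal sketch: jieba segmentation feeding the wordcloud library.
# sampleText and 'demo.png' are placeholders, not part of the real script.
import jieba
from wordcloud import WordCloud

sampleText = '这是一段用来演示中文分词和词云的示例文本'
tokens = ' '.join(jieba.cut(sampleText))  # wordcloud expects space-separated tokens
cloud = WordCloud(font_path='/System/Library/fonts/PingFang.ttc',  # any CJK-capable font works
                  background_color='white').generate_from_text(tokens)
cloud.to_file('demo.png')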
Code
#The imported packages need to be installed first; pip install is all it takes:
#pip install requests beautifulsoup4 jieba wordcloud matplotlib numpy pillow
import json
import re
from urllib import parse
import jieba
import matplotlib
import numpy as np
import requests
from bs4 import BeautifulSoup as bf
from wordcloud import WordCloud,STOPWORDS
from PIL import Image, ImageDraw, ImageFont
import os
import concurrent.futures
matplotlib.use('agg')
import matplotlib.pyplot as plt
#Name of the book whose reviews will be crawled
bookName = '悲剧人偶'
#Author; leave empty to match any author
author = ''
bookSearchUrl = 'https://book.douban.com/j/subject_suggest?q=%s'
commentUrl = '%s/reviews?start=%s'
FULLCOMMENTURL = 'https://book.douban.com/j/review/%s/full'
#Chinese will not render unless a font is specified explicitly.
#This font is used for the words inside the cloud; not every font supports Chinese, so choose carefully.
commentFont = '/System/Library/fonts/PingFang.ttc'
#Font used to draw the book title that becomes the cloud's contour
profileFont = "/Users/daiwenkai/Library/Fonts/RuiZiYunZiKuPangTouYuTiGBK-1.ttf"
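#Fetch the full text of a single review from Douban's JSON endpoint and strip the HTML tags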
def crawlCommentDetail(commentId):
    fullCommentContent = ''
    try:
        fullComment = requests.get(FULLCOMMENTURL % commentId, cookies=cookies, headers=headers)
        fullCommentJson = json.loads(fullComment.text)
        fullCommentContent = striphtml(fullCommentJson['html'])
    except Exception as exc:
        print('crawlCommentDetail: something went wrong {}'.format(exc))
    return fullCommentContent
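#Collect the review ids (the data-cid attributes) from one page of the review list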
def crawlCommentInfo(i):
    ids = []
    commentContent = requests.get(commentUrl % (bookUrl, i), cookies=cookies, headers=headers)
    commentContentBf = bf(commentContent.text, "html.parser")
    shortCommentLists = commentContentBf.find_all("div", {"data-cid": True})
    for shortComment in shortCommentLists:
        ids.append(shortComment["data-cid"])
    return ids
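#Strip HTML tags with a non-greedy regex, keeping only the text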
def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)
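#---- main flow: look up the book, gather review ids, fetch the full reviews, render the cloud ----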
bookSearchUrl = bookSearchUrl % parse.quote(bookName)
#While crawling I ran into Douban's bot checks; sending a cookie and headers gets past them.
#Fill these in with the Cookie and User-Agent values from your own logged-in browser session.
cookies = {}
headers = {}
#Search for the book by name to get the URL of its detail page
bookNameList = requests.get(bookSearchUrl,cookies=cookies,headers=headers)
bookNameListJson = json.loads(bookNameList.text)
bookUrl = ''
for bookInfo in bookNameListJson:
    if bookName.lower() in bookInfo['title'].lower() and author.lower() in bookInfo['author_name'].lower():
        bookUrl = bookInfo['url']
        bookName = bookInfo['title'].lower()
        break
if bookUrl:
    print('Fetched the book info successfully!')
else:
    print('No matching book found')
    os._exit(0)
#Visit the book's detail page, mainly to find the URL of its review section
bookinfoHtmlContent = requests.get(bookUrl,cookies=cookies,headers=headers)
bookinfoHtmlContentBf = bf(bookinfoHtmlContent.text, "html.parser")
commentUrlSuffix = bookinfoHtmlContentBf.find_all("p", class_="pl")
#Extract the review-list URL
try:
    commentUrlSuffix = commentUrlSuffix[0].a.get('href')
except Exception as exc:
    print('Failed to locate the review section!')
    os._exit(0)
commentContent = requests.get(bookUrl + "/" + commentUrlSuffix,cookies=cookies,headers=headers)
if commentContent:
    print('Fetched the review page successfully!')
commentContentBf = bf(commentContent.text, "html.parser")
#Get the total number of reviews
total = commentContentBf.find("span",{"class":"count"})
total = re.findall(r"\d+",total.string)[0]
commentDetailIds = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    #Submit one task per page of the review list; Douban paginates reviews 20 per page
    commentId = {executor.submit(crawlCommentInfo, i): i for i in range(0, int(total), 20)}
    for future in concurrent.futures.as_completed(commentId):
        commentDetailId = future.result()
        commentDetailIds.extend(commentDetailId)
print('Finished collecting the review ids!')
allContent = ''
#Crawl the full review texts with a thread pool;
#the with statement ensures the threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    #Submit one task per review id
    fullCommentContent = {executor.submit(crawlCommentDetail, commentId): commentId for commentId in commentDetailIds}
    for future in concurrent.futures.as_completed(fullCommentContent):
        try:
            content = future.result()
            allContent += content
        except Exception as exc:
            print('something went wrong {}'.format(exc))
#Generate the image that serves as the word cloud's contour (mask).
#The canvas is 400 px wide per character of the title and 600 px tall
img = Image.new('RGB', (400*len(bookName), 600), color=(255,255,255))
#Load the contour font at size 400
fnt = ImageFont.truetype(profileFont, 400)
d = ImageDraw.Draw(img)
#Draw the book title in black on the white canvas; the lettering becomes the mask shape
d.text((0, 100), bookName, font=fnt, fill=(0, 0, 0))
#Segment the review text with jieba; wordcloud expects whitespace-separated tokens
str_list = jieba.cut(allContent, HMM=True)
outstr = ''
for word in str_list:
    outstr += word
    outstr += ' '
mask = np.array(img)
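#In a wordcloud mask, pure-white pixels are masked out, so words land only inside the black lettering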
#Here I load a stopwords list downloaded from Baidu. It filters out meaningless prepositions and conjunctions, which makes the words in the cloud more compelling. Using it is optional.
myStopWords = [line.strip() for line in open('stopwords.txt', 'r', encoding='utf-8').readlines()]
stopwords = set(STOPWORDS)
stopwords.add("nbsp")
stopwords |= set(myStopWords)
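#max_words scales with the title length so larger masks get proportionally more words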
wordcloud = WordCloud(stopwords=stopwords, font_path=commentFont, background_color="white",
                      max_words=len(bookName) * 120, mask=mask, contour_width=1,
                      contour_color='green', height=800, width=1000).generate_from_text(outstr)
#interpolation="bilinear" makes the displayed image look smoother
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
wordcloud.to_file(bookName + ".png")
I will keep improving this little tool; the end goal is to package it as a mini program so that everyone can use it.