改一下用户名和密码可以直接用(要开通VIP才能抓VIP的章节),代码如下:
# -*- coding: utf-8 -*-
# @Time : 2019/5/19 17:53
# @Author : LM
import requests
from bs4 import BeautifulSoup
import json
import os
class Spider(object):
    """Scraper for novels hosted on 17k.com.

    Workflow: ``catalogue()`` finds the chapter-list URL by exact title,
    ``get_download_url()`` collects every chapter link plus its VIP flag,
    and ``download_content()`` writes all chapters to ``<novel>.txt``.
    VIP chapters require a valid account (``loginName`` / ``password``).
    """

    def __init__(self):
        # Desktop Chrome UA so the site serves the normal HTML pages.
        self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
        print('请输入要爬取的小说名')
        self.novel_name = input()
        self.chapters_url = ''        # URL of the chapter-list page, set by catalogue()
        self.download_url_ls = {}     # chapter href -> VIP CSS class ('' means free)
        self.loginName = ''           # fill in before running to access VIP chapters
        self.password = ''            # fill in before running to access VIP chapters

    def login(self):
        """Log in to 17k.com.

        Returns:
            requests.Session: an authenticated session on success,
            or ``None`` when the credentials are rejected.
        """
        data = {
            'loginName': self.loginName,
            'password': self.password,
        }
        login_url = 'https://passport.17k.com/ck/user/login'
        session = requests.Session()
        html = session.post(url=login_url, data=data, headers=self.headers).text
        json_data = json.loads(html)
        if json_data['status']['msg'] == 'succ':
            print('登陆成功,当前账户:{}'.format(self.loginName))
            return session
        print('用户名或者密码错误')
        # Explicit None so callers can (and must) check before VIP requests.
        return None

    def catalogue(self):
        """Search the site for the novel by exact title.

        On a match, stores the chapter-list URL in ``self.chapters_url``;
        otherwise prints a failure message.
        """
        url = 'https://search.17k.com/search.xhtml?c.st=0&c.q=' + self.novel_name
        html = requests.get(url=url, headers=self.headers).text
        soup = BeautifulSoup(html, 'lxml')
        res = soup.find_all(attrs={'class': 'textmiddle'})
        for i in res:
            info = i.select(' dl > dt > a')[0]
            searched_name = info.get_text().strip()
            if searched_name == self.novel_name:
                print('{}:查找成功'.format(self.novel_name))
                # href looks like //www.17k.com/book/<id>.html; swap 'book'
                # for 'list' to get the chapter-list page.
                self.chapters_url = 'https://' + info.get('href')[2:].replace('book', 'list')
                # BUG FIX: return here — the original `break` fell through to
                # the failure print below even after a successful match.
                return
        print('查找失败,不存在该小说或拼写错误')

    def get_download_url(self):
        """Collect every chapter URL and its VIP marker into ``download_url_ls``."""
        html = requests.get(url=self.chapters_url, headers=self.headers)
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text, 'lxml')
        for volume in soup.find_all('dl', class_='Volume'):
            # A bs4 Tag supports find_all directly — no need to re-parse
            # str(volume) through BeautifulSoup as the original did.
            for each_chapter in volume.find_all('a', target='_blank'):
                download_url = each_chapter.get('href').strip()
                # Second CSS class of the chapter's first <span> flags VIP
                # status; free chapters carry an empty string.
                is_vip = each_chapter.find_all('span')[0].get('class')[1]
                self.download_url_ls[download_url] = is_vip

    def download_content(self):
        """Download every collected chapter and append it to ``<novel>.txt``.

        Free chapters are scraped from the HTML reader page; VIP chapters
        are fetched through the JSON API using the logged-in session.
        """
        session = self.login()
        out_path = './{}.txt'.format(self.novel_name)
        if os.path.exists(out_path):
            os.remove(out_path)
            print('小说存在,已删除')
        print('开始爬取小说:{}'.format(self.novel_name))
        # Iterate items() directly instead of keys() + per-key lookup.
        for u, vip_flag in self.download_url_ls.items():
            if vip_flag == '':
                # Free chapter: plain HTML reader page.
                url = 'https://www.17k.com' + u
                html = requests.get(url=url, headers=self.headers)
                html.encoding = 'utf-8'
                soup = BeautifulSoup(html.text, 'lxml')
                read_area = soup.find_all('div', class_='readAreaBox content')[0]
                title = read_area.select('h1')[0].get_text()
                print('正在爬取章节:{}'.format(title))
                content = ''
                for c in read_area.select('p'):
                    content += ' ' + c.get_text() + '\n'
            else:
                # VIP chapter: JSON API, requires an authenticated session.
                if session is None:
                    # BUG FIX: the original crashed with AttributeError when
                    # login failed (session was None); skip VIP chapters instead.
                    print('未登录,跳过VIP章节:{}'.format(u))
                    continue
                book_id = u.split('/')[2]
                chapter_id = u.split('/')[3].split('.')[0]
                url = 'https://www.17k.com/ck/book/{}/chapter/{}?subAllPrice=1&appKey=2406394919'.format(book_id, chapter_id)
                html = session.get(url=url, headers=self.headers).text
                json_data = json.loads(html)
                title = json_data['data']['name']
                print('正在爬取章节:{}'.format(title))
                content = ' ' + json_data['data']['content'][0]['text'].replace('\r', '')
            with open(out_path, 'a', encoding='utf-8') as f:
                f.write(title + '\n\n')
                f.write(content)
                f.write('\n\n')
        print('爬取完成。')
if __name__ == "__main__":
    # Run the full pipeline: search -> collect chapter URLs -> download.
    spider = Spider()
    spider.catalogue()
    spider.get_download_url()
    spider.download_content()