# -*- coding: utf-8 -*-
# Scrape a novel with BeautifulSoup and merge all chapters into one txt file.
"""
======================
@Auther:CacheYu
@Time:2019/9/16:16:09
======================
"""
# -*- coding:utf-8 -*-
import urllib.request
import urllib.error
import bs4
from bs4 import BeautifulSoup
def readdown(url):
    """Fetch one chapter page and return its title plus body text.

    Parameters
    ----------
    url : str
        Absolute URL of a single chapter page.

    Returns
    -------
    str
        The chapter title, a newline, then the chapter body text.
    """
    # Use the response as a context manager so the socket is closed
    # promptly (the original leaked the open HTTP response).
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')
    table = soup.find('table', attrs={'id': 'tabletxt'})
    # The <i> tag inside the chapter table carries the chapter title.
    title = table.find('i').string
    print(title)  # progress indicator while scraping
    # Chapter body lives in the first <div class="txt"> within the table.
    div = table.find_all('div', attrs={'class': 'txt'})
    content = div[0].get_text().strip()
    return title + '\n' + content
# --- Script body: scrape the chapter index, then download every chapter ---
page_url = 'https://www.dushiyanqing.net/book/90/90659/index.html'
book = r'E:\story\谁把风声听成离别歌.txt'
default_encode = 'utf-8'

# Parse the index page once; close the HTTP response when parsing is done.
with urllib.request.urlopen(page_url) as response:
    soup = BeautifulSoup(response, 'html.parser')

table = soup.find('table')
if isinstance(table, bs4.element.Tag):
    # Each chapter link sits inside a <td class="k4"> cell of the index table.
    tds = table.find_all('td', attrs={'class': 'k4'})
    print('开始写入,请稍等……')
    # Mode 'w' creates the file if missing and truncates stale content;
    # the original 'r+' raised FileNotFoundError on a first run and left
    # trailing bytes from any previous, longer file.
    with open(book, 'w', encoding=default_encode) as target_file_writer:
        for td in tds:
            a = td.find('a')
            if a is not None:
                # hrefs on the index page are site-relative; prefix the host.
                href = 'https://www.dushiyanqing.net' + a.get('href')
                target_file_writer.write(readdown(href))
    print('已完成!\n目录地址为:', book)