使用scrapy抓取BLAH全部EPUB格式书籍
- 作者: so糖果果
- 来源: 51数据库
- 2022-08-12
# -*- coding:utf-8 -*-
__author__ = 'Kiun'
import scrapy
from scrapy.selector import Selector
from scrapy.contrib.loader import ItemLoader, Identity
from sys import argv
import requests
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding reachable again so
# the interpreter-wide implicit str/unicode codec can be switched to UTF-8.
# NOTE(review): presumably needed so non-ASCII book titles can be written to
# files and printed without explicit encoding -- confirm before removing.
reload(sys)
sys.setdefaultencoding('utf8')
from scrapy import log
# Emits one warning-level message through scrapy's log module at import time.
log.msg("This is a warning", level=log.WARNING)
class NovelSpider(scrapy.Spider):
    """Crawl blah.me listing pages and download every EPUB book they link to.

    For each listing page the spider appends one ``title:url`` line per book
    to ``index_path`` and streams the book to ``<title>.epub`` in the current
    working directory.
    """

    name = "novel"
    allowed_domains = ["blah.me"]
    start_urls = [
        "http://blah.me/"
    ]

    # Prefix for the site-relative hrefs found on the listing pages.
    base_url = 'http://blah.me'
    # File that accumulates one "title:url" line per discovered book.
    index_path = '/caonima.txt'
    # Seconds to wait on the server before giving up on a single download.
    epub_timeout = 60

    def parse(self, response):
        """Index and download the EPUBs on one listing page, then paginate.

        Args:
            response: the scrapy response for a listing page.

        Yields:
            scrapy.Request objects for listing pages 2 through 99, each
            handled by this same callback.
        """
        sel = Selector(response)

        # Read the title and href from the SAME anchor element so they can
        # never get out of step with each other (the page-wide lists paired
        # by a running index could misalign or raise IndexError).
        books = []
        for anchor in sel.xpath("//a[@data-book-type='epub']"):
            hrefs = anchor.xpath("@href").extract()
            titles = anchor.xpath("@data-book-title").extract()
            if not hrefs or not titles:
                continue
            books.append((titles[0].strip(), self.base_url + hrefs[0]))

        if books:
            # One open per page instead of one per book.
            with open(self.index_path, 'a') as index:
                for title, url in books:
                    index.write(title + ':' + url + '\n')

        # Each book is fetched exactly once per page.
        # NOTE: requests is blocking, so scrapy's event loop stalls while a
        # download is in progress.
        for title, url in books:
            self._download_epub(title, url)

        # Every page re-yields the same pagination requests; this relies on
        # scrapy's duplicate-request filtering to fetch each page only once.
        for page in range(2, 100):
            yield scrapy.Request("http://blah.me/?p=" + str(page), callback=self.parse)

    def _download_epub(self, title, url):
        """Stream one EPUB to ``<title>.epub``; return True on success.

        Args:
            title: book title, used for the filename and progress messages.
            url: absolute URL of the EPUB file.

        Returns:
            True when the file was written completely, False when the request
            failed or the server answered with an error status.
        """
        # Titles come from the remote page, so keep path separators out of
        # the filename to stay inside the working directory.
        filename = title.replace('/', '_').replace('\\', '_') + '.epub'
        try:
            reply = requests.get(url, stream=True, timeout=self.epub_timeout)
            if not reply.ok:
                # Do not create a file for an error page.
                print('failed:%s' % (title + ':' + url))
                reply.close()
                return False
            # Open the file only once we know there is a body worth saving.
            with open(filename, 'wb') as handle:
                for block in reply.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)
        except requests.RequestException:
            # A single unreachable book must not abort the rest of the page.
            print('failed:%s' % (title + ':' + url))
            return False
        print('%s finished' % (title))
        return True
推荐阅读
热点文章
Discord.py(重写)on_member_update 无法正常工作
0
Discord.py 在 vc 中获取用户分钟数
0
discord.py 重写 |为我的命令出错
0
Discord.py rewrite 如何 DM 命令?
0
播放音频时,最后一部分被切断.如何解决这个问题?(discord.py)
0
在消息删除消息 Discord.py
0
如何使 discord.py 机器人私人/直接消息不是作者的人?
0
(Discord.py) 如何获取整个嵌入内容?
0
Discord bot 尽管获得了许可,但不能提及所有人
0
Discord.py discord.NotFound 异常
0
