Preparation
First, create the project; the steps are the same as in my earlier blog post. One thing to point out: Scrapy performs I/O asynchronously and handles many HTTP requests at once, so writing all chapters into a single txt file in the right order is possible but a bit tedious. Since the goal here is just to get familiar with Scrapy and its crawling capabilities, I simply save each chapter as its own txt file (and of course there are plenty of ways to merge them afterwards, one of which is sketched below).
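As a side note, one such merge approach could look roughly like the sketch below. It is a post-processing script, not part of the Scrapy project, and it assumes the per-chapter txt files already sit in the storage directory configured later in settings.py; the output file name merged.txt is made up for illustration. Plain filename sorting is alphabetical rather than chapter order, which is exactly why producing one correctly ordered file takes extra work.

import os

# Hypothetical post-processing script, run after the crawl finishes.
store = r"D:\圣墟"                      # same directory as settings.STORE below
out_path = os.path.join(store, "merged.txt")

with open(out_path, "w") as book:
    # sorted() gives alphabetical order, not chapter order; a real merge would
    # have to parse the chapter number out of each title to sort correctly.
    for name in sorted(os.listdir(store)):
        if name.endswith(".txt") and name != "merged.txt":
            with open(os.path.join(store, name)) as chapter:
                book.write(name[:-4] + "\n")         # chapter title
                book.write(chapter.read() + "\n\n")  # chapter body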
Open the project in PyCharm and define a custom spider, TTSpider, which inherits from scrapy.Spider. The item fields are filled in gradually while writing the spider; the final item class ends up as shown below.
The contents of the individual script files are as follows.
items.py
import scrapy
class TiantianshuwuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # link of each chapter
    link_url = scrapy.Field()
    # title of each chapter
    dir_name = scrapy.Field()
    # text content of each chapter
    dir_content = scrapy.Field()
    # js file that holds the chapter content
    content_js_url = scrapy.Field()
settings.py
BOT_NAME = 'tiantianshuwu'
SPIDER_MODULES = ['tiantianshuwu.spiders']
NEWSPIDER_MODULE = 'tiantianshuwu.spiders'
ITEM_PIPELINES = {
    'tiantianshuwu.pipelines.TiantianshuwuPipeline': 300,
}

# Directory where the chapter txt files are stored
STORE = r"D:\圣墟"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.25
Writing the spider (with comments inline)
TTSpider.py
import scrapy
from tiantianshuwu.items import TiantianshuwuItem
class TTSpider(scrapy.Spider):
    name = "tianshu"
    def __init__(self):
        super().__init__()
        # base URL of the site
        self.server_link = 'http://www.ttshu.com'
        # restrict crawling to this domain
        self.allowed_domains = ['www.ttshu.com']
        # start URL: the chapter index page
        self.start_url = "http://www.ttshu.com/html/content/18424482.html"
    def start_requests(self):
        yield scrapy.Request(url=self.start_url, callback=self.parse1)
    # Parse out the link of every chapter
    def parse1(self, response):
        items = []
        # Locate the <a> tags that hold the chapter links; the hrefs form a list
        # of relative URLs, one per chapter
        chapter_urls = response.xpath(r'//td[@bgcolor="#F6F6F6"]/a/@href').extract()
        # Put each link into an item
        for index in range(len(chapter_urls)):
            item = TiantianshuwuItem()
            item["link_url"] = self.server_link + chapter_urls[index]
            items.append(item)
        # Send a request for each chapter link
        for item in items:
            yield scrapy.Request(url=item['link_url'], meta={"data": item}, callback=self.parse2)
    def parse2(self, response):
        # Retrieve the item passed along through meta
        item = response.meta['data']
        # Extract the chapter title from the <h1> tag
        item['dir_name'] = response.xpath(r'//h1/text()').extract()[0]
        # Extract the URL of the js file that holds the chapter text
        item['content_js_url'] = self.server_link + response.xpath(r'//p/script/@src').extract()[0]
        # Request the js file
        yield scrapy.Request(url=item['content_js_url'], meta={"data": item}, callback=self.parse3)
    # Clean up the decoded js string: strip the document.write wrapper and
    # replace the HTML tags/entities with plain-text equivalents
    def solve_text(self, content):
        content = content.replace("document.write('", "")
        content = content.replace("' ;", "")
        content = content.replace(")", " ")
        content = content.replace("</br>", "\n")
        content = content.replace("<br />", "\n")
        content = content.replace("<br><br>", "\n")
        content = content.replace("&nbsp;", " ")
        return content
    def parse3(self, response):
        item = response.meta["data"]
        # Decode the js response body (the site serves gb2312) and clean it up
        item['dir_content'] = self.solve_text(str(response.body.decode('gb2312', 'ignore')))
        yield item
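To see what solve_text actually does, here is a rough, self-contained illustration. The sample payload is made up (the real js file returned by ttshu.com may differ slightly), and the import path assumes the spider file sits at tiantianshuwu/spiders/TTSpider.py as named above.

from tiantianshuwu.spiders.TTSpider import TTSpider

# Hypothetical document.write payload, for illustration only
raw = "document.write('正文第一行<br />正文第二行&nbsp;&nbsp;继续' ;)"
print(TTSpider().solve_text(raw))
# 正文第一行
# 正文第二行  继续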
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from tiantianshuwu import settings
import os


class TiantianshuwuPipeline(object):
    def process_item(self, item, spider):
        # Create the storage directory if it does not exist yet
        if not os.path.exists(settings.STORE):
            os.makedirs(settings.STORE)
        # Write each chapter into its own txt file
        with open(settings.STORE + '\\' + item['dir_name'].strip() + ".txt", 'w') as f:
            f.write(item['dir_content'])
        return item
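The spider can then be launched with the usual `scrapy crawl tianshu` from the project root. Equivalently, it can be started from a small driver script; this is only a sketch, and it assumes the spider module path tiantianshuwu/spiders/TTSpider.py and that the script is run from the project root so the project settings are picked up.

# run.py - a minimal driver, equivalent to `scrapy crawl tianshu`
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# assumed module path; adjust if the spider file is named differently
from tiantianshuwu.spiders.TTSpider import TTSpider

process = CrawlerProcess(get_project_settings())
process.crawl(TTSpider)
process.start()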
Running this, all 1,161 chapters were crawled successfully. The files open and read fine, but some characters are still not handled correctly, which leaves room for improvement. The source code is available on my GitHub; feel free to star or fork it.