**1. Create the project**
```
E:\pythonText>scrapy startproject newdongguan
```
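The command generates a project skeleton roughly like the one below (exact files vary slightly by Scrapy version; for example, middlewares.py only appears in newer releases):
```
newdongguan/
    scrapy.cfg            # deployment configuration
    newdongguan/
        __init__.py
        items.py          # item definitions (step 5)
        middlewares.py
        pipelines.py      # item pipelines (step 4)
        settings.py       # project settings (step 3)
        spiders/
            __init__.py
```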
**2. Generate the crawl spider**
```
E:\pythonText\newdongguan>scrapy genspider -t crawl dongdong "wz.sun0769.com"
```
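Using the crawl template, genspider drops a CrawlSpider skeleton into newdongguan/spiders/dongdong.py, roughly like the sketch below (the exact placeholder content depends on the Scrapy version); step 6 replaces it with the real rules and parsing logic:
~~~
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class DongdongSpider(CrawlSpider):
    name = 'dongdong'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/']

    rules = (
        # Placeholder rule generated by the template
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item
~~~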
**3. settings.py configuration**
~~~
# Register the pipeline that writes scraped items to a JSON file (step 4);
# the number is the pipeline's priority
ITEM_PIPELINES = {
    'newdongguan.pipelines.NewdongguanPipeline': 300,
}

# Send log output to a file instead of the console
LOG_FILE = "sun.log"
LOG_LEVEL = "DEBUG"
~~~
**4. pipelines.py**
~~~
import codecs
import json

class NewdongguanPipeline(object):
    def __init__(self):
        # Open the output file with UTF-8 encoding
        self.filename = codecs.open("dongguan.json", "w", encoding="utf-8")
        # self.filename = open("dongguan.json", "w")

    def process_item(self, item, spider):
        # Serialize each item as one JSON object per line
        json_text = json.dumps(dict(item), ensure_ascii=False) + "\r\n"
        self.filename.write(json_text)
        return item

    def close_spider(self, spider):
        # Close the output file when the spider finishes
        self.filename.close()
~~~
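Because process_item writes one JSON object per line, the output is easy to check after a crawl. A minimal sketch for reading it back (assuming the spider has already run and dongguan.json exists in the working directory):
~~~
import codecs
import json

# Read dongguan.json back, one JSON object per line
with codecs.open("dongguan.json", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            post = json.loads(line)
            print(post["title"])
~~~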
**5. items.py**
~~~
import scrapy

class NewdongguanItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    num = scrapy.Field()
    content = scrapy.Field()
    url = scrapy.Field()
~~~
**6. The spider file**
~~~
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from newdongguan.items import NewdongguanItem

class DongdongSpider(CrawlSpider):
    name = 'dongdong'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']
    rules = (
        # The r prefix marks a raw string, so backslashes in the pattern need no extra escaping.
        # process_links rewrites the extracted list-page URLs before they are requested.
        Rule(LinkExtractor(allow=r'type=4'), process_links='deal_links'),
        Rule(LinkExtractor(allow=r'/html/question/\d+/\d+\.shtml'), callback="parse_item")
    )

    # links is the list of links extracted from the current response
    def deal_links(self, links):
        for link in links:
            # print link
            link.url = link.url.replace("?", "&").replace("Type&", "Type?")
        return links

    def parse_item(self, response):
        item = NewdongguanItem()
        item['title'] = response.xpath('//div[@class="wzy1"]//td[2]/span[1]/text()').extract()[0]
        item['num'] = response.xpath('//div[@class="wzy1"]//td[2]/span[2]/text()').extract()[0].split(":")[-1]
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # If the first XPath matched nothing, fall back to the alternative content node
        if len(content) == 0:
            content = response.xpath('//div[@class="wzy1"]//tr[1]/td[@class="txt16_3"]/text()').extract()
        # Join the text fragments and strip non-breaking spaces
        item['content'] = "".join(content).replace(u'\xa0', u'')
        item['url'] = response.url
        yield item
~~~
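With everything in place, the crawl-rule version is started from the project directory; log output goes to sun.log per the settings in step 3:
```
E:\pythonText\newdongguan>scrapy crawl dongdong
```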
**Second approach: plain scrapy.Spider with manual paging**
~~~
# -*- coding: utf-8 -*-
import scrapy
from newdongguan.items import NewdongguanItem

class DongdongSpider(scrapy.Spider):
    name = 'xixi'
    allowed_domains = ['wz.sun0769.com']
    url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page='
    offset = 0
    start_urls = [url + str(offset)]

    # Handle the content of the current list page
    def parse(self, response):
        # All post links on the current page
        links = response.xpath('//div[@class="greyframe"]/table//td/a[@class="news14"]/@href').extract()
        # Iterate over the extracted links
        for link in links:
            # Request each post and let self.parse_item handle the response
            yield scrapy.Request(link, callback=self.parse_item)
        # Until the termination condition is reached, keep incrementing offset
        # and request the next list page, handled again by parse
        if self.offset <= 71160:
            self.offset += 30
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

    # Handle the content of each detail page
    def parse_item(self, response):
        item = NewdongguanItem()
        item['title'] = response.xpath('//div[@class="wzy1"]//td[2]/span[1]/text()').extract()[0]
        item['num'] = response.xpath('//div[@class="wzy1"]//td[2]/span[2]/text()').extract()[0].split(":")[-1]
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # If the first XPath matched nothing, fall back to the alternative content node
        if len(content) == 0:
            content = response.xpath('//div[@class="wzy1"]//tr[1]/td[@class="txt16_3"]/text()').extract()
        # Join the text fragments and strip non-breaking spaces
        item['content'] = "".join(content).replace(u'\xa0', u'')
        item['url'] = response.url
        yield item
~~~
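This version is registered under the name xixi, so it is run as:
```
E:\pythonText\newdongguan>scrapy crawl xixi
```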