—Multi-URL crawling—
import scrapy
from many_url_crawl.items import ManyUrlCrawlItem

# Crawls the first N pages of listings
class HandleSpider(scrapy.Spider):
    name = 'handle'
    # allowed_domains = ['www.qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    url = 'https://www.qiushibaike.com/text/page/%d/'
    pageNum = 1

    def parse(self, response):
        div_list = response.xpath('//*[@id="content-left"]/div')
        for i in div_list:
            author = i.xpath('.//div[@class="author clearfix"]/a[2]/h2/text()').extract()[0]
            content = i.xpath('.//div[@class="content"]/span/text()').extract()[0]
            item = ManyUrlCrawlItem()
            item['author'] = author
            item['content'] = content
            yield item

        # Cap the page number, otherwise the recursion never ends
        if self.pageNum <= 13:
            self.pageNum += 1
            print('Crawling page %d' % self.pageNum)
            new_url = self.url % self.pageNum
            # callback: the function that parses the response of this request
            yield scrapy.Request(url=new_url, callback=self.parse)
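The spider imports ManyUrlCrawlItem from the project's items.py, which is not shown in the original. Given the two fields the spider populates, the item class presumably looks like this minimal sketch:

import scrapy

class ManyUrlCrawlItem(scrapy.Item):
    # Fields match what HandleSpider assigns in parse()
    author = scrapy.Field()
    content = scrapy.Field()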
—Sending POST requests—
import scrapy
class ScrPostSpider(scrapy.Spider):
    name = 'scr_post'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://fanyi.baidu.com/translate']

    # start_requests() is inherited from the parent class; by default it sends
    # a GET request for every URL in start_urls. There are two ways to send a
    # POST request instead:
    # 1. Set the method parameter of scrapy.Request to POST
    # 2. Use FormRequest() (recommended)
    def start_requests(self):
        data = {
            'query': 'dog'
        }
        for url in self.start_urls:
            # formdata: a dict of the form parameters for the request
            yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)

    def parse(self, response):
        print(response.text)
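The comment above also names a first way: setting method='POST' on a plain scrapy.Request. The original only shows the FormRequest variant; here is a minimal sketch of the Request variant (the spider name and the hand-built form body are illustrative, not from the original):

from urllib.parse import urlencode
import scrapy

class ScrPostRawSpider(scrapy.Spider):
    name = 'scr_post_raw'
    start_urls = ['https://fanyi.baidu.com/translate']

    def start_requests(self):
        for url in self.start_urls:
            # method='POST' turns a plain Request into a POST; unlike
            # FormRequest, the body and Content-Type must be set by hand
            yield scrapy.Request(
                url=url,
                method='POST',
                body=urlencode({'query': 'dog'}),
                headers={'Content-Type': 'application/x-www-form-urlencoded'},
                callback=self.parse,
            )

    def parse(self, response):
        print(response.text)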
—In middlewares.py—
# Define a custom downloader middleware class and implement process_request
# (it handles every request the middleware intercepts)
class MyProxy(object):
    def process_request(self, request, spider):
        # Swap in a proxy IP for this request
        request.meta['proxy'] = 'http://120.210.219.101:8080'
Enable it around line 55 of settings.py:
DOWNLOADER_MIDDLEWARES = {
'proxy.middlewares.MyProxy': 543,
}
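The middleware above pins every request to a single hard-coded proxy. A common variation (an extension, not part of the original) is to rotate through a pool of proxies; a minimal sketch, with a placeholder address in the pool:

import random

class RotatingProxy(object):
    # Proxy pool; the second entry is a placeholder, fill in real proxies
    PROXIES = [
        'http://120.210.219.101:8080',
        'http://proxy.example.com:8080',
    ]

    def process_request(self, request, spider):
        # Choose a proxy at random for each intercepted request
        request.meta['proxy'] = random.choice(self.PROXIES)

Register it in DOWNLOADER_MIDDLEWARES the same way as MyProxy above.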
—Setting the log level—
In settings.py, set LOG_LEVEL = 'ERROR'.
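For reference, Scrapy's log levels from most to least verbose are DEBUG, INFO, WARNING, ERROR, CRITICAL. In settings.py:

# settings.py
LOG_LEVEL = 'ERROR'  # only ERROR and CRITICAL messages are printed
# LOG_FILE = 'scrapy.log'  # optionally write the log to a file instead of stdout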
Reposted from: https://www.cnblogs.com/zhuzhiwei-2019/p/10915174.html