Scrapy: crawling multiple URLs, sending POST requests, switching proxy IPs, and setting the log level


—Crawling multiple URLs—

import scrapy
from many_url_crawl.items import ManyUrlCrawlItem

# Crawl the next N pages of data
class HandleSpider(scrapy.Spider):
    name = 'handle'
    # allowed_domains = ['www.qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    url = 'https://www.qiushibaike.com/text/page/%d/'
    pageNum = 1

    def parse(self, response):
        div_list = response.xpath('//*[@id="content-left"]/div')

        for i in div_list:
            author = i.xpath('.//div[@class="author clearfix"]/a[2]/h2/text()').extract()[0]
            content = i.xpath('.//div[@class="content"]/span/text()').extract()[0]
            item = ManyUrlCrawlItem()
            item['author'] = author
            item['content'] = content
            yield item

        if self.pageNum <= 13:  # stop condition; otherwise this recurses forever
            self.pageNum += 1
            print('Crawling page %d' % self.pageNum)
            new_url = self.url % self.pageNum
            # callback: the function that parses the new page's response
            yield scrapy.Request(url=new_url, callback=self.parse)
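For reference, the ManyUrlCrawlItem the spider imports only needs the two fields populated above. A minimal sketch of many_url_crawl/items.py (assumed; the original post does not show this file):

import scrapy

class ManyUrlCrawlItem(scrapy.Item):
    # The two fields populated by HandleSpider.parse
    author = scrapy.Field()
    content = scrapy.Field()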


—Sending POST requests—

import scrapy

class ScrPostSpider(scrapy.Spider):
    name = 'scr_post'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://fanyi.baidu.com/translate']

    # start_requests is a method of the parent class; by default it sends a
    # GET request for every element in start_urls. There are two ways to
    # issue POST requests instead:
    #   1. Pass method='POST' to scrapy.Request
    #   2. Use scrapy.FormRequest (recommended)
    def start_requests(self):
        data = {'query': 'dog'}
        for url in self.start_urls:
            # formdata: a dict of the request parameters
            yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)

    def parse(self, response):
        print(response.text)
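For comparison, option 1 (a plain scrapy.Request with method='POST') makes you encode the form body and set the Content-Type header yourself, which is why FormRequest is the recommended route. A sketch of the same start_requests written that way:

import scrapy
from urllib.parse import urlencode

# Drop-in replacement for ScrPostSpider.start_requests using option 1
def start_requests(self):
    data = {'query': 'dog'}
    for url in self.start_urls:
        # FormRequest would handle this encoding and header automatically
        yield scrapy.Request(
            url=url,
            method='POST',
            body=urlencode(data),
            headers={'Content-Type': 'application/x-www-form-urlencoded'},
            callback=self.parse,
        )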


—Switching the proxy IP in middlewares.py—

# Define a custom downloader middleware class and implement the
# process_request method (called for every request the middleware intercepts)
class MyProxy(object):
    def process_request(self, request, spider):
        # Route the request through a proxy to change its outgoing IP
        request.meta['proxy'] = 'http://120.210.219.101:8080'
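Hard-coding a single address defeats the purpose of switching IPs once that proxy dies; a common variant picks from a pool on every request. A minimal sketch (the pool addresses are placeholders, not verified proxies):

import random

class MyProxy(object):
    # Placeholder proxy pool; fill in with addresses that actually work
    PROXY_POOL = [
        'http://120.210.219.101:8080',
        'http://112.85.164.30:9999',
    ]

    def process_request(self, request, spider):
        # Choose a different proxy for each outgoing request
        request.meta['proxy'] = random.choice(self.PROXY_POOL)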

Then enable the middleware in settings.py (around line 55 of the default template):

DOWNLOADER_MIDDLEWARES = {
'proxy.middlewares.MyProxy': 543,
}
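The key is the middleware's dotted path (here the project is named proxy); the value 543 is its priority among other downloader middlewares: for process_request, middlewares with lower numbers run first.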

—Setting the log level—

Set LOG_LEVEL = 'ERROR' in settings.py; Scrapy will then only emit messages at ERROR severity and above.
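A related standard setting is LOG_FILE, which redirects the log to a file instead of the console:

# settings.py
LOG_LEVEL = 'ERROR'     # suppress everything below ERROR severity
LOG_FILE = 'crawl.log'  # optional: write log output to a file instead of stderr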

Reposted from: https://www.cnblogs.com/zhuzhiwei-2019/p/10915174.html

