功能点:如何爬取列表页,并根据列表页获取详情页信息?
爬取网站:东莞阳光政务网(wz.sun0769.com)
完整代码:https://files.cnblogs.com/files/bookwed/yangguang.zip
主要代码:
yg.py
import scrapy from yangguang.items import YangguangItem
class YgSpider(scrapy.Spider):
    """Crawl the complaint list pages of the Dongguan "Sunshine" government
    affairs site (wz.sun0769.com), follow each row's link to its detail page,
    and yield one YangguangItem per complaint.
    """
    name = 'yg'
    allowed_domains = ['sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/report']

    def parse(self, response):
        """Parse one list page: extract per-row fields, request each detail
        page, then follow pagination."""
        # Each <tr> in the second table inside the grey frame is one complaint row.
        tr_list = response.xpath("//div[@class='greyframe']/table[2]//tr")
        for tr in tr_list:
            item = YangguangItem()
            item["title"] = tr.xpath("./td[2]/a[2]/text()").extract_first()
            item["href"] = tr.xpath("./td[2]/a[2]/@href").extract_first()
            item["status"] = tr.xpath("./td[3]/span/text()").extract_first()
            item["publish_time"] = tr.xpath("./td[last()]/text()").extract_first()
            # Header/separator rows yield no link (extract_first() -> None);
            # only follow rows with a real href.
            # Fix: isinstance() instead of the non-idiomatic type(...) == str.
            if isinstance(item["href"], str):
                # Request the detail page, carrying the partly-filled item
                # along in meta so parse_detail can finish it.
                yield scrapy.Request(
                    item["href"],
                    callback=self.parse_detail,
                    meta={"item": item},
                )
        # Pagination: the '>' anchor points at the next list page, absent on the last one.
        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_detail(self, response):
        """Parse a detail page: fill in body text and image URLs, then emit the item."""
        item = response.meta["item"]
        item["content"] = response.xpath(
            "//div[@class='wzy1']/table[2]//tr[1]/td[@class='txt16_3']/text()"
        ).extract()
        item["content_image"] = response.xpath(
            "//div[@class='wzy1']/table[2]//tr[1]/td[@class='txt16_3']//img/@src"
        ).extract()
        # Image srcs are site-relative; prefix the host to get absolute URLs.
        item["content_image"] = ["http://wz.sun0769.com" + i for i in item["content_image"]]
        yield item
pipelines.py
# NOTE(review): this module requires `import json` and `import re` at the top
# of pipelines.py — they are used below but not shown in the excerpt.
class YangguangPipeline(object):
    """Item pipeline: clean the scraped content and append each item as a
    JSON object (one per line, comma-terminated) to yangguang.json."""

    def __init__(self):
        # Keep one file handle open for the whole crawl; closed in close_spider.
        self.f = open('yangguang.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Clean item["content"] and write the item as one JSON line."""
        item["content"] = self.process_content(item["content"])
        # Fix: ',\n' — the blog stripped the backslash (',n'), which would
        # have fused all records onto one line with a stray 'n' between them.
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + ',\n')
        return item

    def process_content(self, content):
        """Strip non-breaking spaces and whitespace from each fragment and
        drop fragments that become empty."""
        # Fix: r"\xa0|\s" — the blog stripped the backslashes (r"xa0|s"),
        # which would have deleted every literal 's' and 'xa0' substring
        # instead of NBSP and whitespace characters.
        content = [re.sub(r"\xa0|\s", "", i) for i in content]
        return [i for i in content if len(i) > 0]

    def close_spider(self, spider):
        # Fix resource leak: Scrapy calls this when the spider finishes,
        # flushing and releasing the output file.
        self.f.close()
转载于:https://www.cnblogs.com/bookwed/p/10617789.html
神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试