python 爬虫：requests抓取的页面信息和浏览器中看到的不一样

1,600次阅读

网址:https://www.rmfysszc.gov.cn/statichtml/rm_obj/108362.shtml

python

用 requests 请求到的网页内容与浏览器中看到的不一致，请求结果解析为：

</div>

# Demo script: load the same URL in a Selenium-driven Chrome window and with
# `requests`, then print the body returned to `requests` so it can be compared
# with what the browser shows.
from random import choice, randint  # both are used below to build the headers

import requests
from scrapy.http import HtmlResponse
from scrapy.selector import Selector
from selenium import webdriver

# Request headers: a User-Agent picked at random from a small pool, plus a
# randomly generated dotted-quad value for X-Forwarded-For.
headers = {
    "User-Agent": choice(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
            "Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
            "Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/78.0.3904.97 Safari/535.11",
        ]
    ),
    "X-Forwarded-For": ".".join(str(randint(0, 255)) for _ in range(4)),
}

url = "https://www.rmfysszc.gov.cn/statichtml/rm_obj/108362.shtml"

driver = webdriver.Chrome()
try:
    # Open the page in the browser for visual comparison.
    driver.get(url=url)

    # Fetch the same URL over plain HTTP.
    res = requests.get(url=url, headers=headers)
    # Use the encoding detected from the body instead of the header-declared one.
    res.encoding = res.apparent_encoding
    print(res.text)
finally:
    # Always close the browser window, even if the HTTP request fails.
    driver.close()

解决办法：

方法一:

可以通过使用HtmlResponse来获取状态及成交价

from scrapy.http import HtmlResponse
from scrapy.selector import Selector

# Fragment: relies on `url`, `driver` and `headers` defined by the preceding
# setup code, with the page already loaded in `driver`.
# Wrap the page source reported by the browser in a Scrapy HtmlResponse so it
# can be queried with XPath / CSS selectors.
rendered_html = driver.page_source
res = HtmlResponse(url=url, body=rendered_html, headers=headers, encoding="utf-8")

# Dump the whole #time1 element (status text plus the nested price <span>).
time1_nodes = res.xpath('//*[@id="time1"]')
print(time1_nodes.extract())
# Sample output:
# ['<div id="time1" style="width:100%; height:60px; text-indent:30px; font-size:14px; line-height:60px;color:#2f2f2f;font-family:\'SimSun\'">状态:成交; <span class="ti">成交价:65.213276万元</span></div>']

# Parse the response text once and reuse the selector for both lookups.
page = Selector(text=res.text)

# Direct text of div#time1 only (the nested <span> text is not included).
stats = page.css('div#time1::text').extract()
print(stats)  # ['状态:成交; ']

# Text of the nested span.ti element.
price = page.css('span.ti::text').extract()
print(price)  # ['成交价:65.213276万元']

driver.close()

方法二:

1、查找获取内容所在位置，找到class或者id属性

python

2、查找id或者class

python

3、点击2中的步骤2

python

从图中可以看出，页面数据是通过 Ajax 请求渲染的。

4、点击Network,选中ALL,重新加载页面，在查找栏输入3中的urlOne中的框部分

python

# Demo script: call the Finish.shtml endpoint directly and pull the state,
# price and time fields out of the callback-wrapped response with regexes.
import re
import time
from random import choice, randint, random  # `choice` is needed for the UA pool

import requests
from selenium import webdriver

# Request headers: a User-Agent picked at random from a small pool, plus a
# randomly generated dotted-quad value for X-Forwarded-For.
headers = {
    "User-Agent": choice(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
            "Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
            "Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/78.0.3904.97 Safari/535.11",
        ]
    ),
    "X-Forwarded-For": ".".join(str(randint(0, 255)) for _ in range(4)),
}

start_url = "https://www.rmfysszc.gov.cn/statichtml/rm_obj/108362.shtml"

# Random 22-digit number used in the callback name: jQuery<digits>_<timestamp>.
jsoncallback = randint(10 ** 21, 10 ** 22 - 1)
# Object id = the numeric file name of the detail page (the dot is escaped so
# only a literal ".shtml" suffix matches).
oid = re.findall(r'/([0-9]+)\.shtml', start_url)[0]
# Millisecond timestamps for the callback name and the `_` query parameter.
fir_time = int(time.time() * 1000)
sec_time = fir_time + 60  # the second timestamp must be greater than the first
# Random 8-digit value for the `pid` query parameter.
pid = randint(10 ** 7, 10 ** 8 - 1)

url = "https://www1.rmfysszc.gov.cn/Object/Finish.shtml?jsoncallback=jQuery{jsoncallback}_{fir_time}&oid={oid}&pid={pid}&_={sec_time}".format(
    jsoncallback=jsoncallback, fir_time=fir_time, oid=oid, pid=pid, sec_time=sec_time)

driver = webdriver.Chrome()
try:
    # Open the endpoint in the browser as well, for visual comparison.
    driver.get(url=url)

    res = requests.get(url=url, headers=headers)
    # Use the encoding detected from the body instead of the header-declared one.
    res.encoding = res.apparent_encoding
    print(res.text)
    # Sample output:
    # jQuery111101452328104477283_1578361521690({state:'0',time:'2019-12-24 10:20:15',price:'65.213276',fun1:'0'})

    # The payload in the sample uses unquoted keys inside a callback wrapper,
    # so the fields are extracted with regexes rather than a JSON parser.
    price = re.findall("price:'(.*?)'", res.text)[0]  # 65.213276
    stat = re.findall("state:'(.*?)'", res.text)[0]  # 0
    time_ = re.findall("time:'(.*?)'", res.text)[0]  # 2019-12-24 10:20:15
    print(stat, price, time_)  # 0 65.213276 2019-12-24 10:20:15
finally:
    # Always close the browser window, even if the request or parsing fails.
    driver.close()

关于AJAX数据爬取，参考文章https://www.cnblogs.com/zheng1076/p/11133695.html

神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试

发表于：Python爬虫

2022-10-25

# Python爬虫

复制链接

赏

python 爬虫：requests抓取的页面信息和浏览器中看到的不一样

相关文章：

HTTP代理设置详解：一步步配置指南

什么是Socks5代理IP及其优势

Socks5代理配置教程及注意事项

什么是代理服务器IP：如何选择合适的

国外代理服务器的优势及选择建议

如何找到可靠的免费代理服务器

在线代理服务器的使用与推荐

HTTP代理服务器的设置及应用实例

静态代理IP怎么填写：步骤与示例

海外静态IP的代理选择与配置