第1章 网络爬虫入门
1.选择题
(1)B (2)A (3)D
2.简答题
(1)网络爬虫的基本工作流程如下(示意代码见简答题之后):
①预先设定一个或若干个初始网页URL,将初始URL加入到待爬取URL列表中;
②从待爬取列表中逐个读取URL,并将URL加入到已爬取URL列表中,然后下载网页;
③解析已下载的网页,并存储提取的数据,从中获取新的URL;
④将新的URL与已爬取URL列表进行比对,检查该网页是否已被爬取,如果没有被爬取,则将新的URL放入待爬取URL列表的末尾,等待读取;
⑤如此往复,直到待爬取URL列表为空或者满足设定的终止条件,最终达到遍历网页的目的。
(2)门户网站、搜索引擎和大型网络服务提供商是网络爬虫的主要应用场景;有些企业也会时常借助网络爬虫采集和分析数据;当然,普通用户也可以利用网络爬虫采集自己关注的数据。
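下面给出一段按(1)中流程编写的极简示意代码(初始URL、链接提取方式和终止条件均为假设,仅用于演示待爬取列表与已爬取列表的比对、去重和遍历逻辑):
import requests #导入requests模块
import re #导入re模块,此处仅用正则简单提取链接
import urllib.parse #导入parse模块,用于拼接相对链接
to_crawl = ['http://www.example.com/'] #待爬取URL列表(假设的初始URL)
crawled = [] #已爬取URL列表
while to_crawl: #待爬取列表为空则结束
    url = to_crawl.pop(0) #逐个读取URL
    crawled.append(url) #将URL加入已爬取列表
    try:
        html = requests.get(url, timeout=3).text #下载网页
    except requests.RequestException: #请求失败则跳过
        continue
    for href in re.findall(r'href="(.*?)"', html): #解析网页,提取新的URL
        new_url = urllib.parse.urljoin(url, href) #拼接完整URL
        #与已爬取和待爬取列表比对,未爬取过则放入待爬取列表末尾
        if new_url not in crawled and new_url not in to_crawl:
            to_crawl.append(new_url)
    if len(crawled) >= 10: #满足设定的终止条件则停止(示意)
        break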
3.实践题
print('*'*26)
print('我用Python编写网络爬虫。')
print('*'*26)
第2章 爬虫基础
1.选择题
(1)A (3)A (4)D (5)C (6)C
2.填空题
(1)request、error、parse和robotparser
(2)请求的网址、请求方法、请求头和请求体
(3)响应状态码、响应头和响应体
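下面用一段简短的示意代码串联上述填空内容(目标网址仅为示例,写法为常见用法而非唯一实现):
import urllib.request #导入request模块,发送请求
import urllib.error #导入error模块,处理请求异常
import urllib.parse #导入parse模块,处理URL
import urllib.robotparser #导入robotparser模块,解析robots.txt
url = 'https://www.python.org/'
params = urllib.parse.urlencode({'q': 'spider'}) #构造查询字符串
#构造Request对象:请求的网址、请求方法、请求头(GET请求一般无请求体)
req = urllib.request.Request(url + '?' + params, headers={'User-Agent': 'Mozilla/5.0'}, method='GET')
try:
    resp = urllib.request.urlopen(req, timeout=5) #发送请求
    print(resp.status) #响应状态码
    print(resp.getheaders()) #响应头
    print(resp.read()[:100]) #响应体(仅输出前100字节)
except urllib.error.URLError as e: #捕获请求异常
    print(e.reason)
rp = urllib.robotparser.RobotFileParser() #创建RobotFileParser对象
rp.set_url('https://www.python.org/robots.txt')
rp.read() #读取并解析robots.txt
print(rp.can_fetch('*', url)) #判断是否允许爬取该url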
3.实践题
import requests #导入requests模块
import time #导入time模块
headersvalue = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
'Cookie':'__jdu=1673555654;shshshfpa=565be4ca-07c2-cd67-78e4-92691c5c0e78-1590370825;shshshfpb=rVrKSmn2IcLo8w4GOV3s%2FvQ%3D%3D;unpl=V2_ZzNtbRAEREJxAUNcfEkLV2JQEl0RUkUUJQgTVnMdDgQzU0ZfclRCFnQUR1JnGFwUZwAZWEJcRhRFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHseWgdjBxZaRlJzJXI4dmR5GFUDYTMTbUNnAUEpAE9ceR5VSGcEFF9GU0cScQ12VUsa;__jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_cb6f58496afc4b11b470a1d395c0eae3|1593578499694;areaId=1;ipLoc-djd=1-2800-0-0;PCSYCityID=CN_110000_110100_110108;user-key=a7ecfb5e-4ce5-45f3-bc50-f34f38529f35; __jda=122270672.1673555654.1590370824.1593575769.1593578500.5; __jdc=122270672;shshshfp=a361ce6062d6acb4a213e3fccb63cea4;cart-main=xx;cd=0;wlfstk_smdl=o0y1hhq7lhtt3bmuyyrsq4frzsm2x25l;TrackID=1bBtBpjzT4KptbszIfvAs3fgoavSXZLPO-rjd9P-eg3J0f2YgpvBSjdIiKW0synUQAkjr8iSOKnvciMPU3XkYwhLdym1pPC6JJlsY52WVqIk;thor=53783C64FD0A997288CB6CC2D828217654920A82353860FFBC08C3E814CD72E23EC5E6849607FB867D56BB6BB176E21719C7A0E6ED254BD9BEFAE84419AE8A5164F65661B13E6E9FE5D6D9A1543D1E5ED2933AA8FF5FBE38555C48D1EE365B4B2437708D39597BAF689B54B61F3E88115CFAB7D39814771A44D458655F67F6DE109AD4D41497D61F1BA892BB9B996453;pinId=fGixLn-KPTpnavPdRtFWag;pin=linglanwangyi; unick=%E5%87%8C-%E8%93%9D; ceshi3.com=000;_tp=12hoQaUhHWSC8herrlonvg%3D%3D;_pst=linglanwangyi;cn=4;3AB9D23F7A4B3C9B=PTX54LWTNEJQ2FVWJLZFXG7JIEW2QC7F7FN2Q4EQD2RWZ4HKYWOJ6WA3MFHDAPSX2SHQLN27F57I5BPG4U3L6YGVAY;shshshsID=8230af78e8b1a5f089e9e71304a10a50_7_1593578796165; __jdb=122270672.12.1673555654|5.1593578500'
} #设置请求头User-Agent和Cookie参数
url = 'https://cart.jd.com/cart.action' #定义url字符串
for i in range(1,100): #循环发送请求
    #异常处理
    try:
        #发送请求,并将返回结果赋值给r
        r = requests.get(url, headers=headersvalue, timeout=1)
    except requests.Timeout: #捕获Timeout异常
        print('Timeout!') #输出“Timeout!”
    else:
        print(r.status_code) #输出响应状态码
        print(r.text) #输出响应内容
    time.sleep(1) #设置1s休眠时间
第3章 网页解析基础
1.选择题
(1)A (2)D (3)C (4)C (5)B (6)A (7)B (8)A
2.填空题
(1)//ul/*[@class='listmain']
(2)a[bcd]e
(3)loads()和dumps()
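下面的示意代码演示上述填空中XPath表达式、正则字符组和loads()/dumps()的用法(HTML片段与字符串均为假设的测试数据):
import json #导入json模块
import re #导入re模块
from lxml import etree #导入etree模块
html = etree.HTML('<ul><li class="listmain">Python</li><li>其他</li></ul>') #构造测试HTML
print(html.xpath('//ul/*[@class="listmain"]/text()')) #选取ul下class为listmain的子节点文本
print(re.findall('a[bcd]e', 'abe ace ade aee')) #[bcd]匹配b、c、d中的任意一个字符
s = '{"name": "爬虫", "page": 1}' #JSON格式字符串
data = json.loads(s) #loads():将JSON字符串转换为Python对象
print(data['name'])
print(json.dumps(data, ensure_ascii=False)) #dumps():将Python对象转换为JSON字符串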
3.实践题
(1)
import chardet #导入chardet模块
import requests #导入requests模块
import urllib.parse #导入parse模块
from lxml import etree #导入etree模块
#定义字符串novel_base_url
novel_base_url = 'http://www.biqukan.com'
#将合并的url赋值给novel_url
novel_url = urllib.parse.urljoin(novel_base_url, '/50_50096/')
chapter_url_list = [] #定义列表chapter_url_list
headersvalue = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
} #设置请求头User-Agent信息
#定义函数,获取小说每个章节的链接
def fetch_chapter_urls():
    #设置headers,发送请求,并将返回结果赋值给r
    r = requests.get(novel_url, headers=headersvalue)
    html = etree.HTML(r.text) #创建HTML对象html
    hrefs = html.xpath('//dd/a/@href') #选择a节点并提取href属性
    for href in hrefs: #遍历
        #判断href是否在chapter_url_list列表中
        if href not in chapter_url_list:
            #将合并的url赋值给chapter_url
            chapter_url = urllib.parse.urljoin(novel_base_url, href)
            #将chapter_url加入chapter_url_list列表中
            chapter_url_list.append(chapter_url)
    chapter_url_list.sort() #对chapter_url_list列表进行排序
#定义函数,获取每个章节的标题和正文
def parse_chapter(url):
    #设置headers,发送请求,并将返回结果赋值给r
    r = requests.get(url, headers=headersvalue)
    #检测返回内容编码类型
    code_type = chardet.detect(r.content)['encoding']
    #判断编码类型是否是GB2312,如果是,改变编码类型为GBK
    if code_type == 'GB2312':
        code_type = 'GBK'
    r.encoding = code_type #重定义返回内容编码类型
    html = etree.HTML(r.text) #创建HTML类对象html
    #选择h1节点并提取文本,将返回的列表第一项赋值给title
    title = html.xpath('//h1/text()')[0]
    #选择id属性值为“content”的节点并提取文本,将返回的列表赋值给contents
    contents = html.xpath('//*[@id="content"]/text()')
    content = '' #定义content字符串
    for i in contents: #遍历
        #移除字符串头尾的空格或换行符,与content相加并赋值给content
        content += i.strip()
    save_novel(title, content) #调用save_novel函数
#定义保存文件函数,将爬取的正文保存到txt文件中
def save_novel(title, content):
    #异常处理
    try:
        #打开文件
        with open(title+'.txt', 'w+', encoding='utf-8') as f:
            f.write(content.strip()) #写入文件
    except OSError as e: #捕获文件写入异常
        print(e) #输出异常原因
    else:
        print('下载完成:'+title) #输出下载完成提示
if __name__ == '__main__': #文件作为脚本直接执行
    fetch_chapter_urls() #调用fetch_chapter_urls函数
    for chapter in chapter_url_list: #遍历
        parse_chapter(chapter) #调用parse_chapter函数
(2)
import requests #导入requests模块
import csv #导入csv模块
import re #导入re模块
url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2020.html' #定义字符串url
headersvalue = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
} #设置请求头User-Agent信息
#发送请求,并将返回结果赋值给r
r = requests.get(url, headers=headersvalue)
r.encoding = 'utf-8' #重定义返回内容编码类型
html=r.text #将响应内容赋值给html
#编译正则表达式,并将返回的正则表达式对象赋值给pattern
pattern=re.compile('<tr.*?alt.*?<td>(.*?)</td>.*?<div.*?>(.*?)</div>.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>',re.S)
items=re.findall(pattern, html)#查找页面中所有符合条件的字符串
#打开school_rank.csv文件写入数据
with open('school_rank.csv', 'w', newline='') as file:
    writer = csv.writer(file) #初始化writer对象
    writer.writerow(['排名', '学校名称', '省市', '学校类型', '总分', '办学层次得分', '学科水平得分', '办学资源得分', '师资规模与结构得分', '人才培养得分', '科研研究得分', '服务社会得分', '高端人才得分', '重大项目与成果得分', '国际竞争力得分']) #写入一行表头
    writer.writerows(items) #写入多行数据
第4章 爬取动态加载数据
1.选择题
(1)A (2)B (3)A (4)D (5)B
2.填空题
(1)XHR和JS
(2)find_element_by_xpath()
(3)地址和端口
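下面给出一段使用find_element_by_xpath()并为浏览器设置代理地址和端口的示意代码(代理IP为假设值,Selenium写法沿用本书使用的旧版API):
from selenium import webdriver #导入webdriver模块
options = webdriver.ChromeOptions() #创建浏览器配置对象
options.add_argument('--proxy-server=http://121.232.148.167:9000') #设置代理的地址和端口(示例值)
browser = webdriver.Chrome(options=options) #使用该配置启动浏览器
browser.get('https://www.baidu.com') #打开页面
input_tag = browser.find_element_by_xpath('//input[@id="kw"]') #通过XPath定位搜索框节点
input_tag.send_keys('网络爬虫') #输入关键字
browser.quit() #关闭浏览器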
3.实践题
(1)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import re
#如需无界面模式,可去掉下面的注释(需先导入Options类)
'''from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)'''
browser = webdriver.Chrome()
browser.maximize_window()
wait = WebDriverWait(browser, 10)
def search(keyword):
    browser.get('https://search.jd.com/')
    input_ = wait.until(EC.presence_of_element_located((By.ID, 'keyword')))
    submit = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "input_submit")))
    input_.clear()
    input_.send_keys(keyword)
    submit.click()
    #滑到最底端
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    #总页数
    number = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.p-skip b'))).text
    return number
def change_page(page):
    print("正在爬第", page, "页")
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(3)
    page_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.p-skip input')))
    page_box.clear()
    page_box.send_keys(str(page))
    submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.p-skip .btn')))
    submit.click()
    #检查是否加载成功
    wait.until(EC.text_to_be_present_in_element_value((By.CSS_SELECTOR, '.p-skip input'), str(page)))
def get_comment(link):
    product_id = re.search(r"https://item.jd.com/(\d+)\.html#comment", link).group(1)
    browser.get(link)
    count = 0
    file = open("JD_%s_comments.txt" % product_id, "a", encoding='utf-8')
    while True:
        try:
            if count % 10 == 0:
                time.sleep(3)
            browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#comment .comments-list [data-tab=item] .comment-con")))
            soup = BeautifulSoup(browser.page_source, 'lxml')
            url_list = soup.select("#comment .comments-list [data-tab=item] .comment-con")
            for url in url_list:
                file.write(url.text.strip() + "\n")
            count += 1
            next_page = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#comment .ui-page .ui-pager-next")))
            browser.execute_script("arguments[0].click();", next_page)
        except TimeoutException:
            print("已爬取", count, "页评论")
            file.close()
            break
if __name__ == '__main__':
    number = search("口罩")
    link_list = []
    for page in range(1, int(number) + 1):
        change_page(page)
        time.sleep(3)
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.gl-item .p-name [target=_blank]')))
        url_list = browser.find_elements_by_css_selector(".gl-item .p-name [target=_blank]")
        for url in url_list:
            link_list.append(url.get_attribute("href") + "#comment")
    for link in link_list:
        get_comment(link)
(2)
import requests #导入requests模块
import json #导入json模块
import pymysql #导入mysql模块
import time #导入time模块
url = 'http://www.bjjqe.com/admin_1/json.php'
datavalue={
'act': 'index_boutique_replace',
'boutique_type': '4'
}
#连接MySQL
db = pymysql.connect(host='localhost', user='root', password='123456', port=3306)
#使用cursor()方法获取操作游标
cursor = db.cursor()
#创建数据库product_sql
cursor.execute('CREATE DATABASE IF NOT EXISTS product_sql Character Set GBK')
db.close() #断开连接
#连接MySQL,并选择product_sql数据库
db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='product_sql')
#使用cursor()方法获取操作游标
cursor = db.cursor()
#创建表products
sql = 'CREATE TABLE IF NOT EXISTS products (bookName CHAR(100), author CHAR(100), price CHAR(20), publish_company CHAR(50))'
cursor.execute(sql) #执行SQL语句
#发送HTTP请求
return_data = requests.post(url, data=datavalue).text
data = json.loads(return_data) #对HTTP响应的数据JSON化
news = data['goods_result'] #索引到需要爬取的内容信息
for n in news: #对索引出来的JSON数据进行遍历和提取
    bookName = n['title']
    author = n['editor']
    price = n['price']
    publish_company = n['publishing']
    print('书名:', bookName, '作者:', author, '价格:', price, '出版社:', publish_company)
    product = (bookName, author, price, publish_company)
    try:
        sql = 'INSERT INTO products(bookName, author, price, publish_company) VALUES(%s, %s, %s, %s)'
        cursor.execute(sql, product) #执行SQL语句插入数据
        db.commit() #提交到数据库执行
        print('插入数据成功')
    except:
        db.rollback() #发生错误时回滚
        print('插入数据失败')
    time.sleep(1)
db.close() #断开连接
第5章 反爬虫策略
1.选择题
(1)B (2)B (3)C
2.填空题
(1)通过Headers反爬虫、基于用户行为反爬虫和采用动态网页反爬虫
(2)设置Headers、使用代理IP、降低请求频率、逆向分析请求页面和使用Selenium模拟浏览器
(3)time库
3.实践题
import time #导入time模块
import random #导入random模块
import requests #导入requests模块
from bs4 import BeautifulSoup #从bs4库中导入BeautifulSoup类
#定义base_url字符串
base_url='https://www.pythontab.com/html/pythonhexinbiancheng/'
headersvalue = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
} #设置请求头的User-Agent信息
#定义代理IP列表
proxiesvalue = [
{'http': 'http://121.232.148.167:9000'},
{'http': 'http://39.105.28.28:8118'},
{'http': 'http://113.195.18.133:9999'}
]
#定义函数获取每一页URL
def get_onepage_url(url):
    url_list = [] #定义列表
    #异常判断
    try:
        #设置代理IP,发送HTTP请求
        r = requests.get(url, headers=headersvalue, proxies=random.choice(proxiesvalue))
    except:
        print('请求失败') #请求错误,输出“请求失败”
    else:
        soup = BeautifulSoup(r.text, 'lxml') #初始化BeautifulSoup对象
        items = soup.select('#catlist li') #查找包含文章的li节点
        for item in items:
            url1 = item.select('a')[0].attrs['href'] #获取每篇文章的URL
            url_list.append(url1) #将URL添加到列表
    #设置随机休眠时间
    sleep_time = random.randint(0, 2) + random.random()
    time.sleep(sleep_time) #程序休眠sleep_time
    return url_list
#定义函数获取文章内容
def get_article(url):
    #异常判断
    try:
        #设置代理IP,发送HTTP请求
        r = requests.get(url, headers=headersvalue, proxies=random.choice(proxiesvalue))
    except:
        print('请求失败') #请求错误,输出“请求失败”
    else:
        soup = BeautifulSoup(r.text, 'lxml') #创建BeautifulSoup对象
        title = soup.select('#Article h1')[0].string #获取文章标题
        #获取文章内容
        content = soup.select('#Article .content')[0].text
        towrite(title, content) #调用towrite
    #设置随机休眠时间
    sleep_time = random.randint(0, 2) + random.random()
    time.sleep(sleep_time) #程序休眠sleep_time
#定义函数保存文章
def towrite(title, content):
    #定义string列表,存放文件命名时不能包含的特殊字符
    string = ['?', '*', ':', '"', '<', '>', '\\', '/', '|']
    for i in string:
        if i in title: #判断title中是否包含特殊字符
            #如果包含特殊字符,则替换为“#”
            title = title.replace(i, '#')
    try:
        #打开文件
        with open(title + '.txt', 'w+', encoding='utf-8') as f:
            f.write(content.strip()) #写入文件
    except: #捕获写入文件异常
        print('写入文件失败:' + title) #输出写入文件失败提示
    else:
        print('下载完成:' + title) #输出下载完成提示
if __name__ == '__main__':
    for i in range(1, 28): #循环
        if i > 1:
            url = base_url + str(i) + '.html' #组合网页URL
        else:
            url = base_url #第一页URL
        try:
            url_list = get_onepage_url(url) #调用get_onepage_url
        except: #捕获请求异常
            print('请求失败') #输出“请求失败”
        else:
            for url1 in url_list: #遍历
                get_article(url1) #调用get_article
第6章 模拟登录和处理验证码
1.实践题
(1)
from selenium import webdriver #导入webdriver模块
import time #导入time模块
#导入ActionChains模块
from selenium.webdriver import ActionChains
from PIL import Image #导入Image模块
import Chaojiying#导入Chaojiying模块
#初始化Google Chrome浏览器对象,并赋值给browser
browser = webdriver.Chrome()
browser.maximize_window() #浏览器最大化
#发送请求
browser.get('https://passport.bilibili.com/login')
time.sleep(0.5) #休眠0.5s
#定位节点并输入用户名
user_tag = browser.find_element_by_id('login-username')
user_tag.send_keys('15210985825') #输入用户名
#定位节点并输入密码
pwd_tag = browser.find_element_by_id('login-passwd')
pwd_tag.send_keys('123456qwerty') #输入密码
time.sleep(1) #休眠1s
browser.find_element_by_css_selector('[class="btn btn-login"]').click()
time.sleep(1) #休眠1s
#显示点触验证码图片的登录页面截图
browser.save_screenshot('./main.png')
#定位验证码图片节点
img_tag=browser.find_element_by_class_name('geetest_panel_next')
location = img_tag.location #获取图片在页面中的起始位置
size = img_tag.size #获取图片在页面中显示的大小
#存储验证码图片左上角和右下角的坐标
rangle = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))
i = Image.open('./main.png') #打开登录页面截图
frame = i.crop(rangle) #根据rangle进行验证码的裁剪
frame.save('./code.png') #保存验证码图片
#将验证码图片提交给超级鹰
result = Chaojiying.transform_codeImg('./code.png', 9004, 'lingyiyi', '123456qwerty', '907397')
print('超级鹰返回的坐标位置:', result) #输出超级鹰返回的坐标位置
all_list = [] #定义列表
list_1 = result.split('|') #将超级鹰返回的坐标位置以“|”切割
count_1 = len(list_1) #获取list_1的长度
for i in range(count_1):
    xy_list = [] #定义列表
    x = int(list_1[i].split(',')[0]) #获取x坐标
    y = int(list_1[i].split(',')[1]) #获取y坐标
    xy_list.append(x) #将x坐标添加到xy_list中
    xy_list.append(y) #将y坐标添加到xy_list中
    all_list.append(xy_list) #将xy_list添加到all_list中
print('转换后的坐标位置', all_list)
for loc in all_list:
    x = loc[0]
    y = loc[1]
    #通过定位到验证码图片进行位置滑动,perform立即执行
    ActionChains(browser).move_to_element_with_offset(img_tag, x, y).click().perform()
(2)
from selenium import webdriver #导入webdriver模块
from selenium.webdriver.common.by import By #导入By模块
#导入WebDriverWait模块
from selenium.webdriver.support.ui import WebDriverWait
#导入ActionChains 模块
from selenium.webdriver import ActionChains
#导入expected_conditions模块
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image #导入Image模块
from PIL import ImageFont#导入ImageFont模块
from PIL import ImageDraw#导入ImageDraw模块
from io import BytesIO #导入BytesIO模块
import Chaojiying#导入Chaojiying模块
import time #导入time模块
#初始化Google Chrome浏览器对象,并赋值给browser
browser = webdriver.Chrome()
border = 30 #定义滑块移动距离误差
#定义函数,给图片添加文字
def add_text_to_image(image):
    #设置字体样式
    font = ImageFont.truetype('simhei.ttf', 32)
    #在图片上添加文字“请点击缺口左上角”
    add_text = '请点击缺口左上角'
    draw = ImageDraw.Draw(image)
    draw.text((2, 120), add_text, '#FF0000', font=font)
    #保存图片
    image.save('snap.png')
#定义函数,获取验证码图片位置
def get_position():
    #获取验证码图片节点
    img = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '[class="geetest_window"]')))
    time.sleep(2) #休眠2s
    location = img.location #获取验证码图片坐标
    size = img.size #获取验证码图片大小
    #获取验证码图片位置
    top, bottom, left, right = location['y'], location['y'] + size['height'], location['x'], location['x'] + size['width']
    return (top, bottom, left, right) #返回验证码图片位置
#定义函数,获取图片并保存
def get_image():
    time.sleep(0.2) #休眠0.2s
    screenshot = browser.get_screenshot_as_png() #截取页面全屏图片
    #调用get_position函数
    top, bottom, left, right = get_position()
    screenshot = Image.open(BytesIO(screenshot)) #打开图片
    captcha = screenshot.crop((left, top, right, bottom)) #裁剪出验证码图片
    return captcha #返回图片
#定义函数,根据偏移模拟人为滑动轨迹,获取移动轨迹
def get_track(distance):
    track = [] #定义移动轨迹列表track
    current = 0 #初始化当前位移current
    mid = distance * 4 / 5 #初始化减速阈值mid
    t = 0.2 #初始化计算间隔t
    v = 0 #初始化初速度v
    while current < distance: #如果当前位移小于滑块偏移
        if current < mid: #如果当前位移小于减速阈值
            a = 2 #加速度为正2
        else: #如果当前位移大于减速阈值
            a = -3 #加速度为负3
        v0 = v #初始化初速度v0
        v = v0 + a * t #计算当前速度v
        move = v0 * t + 1 / 2 * a * t * t #计算移动距离move
        current += move #计算当前位移
        #将该时间段位移添加到移动轨迹列表
        track.append(round(move))
    return track #返回移动轨迹列表
#定义函数,拖动滑块到缺口处
def move_to_gap(slider, track):
    #按住滑块
    ActionChains(browser).click_and_hold(slider).perform()
    for x in track:
        #执行移动轨迹动作
        ActionChains(browser).move_by_offset(xoffset=x, yoffset=0).perform()
    time.sleep(1) #休眠1s
    ActionChains(browser).release().perform() #释放动作
#定义函数,实现验证操作
def crack():
    #调用get_image函数,截取带缺口图片
    image = get_image()
    add_text_to_image(image) #在图片上添加提示文字并保存
    #将验证码图片提交给超级鹰
    result = Chaojiying.transform_codeImg('./snap.png', 9101, 'lingyiyi', '123456qwerty', '907397')
    #根据超级鹰返回的坐标计算滑动距离
    gap = int(result.split(",")[0])
    distance = gap - border #减去滑块移动距离误差
    track = get_track(distance) #模拟人为滑动轨迹
    #获取滑块节点
    slider = wait.until(EC.element_to_be_clickable(
        (By.CLASS_NAME, 'geetest_slider_button')))
    move_to_gap(slider, track) #拖动滑块
    time.sleep(5) #休眠5s
if __name__ == '__main__':
    #请求BOSS直聘登录页面,打开一个浏览器窗口
    browser.get('https://login.zhipin.com/?ka=header-login')
    time.sleep(1) #休眠1s
    browser.maximize_window() #窗口最大化
    time.sleep(1) #休眠1s
    wait = WebDriverWait(browser, 10) #初始化WebDriverWait对象
    #点击验证按钮
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'geetest_wait'))).click()
    time.sleep(2) #休眠2s
    crack() #调用crack函数
第7章 爬取App和PC客户端
1.实践题
(1)
import requests
import json
headersvalue={
'Cookie': '1&_device=android&2354647e-4be8-3c87-b26c-5e33a91bc1b0&6.6.72;channel=and-d3;impl=com.ximalaya.ting.android;osversion=28;device_model=SEA-AL10;XUM=5P2h0sPL;XIM=312ea7019ebca;c-oper=%E4%B8%AD%E5%9B%BD%E7%A7%BB%E5%8A%A8;net-mode=WIFI;freeFlowType=0;res=1080%2C2259;NSUP=42e8aeb9%2C421fdc10%2C1599117494905;AID=rSUrJvkxLdw=;manufacturer=HUAWEI;XD=YrBEo/u5jJlB11rljobr+vkUi6YnAZSx8hFyh1gmtdKEmRDnqx3cy9eozH5gjlMk/rkaXTaqBETMu4cd7ZneKElTaJEPETRMzaT2HTiN5JBlgRKnhLIX3smzSPWhEgv5P6DXGNGNv2+xdH8m3BpvY5Sj/6m7OeIW/OFFSBD05/E=;umid=aied88addff903ebd606cd7b8156776b8d;xm_grade=0;minorProtectionStatus=0;oaid=feeefcdb-effa-9bba-5e55-85ed7efd65af;newChannelId=yz-huawei;fp=00921715752122q22v64vv55080000;domain=.ximalaya.com;path=/;',
'Cookie2': '$version=1',
'Accept': '*/*',
'user-agent': 'ting_6.6.72(SEA-AL10,Android28)',
'Host': 'mobwsa.ximalaya.com',
'Connection': 'Keep-Alive',
'Accept-Encoding': 'gzip'
}
requests.packages.urllib3.disable_warnings()
for i in range(5):
    url = 'http://mobwsa.ximalaya.com/discovery-ranking-web/v3/category/concreteRankList/ts-1599118668274?device=android&isAnchor=false&pageSize=20&rankingListId=51&version=6.6.72'
    url = url + '&pageId=' + str(i+1)
    r = requests.get(url, headers=headersvalue, verify=False) #发送请求,不验证SSL证书
    json_data = json.loads(r.text)['data']['list']
    for data in json_data:
        print(data['title'], data['intro'])
(2)
import requests
import json
headersvalue={
'Host': 'mp.weixin.qq.com',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.5 WindowsWechat',
'X-Requested-With': 'XMLHttpRequest',
'Accept': '*/*',
'Referer':'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzI4NjIxNTc2OA==&uin=MTE1MTAwMjM4Mg%3D%3D&key=281d398fa0af70c9f36243839c648f99351ec37d937e93339c8405b43dbe6dcc5343d63a12abb3975e17d9489436e3bcc2799ff2e02581a73412320cf4cd5cdc2fe44a45b343c2e1d972fa2521a2d0cde91e18f37374a9c705d46e0d0880a937f3989dfe204bb53c573b1cddeb2fdb301a61b47b99fa44ec31d8a42c87f6b5c7&devicetype=Windows+10+x64&version=62090538&lang=zh_CN&a8scene=7&pass_ticket=c0WoudB5pZm9Qn4wsO4orgJHjAEBPQvM6hhNNIdEt0TWozisRjt8SpK%2B8N%2FYMBuC&winzoom=1',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4',
'Cookie': 'rewardsn=; wxtokenkey=777; wxuin=1151002382; devicetype=android-28; version=27000d39;lang=zh_CN;pass_ticket=c0WoudB5pZm9Qn4wsO4orgJHjAEBPQvM6hhNNIdEt0TWozisRjt8SpK8N/YMBuC;wap_sid2=CI7O66QEEooBeV9IRVJ4aXY0UmVTVlF5OUV4bV9kcERHRDg1NGY5TEJtQk5YZktZT2x0ZlQ0VFB1bUFhV0tXVjUwY1pSUHBhRHpXZkhPSmt4QWMxSVNYNGlQdFJRZW5RMXhrSWZEQWs5dlhSTTlCZjF3NjZLM294eS1SR2xERGxqeUhob2xxYkE3bHJ4UVNBQUF+MKLci/sFOA1AlU4='
}
requests.packages.urllib3.disable_warnings()
for i in range(99):
    #多行字符串用括号拼接成完整的请求URL
    url = ('https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzI4NjIxNTc2OA'
           '==&f=json&count=10&is_ok=1&scene=&uin=MTE1MTAwMjM4Mg%3D%3D&key='
           '281d398fa0af70c9f36243839c648f99351ec37d937e93339c8405b43dbe6dc'
           'c5343d63a12abb3975e17d9489436e3bcc2799ff2e02581a73412320cf4cd5c'
           'dc2fe44a45b343c2e1d972fa2521a2d0cde91e18f37374a9c705d46e0d0880a'
           '937f3989dfe204bb53c573b1cddeb2fdb301a61b47b99fa44ec31d8a42c87f6'
           'b5c7&pass_ticket=c0WoudB5pZm9Qn4wsO4orgJHjAEBPQvM6hhNNIdEt0TWoz'
           'isRjt8SpK%2B8N%2FYMBuC&wxtoken=&appmsg_token=1079_D4vhykniky0g3'
           'R6fLrWwBFfxFIpIapUodqy4iQ~~&x5=0&f=json')
    url = url + '&offset=' + str(i)
    r = requests.get(url, headers=headersvalue, verify=False)
    json_data = json.loads(r.text)
    general_msg_list = json.loads(json_data['general_msg_list'])
    title = general_msg_list['list'][0]['app_msg_ext_info']['title']
    print(title)
    content_url = general_msg_list['list'][0]['app_msg_ext_info']['content_url']
    print(content_url)