《python网络爬虫》1

1,841次阅读
没有评论
《python网络爬虫》1

第1章  网络爬虫入门

1.选择题

(1)B (2)A (3)D

2.简答题

(1)预先设定一个或若干个初始网页URL,将初始URL加入到待爬取URL列表中;从待爬取列表中逐个读取URL,并将URL加入到已爬取URL列表中,然后下载网页;解析已下载的网页,并存储提取的数据,从中获取新的URL;将新的URL在已爬取的URL列表中进行比对,检查该网页是否已爬取,如果网页没有被爬取,则将新的URL地址放入到待爬取URL列表的末尾,等待读取;如此往复,直到待爬取URL列表为空或者满足设定的终止条件,最终达到遍历网页的目的。

(2)门户网站、搜索引擎和大型网络服务提供商是网络爬虫的主要应用场景;有些企业也会时常借助网络爬虫采集和分析数据;当然,普通用户也可以利用网络爬虫采集自己关注的数据。

3.实践题

# Print the message framed by two identical lines of 26 asterisks.
banner = '*' * 26
print(banner)
print('我用Python编写网络爬虫。')
print(banner)

第2章  爬虫基础

1.选择题

(1)A (3)A (4)D (5)C (6)C

2.填空题

(1)request、error、parse和robotparser

(2)请求的网址、请求方法、请求头和请求体

(3)响应状态码、响应头和响应体

3.实践题

import requests #导入requests模块

import time #导入time模块

headersvalue = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',

'Cookie':'__jdu=1673555654;shshshfpa=565be4ca-07c2-cd67-78e4-92691c5c0e78-1590370825;shshshfpb=rVrKSmn2IcLo8w4GOV3s%2FvQ%3D%3D;unpl=V2_ZzNtbRAEREJxAUNcfEkLV2JQEl0RUkUUJQgTVnMdDgQzU0ZfclRCFnQUR1JnGFwUZwAZWEJcRhRFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHseWgdjBxZaRlJzJXI4dmR5GFUDYTMTbUNnAUEpAE9ceR5VSGcEFF9GU0cScQ12VUsa;__jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_cb6f58496afc4b11b470a1d395c0eae3|1593578499694;areaId=1;ipLoc-djd=1-2800-0-0;PCSYCityID=CN_110000_110100_110108;user-key=a7ecfb5e-4ce5-45f3-bc50-f34f38529f35; __jda=122270672.1673555654.1590370824.1593575769.1593578500.5; __jdc=122270672;shshshfp=a361ce6062d6acb4a213e3fccb63cea4;cart-main=xx;cd=0;wlfstk_smdl=o0y1hhq7lhtt3bmuyyrsq4frzsm2x25l;TrackID=1bBtBpjzT4KptbszIfvAs3fgoavSXZLPO-rjd9P-eg3J0f2YgpvBSjdIiKW0synUQAkjr8iSOKnvciMPU3XkYwhLdym1pPC6JJlsY52WVqIk;thor=53783C64FD0A997288CB6CC2D828217654920A82353860FFBC08C3E814CD72E23EC5E6849607FB867D56BB6BB176E21719C7A0E6ED254BD9BEFAE84419AE8A5164F65661B13E6E9FE5D6D9A1543D1E5ED2933AA8FF5FBE38555C48D1EE365B4B2437708D39597BAF689B54B61F3E88115CFAB7D39814771A44D458655F67F6DE109AD4D41497D61F1BA892BB9B996453;pinId=fGixLn-KPTpnavPdRtFWag;pin=linglanwangyi; unick=%E5%87%8C-%E8%93%9D; ceshi3.com=000;_tp=12hoQaUhHWSC8herrlonvg%3D%3D;_pst=linglanwangyi;cn=4;3AB9D23F7A4B3C9B=PTX54LWTNEJQ2FVWJLZFXG7JIEW2QC7F7FN2Q4EQD2RWZ4HKYWOJ6WA3MFHDAPSX2SHQLN27F57I5BPG4U3L6YGVAY;shshshsID=8230af78e8b1a5f089e9e71304a10a50_7_1593578796165; __jdb=122270672.12.1673555654|5.1593578500'

} #设置请求头User-Agent和Cookie参数

url = 'https://cart.jd.com/cart.action'  # page requested on every attempt

# Request the page 99 times, sleeping one second after each attempt.
for attempt in range(1, 100):
    try:
        # Send the request with the custom headers; wait at most 1 s.
        r = requests.get(url, headers=headersvalue, timeout=1)
    except requests.Timeout:
        print('Timeout!')
    except requests.RequestException as exc:
        # Other network failures are reported instead of aborting the loop.
        print('Request failed:', exc)
    else:
        print(r.status_code)  # response status code
        print(r.text)  # response body
    time.sleep(1)

第3章  网页解析基础

1.选择题

(1)A (2)D (3)C (4)C (5)B (6)A (7)B (8)A

2.填空题

(1)//ul/*[@class='listmain']

(2)a[bcd]e

(3)loads()和dumps()

3.实践题

(1)

import chardet #导入chardet模块

import requests #导入requests模块

import urllib.parse #导入parse模块

from lxml import etree #导入etree模块

# Base address of the novel site

novel_base_url = 'http://www.biqukan.com'

# Absolute URL of the novel's index page (base address joined with its path)

novel_url = urllib.parse.urljoin(novel_base_url, '/50_50096/')

chapter_url_list = [] # list that fetch_chapter_urls fills with chapter URLs

headersvalue = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',

} # request headers: only the User-Agent is set

# Collect the link of every chapter of the novel.
def fetch_chapter_urls():
    """Fill ``chapter_url_list`` with the absolute URL of each chapter.

    Requests ``novel_url`` with ``headersvalue``, reads the ``href`` of
    every ``//dd/a`` node, joins it with ``novel_base_url`` and appends the
    result to the module-level ``chapter_url_list`` unless it is already
    present. The list is sorted in place afterwards. Returns nothing.
    """
    r = requests.get(novel_url, headers=headersvalue)
    html = etree.HTML(r.text)
    hrefs = html.xpath('//dd/a/@href')
    for href in hrefs:
        chapter_url = urllib.parse.urljoin(novel_base_url, href)
        # The list stores joined URLs, so the duplicate check must compare
        # the joined URL; comparing the raw href would never match.
        if chapter_url not in chapter_url_list:
            chapter_url_list.append(chapter_url)
    chapter_url_list.sort()

# Fetch the title and body text of one chapter.
def parse_chapter(url):
    """Download the chapter at ``url`` and hand its text to ``save_novel``.

    The response encoding is detected with ``chardet``; a detected
    ``GB2312`` is replaced by ``GBK`` before decoding. The title is the
    first ``//h1`` text node and the body is every text node under the
    element whose id is ``content``, each stripped and concatenated.
    A page without an ``<h1>`` text is reported and skipped.
    """
    r = requests.get(url, headers=headersvalue)
    code_type = chardet.detect(r.content)['encoding']
    if code_type == 'GB2312':
        code_type = 'GBK'
    r.encoding = code_type
    html = etree.HTML(r.text)
    titles = html.xpath('//h1/text()')
    if not titles:
        # Indexing an empty result would raise IndexError and stop the crawl.
        print('未找到章节标题,已跳过:' + url)
        return
    contents = html.xpath('//*[@id="content"]/text()')
    content = ''.join(piece.strip() for piece in contents)
    save_novel(titles[0], content)

# Save the crawled chapter text to a txt file.
def save_novel(title, content):
    """Write ``content`` (stripped) to ``<title>.txt`` encoded as UTF-8.

    Prints a completion message on success. A failure to open or write the
    file is reported on stdout instead of being raised.
    """
    try:
        with open(title + '.txt', 'w+', encoding='utf-8') as f:
            f.write(content.strip())
    except OSError as e:
        # File operations raise OSError, not urllib's HTTPError.
        print(e)
    else:
        print('下载完成:' + title)

if __name__ == '__main__':
    # Gather the chapter URLs first, then process them one by one.
    fetch_chapter_urls()
    for chapter_url in chapter_url_list:
        parse_chapter(chapter_url)

(2)

import requests #导入requests模块

import csv #导入csv模块

import re #导入re模块

url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2020.html'  # ranking page

headersvalue = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}  # request headers: only the User-Agent is set

# Download the page and decode the body as UTF-8.
r = requests.get(url, headers=headersvalue)
r.encoding = 'utf-8'
html = r.text

# Each capture group extracts one cell of a table row; re.S lets ``.``
# span line breaks inside a row.
pattern = re.compile('<tr.*?alt.*?<td>(.*?)</td>.*?<div.*?>(.*?)</div>.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>.*?>(.*?)</td>', re.S)
items = pattern.findall(html)  # one tuple of cell texts per matched row

# Write the header row and every matched row to school_rank.csv. The
# encoding is explicit so the Chinese text does not depend on the platform
# default; the BOM written by utf-8-sig helps spreadsheet tools detect it.
with open('school_rank.csv', 'w', newline='', encoding='utf-8-sig') as file:
    writer = csv.writer(file)
    writer.writerow(['排名', '学校名称', '省市', '学校类型', '总分', '办学层次得分', '学科水平得分', '办学资源得分', '师资规模与结构得分', '人才培养得分', '科研研究得分', '服务社会得分', '高端人才得分', '重大项目与成果得分', '国际竞争力得分'])
    writer.writerows(items)

第4章  爬取动态加载数据

1.选择题

(1)A (2)B (3)A (4)D (5)B

2.填空题

(1)XHR和JS

(2)find_element_by_xpath()

(3)地址和端口

3.实践题

(1)

from selenium import webdriver

from selenium.webdriver.common.by import By

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.support.wait import WebDriverWait

from selenium.common.exceptions import TimeoutException

from bs4 import BeautifulSoup

import time

import re

# Headless mode is disabled. To enable it, import Options (it is not
# imported by this script) and replace the browser creation below with:
#     options = Options()
#     options.add_argument('--headless')
#     browser = webdriver.Chrome(options=options)
browser = webdriver.Chrome()
browser.maximize_window()
wait = WebDriverWait(browser, 10)  # shared explicit wait, 10 s timeout

def search(keyword):
    """Search JD for ``keyword`` and return the text of the ``.p-skip b`` node.

    The caller converts the returned text to an int and uses it as the
    number of result pages.
    """
    browser.get('https://search.jd.com/')
    keyword_box = wait.until(EC.presence_of_element_located((By.ID, 'keyword')))
    search_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "input_submit")))
    keyword_box.clear()
    keyword_box.send_keys(keyword)
    search_button.click()
    # Scroll to the bottom of the page before reading the page count.
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    page_count_node = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.p-skip b')))
    return page_count_node.text

def change_page(page):
    """Jump the result list to ``page`` through the page-number box."""
    print("正在爬第", page, "页")
    page_text = str(page)
    jump_input = (By.CSS_SELECTOR, '.p-skip input')
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(3)
    box = wait.until(EC.presence_of_element_located(jump_input))
    box.clear()
    box.send_keys(page_text)
    jump_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.p-skip .btn')))
    jump_button.click()
    # Wait until the page-number box shows the requested page.
    wait.until(EC.text_to_be_present_in_element_value(jump_input, page_text))

def get_comment(link):
    """Append every comment of the product at ``link`` to a text file.

    ``link`` is expected to look like
    ``https://item.jd.com/<digits>.html#comment``; the digits become part
    of the output name ``JD_<digits>_comments.txt``. Comment pages are
    read one after another until a wait times out, which ends the crawl
    for this product. A link that does not match is reported and skipped.
    """
    # Escape the dots and match digits with \d; the unescaped "(d+)" only
    # matched literal letters "d", so .group(1) failed on real links.
    match = re.search(r"https://item\.jd\.com/(\d+)\.html#comment", link)
    if match is None:
        print("无法识别的商品链接:", link)
        return
    product_id = match.group(1)
    browser.get(link)
    count = 0
    comment_selector = "#comment .comments-list [data-tab=item] .comment-con"
    # The with-block closes the file even if an unexpected error escapes.
    with open("JD_%s_comments.txt" % product_id, "a", encoding='utf-8') as file:
        while True:
            try:
                # Pause for 3 s before the first page and every tenth page after.
                if count % 10 == 0:
                    time.sleep(3)
                browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, comment_selector)))
                soup = BeautifulSoup(browser.page_source, 'lxml')
                for comment in soup.select(comment_selector):
                    # One comment per line.
                    file.write(comment.text.strip() + "\n")
                count += 1
                next_page = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#comment .ui-page .ui-pager-next")))
                # Click through JavaScript rather than a native click.
                browser.execute_script("arguments[0].click();", next_page)
            except TimeoutException:
                print("已爬取", count, "页评论")
                break

if __name__ == '__main__':
    number = search("口罩")
    link_list = []
    item_link_locator = (By.CSS_SELECTOR, '.gl-item .p-name [target=_blank]')
    # First pass: walk every result page and collect the product links.
    for page in range(1, int(number) + 1):
        change_page(page)
        time.sleep(3)
        wait.until(EC.presence_of_all_elements_located(item_link_locator))
        for anchor in browser.find_elements(*item_link_locator):
            link_list.append(anchor.get_attribute("href") + "#comment")
    # Second pass: crawl comments only after all links are collected.
    # get_comment loads the product page in the same browser, so running it
    # inside the page loop would leave the result list and re-crawl the
    # links gathered on earlier pages.
    for link in link_list:
        get_comment(link)

(2)

import requests #导入requests模块

import json #导入json模块

import pymysql #导入mysql模块

import time #导入time模块

url = 'http://www.bjjqe.com/admin_1/json.php'
datavalue = {
    'act': 'index_boutique_replace',
    'boutique_type': '4'
}  # form fields posted to the JSON endpoint

# NOTE(review): the MySQL account and password are hard-coded below.

# Connect without selecting a database and create product_sql if missing.
db = pymysql.connect(host='localhost', user='root', password='123456', port=3306)
cursor = db.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS product_sql Character Set GBK')
db.close()

# Reconnect with product_sql selected; the connection is closed in the
# finally clause whatever happens while crawling.
db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='product_sql')
try:
    cursor = db.cursor()
    # Create the products table if it does not exist yet.
    sql = 'CREATE TABLE IF NOT EXISTS products (bookName CHAR(100), author CHAR(100), price CHAR(20), publish_company CHAR(50))'
    cursor.execute(sql)

    # Post the form and parse the JSON body of the response.
    return_data = requests.post(url, data=datavalue).text
    data = json.loads(return_data)
    news = data['goods_result']  # list of book records

    insert_sql = 'INSERT INTO products(bookName, author, price, publish_company) VALUES(%s, %s, %s, %s)'
    for n in news:
        bookName = n['title']
        author = n['editor']
        price = n['price']
        publish_company = n['publishing']
        print('书名:',bookName,'作者:',author,'价格:',price,'出版社:',publish_company)
        product = (bookName, author, price, publish_company)
        try:
            # Insert one row; the driver fills the %s placeholders.
            cursor.execute(insert_sql, product)
            db.commit()
            print('插入数据成功')
        except pymysql.MySQLError:
            # Undo the failed insert and continue with the next record.
            db.rollback()
            print('插入数据失败')
        time.sleep(1)
finally:
    db.close()

第5章  反爬虫策略

1.选择题

(1)B (2)B (3)C

2.填空题

(1)通过Headers反爬虫、基于用户行为反爬虫和采用动态网页反爬虫

(2)设置Headers、使用代理IP、降低请求频率、逆向分析请求页面和使用Selenium模拟浏览器

(3)time库

3.实践题

import time #导入time模块

import random #导入random模块

import requests #导入requests模块

from bs4 import BeautifulSoup #从bs4库中导入BeautifulSoup类

# Base URL of the article list; page N (N > 1) is base_url + 'N.html'

base_url='https://www.pythontab.com/html/pythonhexinbiancheng/'

headersvalue = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'

} # request headers: only the User-Agent is set

# Proxy candidates; one is picked at random for every request.
# NOTE(review): every entry only defines an 'http' key while base_url uses
# https - confirm the proxies are actually applied to these requests.

proxiesvalue = [

    {'http': 'http://121.232.148.167:9000'},

    {'http': 'http://39.105.28.28:8118'},

    {'http': 'http://113.195.18.133:9999'}

]

# Collect the article URLs listed on one page.
def get_onepage_url(url):
    """Return the ``href`` of the first link in every ``#catlist li`` node.

    The page is requested through a randomly chosen entry of
    ``proxiesvalue``. A failed request is reported and yields an empty
    list. The function sleeps for a random time (under 3 s) before
    returning, whether or not the request succeeded.
    """
    url_list = []
    try:
        # The timeout keeps an unresponsive proxy from blocking forever.
        r = requests.get(url, headers=headersvalue,
                         proxies=random.choice(proxiesvalue), timeout=10)
    except requests.RequestException:
        print('请求失败')
    else:
        soup = BeautifulSoup(r.text, 'lxml')
        for item in soup.select('#catlist li'):
            anchor = item.select_one('a')
            # Skip list items that carry no usable link.
            if anchor is not None and anchor.get('href'):
                url_list.append(anchor['href'])
    # Random pause between requests.
    sleep_time = random.randint(0, 2) + random.random()
    time.sleep(sleep_time)
    return url_list

# Download one article and save it.
def get_article(url):
    """Fetch the article at ``url`` and pass title and text to ``towrite``.

    The title is read from ``#Article h1`` and the text from
    ``#Article .content``. A failed request, or a page missing either
    node, is reported and skipped. The function sleeps for a random time
    (under 3 s) before returning.
    """
    try:
        # The timeout keeps an unresponsive proxy from blocking forever.
        r = requests.get(url, headers=headersvalue,
                         proxies=random.choice(proxiesvalue), timeout=10)
    except requests.RequestException:
        print('请求失败')
    else:
        soup = BeautifulSoup(r.text, 'lxml')
        heading = soup.select_one('#Article h1')
        body = soup.select_one('#Article .content')
        if heading is None or body is None:
            print('未找到文章内容:' + url)
        else:
            # .string is None when the heading has nested tags; fall back
            # to the concatenated text so towrite always receives a str.
            title = heading.string or heading.get_text()
            towrite(title, body.text)
    # Random pause between requests.
    sleep_time = random.randint(0, 2) + random.random()
    time.sleep(sleep_time)

# Save one article to a text file.
def towrite(title, content):
    """Write ``content`` (stripped) to ``<title>.txt`` encoded as UTF-8.

    Every character of ``title`` listed in ``forbidden`` below is replaced
    by ``#`` before the title is used as a file name. Success or failure
    is reported on stdout; an I/O error is not raised.
    """
    # Characters replaced in the file name. The backslash must be written
    # as an escaped '\\'.
    forbidden = '?*:"<>\\/|'
    title = title.translate(str.maketrans({ch: '#' for ch in forbidden}))
    try:
        with open(title + '.txt', 'w+', encoding='utf-8') as f:
            f.write(content.strip())
    except OSError:
        print('写入文件失败:' + title)
    else:
        print('下载完成:' + title)

if __name__ == '__main__':
    # Pages 1 to 27: page 1 is base_url itself, page N is base_url + 'N.html'.
    for page_no in range(1, 28):
        url = base_url if page_no == 1 else base_url + str(page_no) + '.html'
        try:
            url_list = get_onepage_url(url)
        except Exception:
            # Exception (not a bare except) so Ctrl-C still stops the crawl.
            print('请求失败')
        else:
            for url1 in url_list:
                get_article(url1)

第6章  模拟登录和处理验证码

1.实践题

(1)

from selenium import webdriver #导入webdriver模块

import time #导入sleep模块

#导入ActionChains模块

from selenium.webdriver import ActionChains

from PIL import Image #导入Image模块

import Chaojiying#导入Chaojiying模块

# Start Google Chrome and keep the driver in browser.
browser = webdriver.Chrome()
browser.maximize_window()
browser.get('https://passport.bilibili.com/login')
time.sleep(0.5)

# NOTE(review): the find_element_by_* helpers used below were removed in
# Selenium 4.3; on newer versions use find_element(By.<KIND>, value).
# NOTE(review): account credentials are hard-coded in this script.

# Type the user name and the password. send_keys returns None, so its
# result is not assigned back to the element variables.
user_tag = browser.find_element_by_id('login-username')
user_tag.send_keys('15210985825')
pwd_tag = browser.find_element_by_id('login-passwd')
pwd_tag.send_keys('123456qwerty')
time.sleep(1)
browser.find_element_by_css_selector('[class="btn btn-login"]').click()
time.sleep(1)

# Screenshot of the login page once the click captcha is displayed.
browser.save_screenshot('./main.png')

# Locate the captcha element and read its position and size on the page.
img_tag = browser.find_element_by_class_name('geetest_panel_next')
location = img_tag.location
size = img_tag.size

# Crop box in the order (left, upper, right, lower).
rangle = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))

# Cut the captcha out of the screenshot and save it; the with-block closes
# the screenshot file afterwards.
with Image.open('./main.png') as page_image:
    frame = page_image.crop(rangle)
    frame.save('./code.png')

# Submit the captcha image to Chaojiying and print what it returns.
result = Chaojiying.transform_codeImg('./code.png', 9004, 'lingyiyi', '123456qwerty', '907397')
print('超级鹰返回的坐标位置:', result)

# The result is split on "|" into points and each point on "," into x, y;
# all_list collects them as [x, y] integer pairs.
all_list = []
for point in result.split('|'):
    parts = point.split(',')
    all_list.append([int(parts[0]), int(parts[1])])
print('转换后的坐标位置', all_list)

# Click each returned point, measured as an offset from the captcha element.
for x, y in all_list:
    # perform() runs the queued move-and-click immediately.
    ActionChains(browser).move_to_element_with_offset(img_tag, x, y).click().perform()

(2)

from selenium import webdriver #导入webdriver模块

from selenium.webdriver.common.by import By #导入By模块

#导入WebDriverWait模块

from selenium.webdriver.support.ui import WebDriverWait

#导入ActionChains 模块

from selenium.webdriver import ActionChains

#导入expected_conditions模块

from selenium.webdriver.support import expected_conditions as EC

from PIL import Image #导入Image模块

from PIL import ImageFont#导入ImageFont模块

from PIL import ImageDraw#导入ImageDraw模块

from io import BytesIO #导入BytesIO模块

import Chaojiying#导入Chaojiying模块

import time #导入time模块

# Start Google Chrome and keep the driver in browser

browser = webdriver.Chrome()

border = 30 # error margin subtracted from the slider travel distance in crack()

# Add a text hint to the captcha image.
def add_text_to_image(image):
    """Draw the hint text on ``image`` in red and save it as ``snap.png``.

    The text is drawn at position (2, 120) with the ``simhei.ttf`` font at
    size 32. ``image`` is modified in place.
    """
    hint = '请点击缺口左上角'
    hint_font = ImageFont.truetype('simhei.ttf', 32)
    ImageDraw.Draw(image).text((2, 120), hint, '#FF0000', font=hint_font)
    image.save('snap.png')

# Locate the captcha image on the page.
def get_position():
    """Return ``(top, bottom, left, right)`` of the captcha element.

    Waits for the ``geetest_window`` element, sleeps 2 s, then derives the
    four edges from the element's location and size.
    """
    captcha = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '[class="geetest_window"]')))
    time.sleep(2)
    origin, extent = captcha.location, captcha.size
    top = origin['y']
    left = origin['x']
    bottom = top + extent['height']
    right = left + extent['width']
    return (top, bottom, left, right)

# Capture the captcha image.
def get_image():
    """Return the captcha region cropped out of a screenshot of the page."""
    time.sleep(0.2)
    png_bytes = browser.get_screenshot_as_png()
    top, bottom, left, right = get_position()
    page = Image.open(BytesIO(png_bytes))
    # crop expects the box as (left, upper, right, lower).
    return page.crop((left, top, right, bottom))

# Build a slide track: speed up first, then slow down.
def get_track(distance):
    """Return per-step offsets (rounded ints) for sliding ``distance``.

    Time advances in steps of 0.2. While the travelled distance is below
    4/5 of ``distance`` the acceleration is 2, afterwards it is -3. Steps
    are added until the travelled distance reaches ``distance``; a
    non-positive ``distance`` gives an empty list.
    """
    step_time = 0.2
    slow_down_at = distance * 4 / 5
    steps = []
    travelled = 0
    speed = 0
    while travelled < distance:
        accel = 2 if travelled < slow_down_at else -3
        start_speed = speed
        speed = start_speed + accel * step_time
        # Displacement over one step: v0*t + 1/2*a*t^2.
        step = start_speed * step_time + 1 / 2 * accel * step_time * step_time
        travelled += step
        steps.append(round(step))
    return steps

# Drag the slider along a track.
def move_to_gap(slider, track):
    """Hold ``slider``, move it by each offset in ``track``, then release.

    Every offset is applied horizontally as its own action; the mouse
    button is released 1 s after the last move.
    """
    ActionChains(browser).click_and_hold(slider).perform()
    for dx in track:
        ActionChains(browser).move_by_offset(xoffset=dx, yoffset=0).perform()
    time.sleep(1)
    ActionChains(browser).release().perform()

# Run the slider verification.
def crack():
    """Capture the captcha, get the gap position and drag the slider to it.

    The captured image gets a text hint, is saved as ``snap.png`` and is
    submitted to Chaojiying. The integer before the first comma of the
    result is taken as the gap position; ``border`` is subtracted from it
    to obtain the slide distance.
    """
    image = get_image()
    add_text_to_image(image)
    # NOTE(review): the Chaojiying account data is hard-coded here.
    result = Chaojiying.transform_codeImg('./snap.png', 9101, 'lingyiyi', '123456qwerty', '907397')
    gap = int(result.split(",")[0])
    # ASCII minus sign; the en dash that was here is not a Python operator.
    distance = gap - border
    track = get_track(distance)
    slider = wait.until(EC.element_to_be_clickable(
        (By.CLASS_NAME, 'geetest_slider_button')))
    move_to_gap(slider, track)
    time.sleep(5)

if __name__ == '__main__':
    # Open the BOSS Zhipin login page in the browser window.
    browser.get('https://login.zhipin.com/?ka=header-login')
    time.sleep(1)
    browser.maximize_window()
    time.sleep(1)
    # Module-level wait object used by get_position() and crack().
    wait = WebDriverWait(browser, 10)
    # Click the verification button, then run the verification.
    verify_button = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'geetest_wait')))
    verify_button.click()
    time.sleep(2)
    crack()

第7章  爬取App和PC客户端

1.实践题

(1)

import requests

import json

headersvalue={

    'Cookie': '1&_device=android&2354647e-4be8-3c87-b26c-5e33a91bc1b0&6.6.72;channel=and-d3;impl=com.ximalaya.ting.android;osversion=28;device_model=SEA-AL10;XUM=5P2h0sPL;XIM=312ea7019ebca;c-oper=%E4%B8%AD%E5%9B%BD%E7%A7%BB%E5%8A%A8;net-mode=WIFI;freeFlowType=0;res=1080%2C2259;NSUP=42e8aeb9%2C421fdc10%2C1599117494905;AID=rSUrJvkxLdw=;manufacturer=HUAWEI;XD=YrBEo/u5jJlB11rljobr+vkUi6YnAZSx8hFyh1gmtdKEmRDnqx3cy9eozH5gjlMk/rkaXTaqBETMu4cd7ZneKElTaJEPETRMzaT2HTiN5JBlgRKnhLIX3smzSPWhEgv5P6DXGNGNv2+xdH8m3BpvY5Sj/6m7OeIW/OFFSBD05/E=;umid=aied88addff903ebd606cd7b8156776b8d;xm_grade=0;minorProtectionStatus=0;oaid=feeefcdb-effa-9bba-5e55-85ed7efd65af;newChannelId=yz-huawei;fp=00921715752122q22v64vv55080000;domain=.ximalaya.com;path=/;',

    'Cookie2': '$version=1',

    'Accept': '*/*',

    'user-agent': 'ting_6.6.72(SEA-AL10,Android28)',

    'Host': 'mobwsa.ximalaya.com',

    'Connection': 'Keep-Alive',

    'Accept-Encoding': 'gzip'

}

requests.packages.urllib3.disable_warnings()

# Request pages 1 to 5 of the ranking list and print each entry's title
# and intro.
for i in range(5):
    page_id = str(i + 1)
    url = 'http://mobwsa.ximalaya.com/discovery-ranking-web/v3/category/concreteRankList/ts-1599118668274?device=android&isAnchor=false&pageSize=20&rankingListId=51&version=6.6.72' + '&pageId=' + page_id
    r = requests.get(url, headers=headersvalue, verify=False)
    payload = json.loads(r.text)
    json_data = payload['data']['list']
    for data in json_data:
        print(data['title'], data['intro'])

(2)

import requests

import json

headersvalue={

    'Host': 'mp.weixin.qq.com',

    'Connection': 'keep-alive',

    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.5 WindowsWechat',

    'X-Requested-With': 'XMLHttpRequest',

    'Accept': '*/*',

'Referer':'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzI4NjIxNTc2OA==&uin=MTE1MTAwMjM4Mg%3D%3D&key=281d398fa0af70c9f36243839c648f99351ec37d937e93339c8405b43dbe6dcc5343d63a12abb3975e17d9489436e3bcc2799ff2e02581a73412320cf4cd5cdc2fe44a45b343c2e1d972fa2521a2d0cde91e18f37374a9c705d46e0d0880a937f3989dfe204bb53c573b1cddeb2fdb301a61b47b99fa44ec31d8a42c87f6b5c7&devicetype=Windows+10+x64&version=62090538&lang=zh_CN&a8scene=7&pass_ticket=c0WoudB5pZm9Qn4wsO4orgJHjAEBPQvM6hhNNIdEt0TWozisRjt8SpK%2B8N%2FYMBuC&winzoom=1',

    'Accept-Encoding': 'gzip, deflate',

    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4',

    'Cookie': 'rewardsn=; wxtokenkey=777; wxuin=1151002382; devicetype=android-28; version=27000d39;lang=zh_CN;pass_ticket=c0WoudB5pZm9Qn4wsO4orgJHjAEBPQvM6hhNNIdEt0TWozisRjt8SpK8N/YMBuC;wap_sid2=CI7O66QEEooBeV9IRVJ4aXY0UmVTVlF5OUV4bV9kcERHRDg1NGY5TEJtQk5YZktZT2x0ZlQ0VFB1bUFhV0tXVjUwY1pSUHBhRHpXZkhPSmt4QWMxSVNYNGlQdFJRZW5RMXhrSWZEQWs5dlhSTTlCZjF3NjZLM294eS1SR2xERGxqeUhob2xxYkE3bHJ4UVNBQUF+MKLci/sFOA1AlU4='

}

requests.packages.urllib3.disable_warnings()

# Request URL without the offset parameter. The adjacent string literals
# are joined inside parentheses so they form one valid expression.
base_url = (
    'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzI4NjIxNTc2OA'
    '==&f=json&count=10&is_ok=1&scene=&uin=MTE1MTAwMjM4Mg%3D%3D&key='
    '281d398fa0af70c9f36243839c648f99351ec37d937e93339c8405b43dbe6dc'
    'c5343d63a12abb3975e17d9489436e3bcc2799ff2e02581a73412320cf4cd5c'
    'dc2fe44a45b343c2e1d972fa2521a2d0cde91e18f37374a9c705d46e0d0880a'
    '937f3989dfe204bb53c573b1cddeb2fdb301a61b47b99fa44ec31d8a42c87f6'
    'b5c7&pass_ticket=c0WoudB5pZm9Qn4wsO4orgJHjAEBPQvM6hhNNIdEt0TWoz'
    'isRjt8SpK%2B8N%2FYMBuC&wxtoken=&appmsg_token=1079_D4vhykniky0g3'
    'R6fLrWwBFfxFIpIapUodqy4iQ~~&x5=0&f=json'
)

# For offsets 0 to 98, print the title and link of the first message
# returned at that offset.
for i in range(99):
    url = base_url + '&offset=' + str(i)
    r = requests.get(url, headers=headersvalue, verify=False)
    json_data = json.loads(r.text)
    # general_msg_list is itself a JSON document stored as a string.
    general_msg_list = json.loads(json_data['general_msg_list'])
    messages = general_msg_list['list']
    if not messages:
        # Nothing returned at this offset: stop instead of failing on [0].
        break
    title = messages[0]['app_msg_ext_info']['title']
    print(title)
    content_url = messages[0]['app_msg_ext_info']['content_url']
    print(content_url)

神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试

相关文章:

版权声明:Python教程2022-10-24发表,共计24201字。
新手QQ群:570568346,欢迎进群讨论 Python51学习