爬虫基础及应用

591次阅读

爬虫：获取网页并提取和保存信息的自动化程序

爬虫引发的问题：
1.性能问题：受限于编写水平和目的，爬虫可能为web服务器带来巨大的资源开销，从而对网站运营形成骚扰。
2.法律风险：服务器上的数据有归属权，牟利带来法律风险。
3.隐私风险：可能具备突破简单访问控制的能力获得被保护的数据从而泄露隐私。

网络爬虫的限制：
1.来源审查：判断User-Agent进行限制：检查来访HTTP请求头的User-Agent字段，只响应浏览器和友好的爬虫
2.发布公告：Robots协议（君子协定）：告知所有爬虫本网站的爬取策略，要求爬虫遵守
Robots协议，放在网站根目录下一个叫做robots.txt的文件中
https://www.baidu.com/robots.txt

response对象的相关属性：
status_code：响应状态码
text：响应内容的字符串形式
content：响应内容的二进制形式
encoding：内容编码格式
apparent_encoding：从内容分析出来的编码方式

import requests

# Fetch one page, then show its decoded body and the encoding that
# requests detects from the content itself.
target = 'https://mp.csdn.net/console/article'
response = requests.get(target, timeout=6)
print(response.text)
print(response.apparent_encoding)

xpath简述
xml路径语言
解析xml配置文件，在xml文档中查找信息的语言，同样适用于html文档的搜索
xpath的使用规则

.选取当前节点
.. 选取当前节点的父节点
@选取属性
/从当前节点选取直接子节点
//从当前节点选取子孙节点（不限层级）
//div[@class="name"] # 选取所有class属性为name的div节点

lxml简述：
python的解析库，支持xpath解析方式，支持html和xml解析，效率高，使用lxml中的etree模块中的HTML方法可以对网页进行解析，返回html的element对象。

爬虫爬取邓紫棋贴吧照片

import io import sys import requests import os from lxml import etree # alt+enter 安装requests 或 conda install -n tq=requests

def gethtml(name, page, timeout=10):
    """Fetch one Tieba listing page, echo it, save it and return its HTML.

    The page is printed to stdout and also written to ``<name>.html`` in the
    current directory (UTF-8).

    Args:
        name: Forum keyword, sent as the ``kw`` query parameter.
        page: Value of the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded to ``str``.
    """
    url = "http://tieba.baidu.com/f"
    params = {"ie": "utf-8", "kw": name, "page": page}
    # Browser-like User-Agent (the real value is shown on about:version).
    headers = {"User-Agent": "Mozilla/5.0"}

    # Retries must be configured before the request is sent. The original
    # assigned requests.adapters.DEFAULT_RETRIES after requests.get() had
    # already returned, so it could not influence that request.
    with requests.Session() as session:
        adapter = requests.adapters.HTTPAdapter(max_retries=5)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        # The query could also be formatted directly into the URL instead
        # of being passed through ``params``.
        res = session.get(url, params=params, headers=headers, timeout=timeout)

    # Print using gb18030 as the original did. Reconfigure the existing
    # stream in place: wrapping sys.stdout.buffer in a fresh TextIOWrapper
    # on every call lets the previous wrapper be garbage-collected, which
    # closes the shared buffer and breaks the next print().
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="gb18030")
    print(res.text)

    with open("{}.html".format(name), "w", encoding="utf-8") as f:
        f.write(res.text)
    return res.text

def getvalue(html):
    """Extract image links from a Tieba listing page.

    Args:
        html: Page source as a string.

    Returns:
        The ``bpic`` attribute values of every ``<img>`` found below a
        ``<div>`` whose class is exactly ``t_con cleafix``.
    """
    tree = etree.HTML(html)
    # The class string is kept verbatim from the original selector.
    bpic_links = tree.xpath("//div[@class='t_con cleafix']//img/@bpic")
    return bpic_links

def getimg(urls):
    """Download every image in *urls* into the local ``img`` directory.

    Each file is named after the last ``/``-separated segment of its URL and
    written in binary mode.

    Args:
        urls: Sequence of image URLs.
    """
    if not urls:
        # Nothing to download, so do not create the directory either.
        return
    # Create the target directory once instead of re-checking per image.
    os.makedirs("img", exist_ok=True)
    for link in urls:
        img = requests.get(link)  # image bytes are in img.content
        imgname = link.split('/')[-1]  # last path segment is the file name
        with open(os.path.join("img", imgname), "wb") as f:
            f.write(img.content)

if __name__ == "__main__":
    # Ask for the forum keyword once, then process ten listing pages.
    name = input("输入需要爬取的艺人名称(汉字)")
    for page_index in range(10):
        # From inspecting the URL: the page parameter grows by 50 per page.
        offset = 50 * page_index
        page_html = gethtml(name, offset)
        getimg(getvalue(page_html))
        print("第{}页图片获取完毕".format(page_index + 1))

爬取微博照片

import requests from lxml import etree import os

def getHtml(name):
    """Return the HTML text of the Weibo search page for *name*.

    Args:
        name: Search term formatted into the request URL.
    """
    search_url = "https://s.weibo.com/weibo/{}?topnav=1&wvr=6&b=1".format(name)
    request_headers = {"user-agent": "Mozilla/5.0"}
    response = requests.get(url=search_url, headers=request_headers)
    return response.text

def getvalue(html):
    """Parse a Weibo search page and return its picture URLs.

    Every ``src`` found on an ``<img>`` below a ``<div>`` with class
    ``media media-piclist`` is prefixed with ``http:``, printed, and
    collected.

    Args:
        html: Page source as a string.

    Returns:
        List of the prefixed URLs, in document order.
    """
    tree = etree.HTML(html)
    picture_urls = []
    for src in tree.xpath("//div[@class='media media-piclist']//img/@src"):
        full_url = "http:" + src
        print(full_url)
        picture_urls.append(full_url)
    return picture_urls

def getimg(urls):
    """Download every image in *urls* into the local ``img`` directory.

    Each file is named after the last ``/``-separated segment of its URL and
    written in binary mode.

    Args:
        urls: Sequence of image URLs.
    """
    if not urls:
        # Nothing to download, so do not create the directory either.
        return
    # Create the target directory once instead of re-checking per image.
    os.makedirs("img", exist_ok=True)
    for url in urls:
        img = requests.get(url)
        imgName = url.split("/")[-1]
        # The write belongs inside the loop so that every image is saved,
        # not only the last one fetched.
        with open(os.path.join("img", imgName), "wb") as f:
            f.write(img.content)

if __name__ == '__main__':
    # Prompt for the search term, then fetch -> parse -> download.
    name = input("请输入需要爬取的对象(中文)")
    page_source = getHtml(name)
    getimg(getvalue(page_source))
    print("图片获取完毕")

爬取网易有道翻译的译文

import requests import json

def getfy(val):
    """Translate *val* through the Youdao web endpoint.

    Posts the text as form data with ``doctype=json`` and returns the
    ``tgt`` field of the first entry in ``translateResult``.

    Args:
        val: Text to translate.

    Returns:
        The translated text taken from the JSON reply.
    """
    # Endpoint found by inspecting the request the web page sends.
    endpoint = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
    form = {"i": val, "doctype": "json"}
    agent = {"User-Agent": "Mozilla/5.0"}
    reply = requests.post(endpoint, data=form, headers=agent)
    payload = json.loads(reply.text)
    # Example reply recorded by the author:
    # {'type': 'ZH_CN2EN', 'errorCode': 0, 'elapsedTime': 0,
    #  'translateResult': [[{'src': '你', 'tgt': 'you'}]]}
    first_entry = payload.get("translateResult")[0][0]
    return first_entry.get("tgt")

if __name__ == "__main__":
    # IDE tip from the author: typing "main" + Enter expands this guard.
    word = input("please input the word you want to translate:")
    translated = getfy(word)
    print("the result is:{}".format(translated))

爬取表格中的数据并对其进行分析画图展示

#–author:zhaozhao #date:2020/08/05

# Example page: http://www.tianqihoubao.com/lishi/changping/month/201903.html
import requests
from bs4 import BeautifulSoup
import csv

# Browser-like request headers sent with the page request in get_info().
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ),
}

def get_info(url):
    """Scrape one monthly weather table and append each row to a CSV file.

    The CSV is named after the page (``.../201903.html`` -> ``201903.csv``).
    For every table row after the header, the first cell and the last
    ``/``-separated part of the third cell are saved via ``save_csv``.

    Args:
        url: Address of the monthly history page.
    """
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')
    rows = soup.find('table', class_='b').find_all('tr')

    # The output name depends only on the URL, so compute it once.
    filename = '{}.csv'.format(url.split('/')[-1].split('.')[-2])

    for row in rows[1:]:  # row 0 is the table header
        td = row.find_all('td')
        day = td[0].get_text().strip()  # date cell
        # Per the author, the part after the last "/" is the lowest
        # temperature.
        low = td[2].get_text().split('/')[-1].strip()
        save_csv(filename, {'time': day, 'temp': low})

# Persist the scraped values.
def save_csv(filename, info):
    """Append *info* as one row to the CSV file *filename*.

    Args:
        filename: Path of the CSV file; opened in append mode as UTF-8.
        info: Mapping with the keys ``time`` and ``temp``.
    """
    columns = ['time', 'temp']
    # NOTE(review): the file is opened without newline='', which per the csv
    # docs yields an extra blank line after each row on Windows. The plotting
    # script's every-other-line read appears to rely on that -- confirm
    # before changing.
    with open(filename, 'a', encoding='utf-8') as out:
        csv.DictWriter(out, fieldnames=columns).writerow(info)

if __name__ == '__main__':
    city = input("输入城市(拼音)")
    templates = (
        'http://www.tianqihoubao.com/lishi/{}/month/201903.html',  # March 2019
        'http://www.tianqihoubao.com/lishi/{}/month/201803.html',  # March 2018
    )
    # Scrape the 2019 page first, then the 2018 page.
    for template in templates:
        get_info(template.format(city))

画图分析

import matplotlib.pyplot as plt  # plotting module


def _read_series(path):
    """Return ``(days, temps)`` read from one scraped CSV file."""
    days, temps = [], []
    with open(path, 'r', encoding='utf-8') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                # Skip blank separator lines explicitly. The original kept
                # only even-numbered lines, which silently drops half the
                # rows when the file has no blank lines between them.
                continue
            fields = line.split(',')
            # Keep only the day number between the month and day markers.
            days.append(fields[0].split('月')[-1].split('日')[0])
            temps.append(fields[-1].strip())
    return days, temps


def picture():
    """Plot the March 2018 and March 2019 temperature series together.

    Reads ``201803.csv`` and ``201903.csv`` from the current directory and
    shows both series in one figure.
    """
    series = {}
    for filename in ['201803.csv', '201903.csv']:
        print(filename)
        series[filename] = _read_series(filename)

    x2, y2 = series['201803.csv']
    x1, y1 = series['201903.csv']
    plt.plot(x2, y2, label='2018', color='r')
    plt.plot(x1, y1, label='2019', color='b')
    plt.xlabel('time')
    plt.ylabel('temperature')
    plt.title('variation')
    plt.legend()
    plt.show()  # without this call the figure is not displayed


picture()

神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试

发表于：Python爬虫

2022-10-24

# Python爬虫

复制链接

赏

爬虫基础及应用

相关文章：

HTTP代理设置详解：一步步配置指南

什么是Socks5代理IP及其优势

Socks5代理配置教程及注意事项

什么是代理服务器IP：如何选择合适的

国外代理服务器的优势及选择建议

如何找到可靠的免费代理服务器

在线代理服务器的使用与推荐

HTTP代理服务器的设置及应用实例

静态代理IP怎么填写：步骤与示例

海外静态IP的代理选择与配置