案例1:爬取搜狗指定词条对应的搜索结果(简易网页采集器)
import requests

# Simple page collector: fetch the Sogou result page for a user-supplied
# keyword and save the returned HTML to "<keyword>.html".
url = "https://www.sogou.com/web"

# Query-string parameters carried by the URL.
kw = input("输入要搜索的关键字")
param = {
    'query': kw,
}

# requests encodes `params` into the query string when sending the request.
response = requests.get(url=url, params=param)
page_text = response.text

file_name = kw + '.html'
with open(file_name, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print(file_name + "保存成功!!!")
反爬策略:UA伪装(User-Agent 伪装)
# UA spoofing: send a browser-like User-Agent header with the request.
# This is a fragment: `requests`, `url` and `param` come from the Case 1 script.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
}
response = requests.get(url=url, params=param, headers=headers)
案例2:破解百度翻译
import requests
import json

# 1. Target URL.
post_url = "https://fanyi.baidu.com/langdetect"

# 2. UA spoofing.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
}

# 3. POST form data (handled the same way as GET parameters).
#    Only 'query' is sent; the 'from'/'to' keys were commented out in the
#    original notes.
data = {
    'query': 'dog',
}

# 4. Send the request.
response = requests.post(url=post_url, data=data, headers=headers)

# 5. Read the response. .json() returns a Python object and should only be
#    used when the response body is known to be JSON.
dic_obj = response.json()
print(dic_obj)

# Persist the result; the context manager closes the file even if dump fails.
with open('./dog.json', 'w', encoding='utf-8') as fp:
    json.dump(dic_obj, fp=fp, ensure_ascii=False)
print("over")
案例3:爬取豆瓣电影分类排行榜 https://movie.douban.com/中的电影详情数据
# -*- coding: utf-8 -*-
# @Time : 2022/2/19 17:33
# @File : requests实战之豆瓣电影.py
# @software : PyCharm
import requests
import json

# Fetch one page of the Douban movie chart JSON and save it to ./douban.json.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
}
url = "https://movie.douban.com/j/chart/top_list"
param = {
    'type': '24',
    'interval_id': '100:90',
    'action': '',
    'start': '40',
    'limit': '20',
}

response = requests.get(url=url, params=param, headers=headers)
list_data = response.json()

# The context manager closes the file even if json.dump raises.
with open('./douban.json', 'w', encoding='utf-8') as fp:
    json.dump(list_data, fp=fp, ensure_ascii=False)
print("over")
案例4:爬取肯德基餐厅查询 http://www.kfc.com.cn/kfccda/index.aspx 中指定地点的餐厅数据
# -*- coding: utf-8 -*-
# @Time : 2022/2/19 18:01
# @File : requests实战之肯德基.py
# @software : PyCharm
import requests
import json

# Query the KFC store-list endpoint for a user-supplied keyword and save the
# raw response text to ./kfc.text.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
}
url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"
kw = input("请输入要查询的城市名称")
param = {
    'cname': '',
    'pid': '',
    'keyword': kw,
    'pageIndex': '1',
    'pageSize': '10',
}

# NOTE(review): this endpoint may expect a POST with form data
# (requests.post(url, data=param, ...)) rather than a GET — confirm against
# the browser's network trace before relying on the output.
response = requests.get(url=url, params=param, headers=headers)
page_text = response.text

with open('./kfc.text', 'w', encoding='utf-8') as fp:
    fp.write(page_text)

print("over")
案例5:爬取国家药品监督管理总局中基于中华人民共和国化妆品生产许可证相关数据
http://scxk.nmpa.gov.cn:81/xk/
# -*- coding: utf-8 -*-
# @Time : 2022/2/19 19:08
# @File : requests实战之药监总局相关数据爬取.py
# @software : PyCharm
import requests
import json

# Collect the ids of many companies from the paged list endpoint, then fetch
# the detail record for each id and save all records to ./allData.json.
if __name__ == "__main__":
    id_list = []        # company ids gathered from the list pages
    all_data_list = []  # detail record of every company
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
    }
    url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList"

    # Build the form data for list pages 1 through 5 and harvest the ids.
    for page in range(1, 6):
        data = {
            'on': 'true',
            'page': str(page),
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }
        json_ids = requests.post(url=url, headers=headers, data=data).json()
        id_list.extend(dic['ID'] for dic in json_ids['list'])

    # Every detail request hits the same URL; only the posted id differs,
    # so the URL is defined once outside the loop.
    url_xq = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById"
    for company_id in id_list:
        data_xq = {
            "id": company_id,
        }
        json_xq = requests.post(url=url_xq, headers=headers, data=data_xq).json()
        print(json_xq)
        all_data_list.append(json_xq)

    # Persist everything; the context manager guarantees the file is flushed
    # and closed (the original never closed this handle).
    with open('./allData.json', 'w', encoding='utf-8') as fp:
        json.dump(all_data_list, fp=fp, ensure_ascii=False)
    print("ov")
神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试