python 爬虫urllib基础示例

542次阅读
没有评论
python

环境使用 Python 3.5.2、urllib3-1.22(注意:下文示例代码使用的是 Python 标准库 urllib.request / urllib.parse / urllib.error,并不依赖 urllib3)

下载安装

# Download and unpack the Python 3.5.2 source tree.
wget https://www.python.org/ftp/python/3.5.2/Python-3.5.2.tgz

tar -zxf Python-3.5.2.tgz

cd Python-3.5.2/

# The option needs two ASCII hyphens; the en-dash form "–prefix" is rejected by configure.
./configure --prefix=/usr/local/python

make && make install

# Keep the original interpreter under a new name, then point /usr/bin/python at Python 3.
# NOTE(review): system tools that expect the original interpreter at /usr/bin/python
# may stop working after this swap - confirm before running on a shared machine.
mv /usr/bin/python /usr/bin/python275

ln -s /usr/local/python/bin/python3 /usr/bin/python

# Download, unpack and install urllib3 1.22 with the newly linked interpreter.
wget https://files.pythonhosted.org/packages/ee/11/7c59620aceedcc1ef65e156cc5ce5a24ef87be4107c2b74458464e437a5d/urllib3-1.22.tar.gz

tar zxf urllib3-1.22.tar.gz

cd urllib3-1.22/

python setup.py install

浏览器模拟示例

# Add headers, approach 1: build_opener() with opener.addheaders
import urllib.request

url = "http://www.baidu.com"
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
opener = urllib.request.build_opener()
# addheaders takes a list of (name, value) tuples.
opener.addheaders = [headers]
# Context managers close the response and the file even if read/write fails.
with opener.open(url) as response:
    data = response.read()
with open("/home/urllib/test/1.html", "wb") as fl:
    fl.write(data)

# Add headers, approach 2: Request.add_header()
# (import repeated so this snippet can be run on its own)
import urllib.request

url = "http://www.baidu.com"
req = urllib.request.Request(url)
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
with urllib.request.urlopen(req) as response:
    data = response.read()
with open("/home/urllib/test/2.html", "wb") as fl:
    fl.write(data)

增加超时设置

# Timeout example: request the page 99 times with a 1-second timeout and
# print either the body length or the exception that occurred.
import urllib.request

for _ in range(1, 100):
    try:
        # Close each response explicitly; otherwise 99 connections are left
        # for the garbage collector to clean up.
        with urllib.request.urlopen("http://www.baidu.com", timeout=1) as response:
            data = response.read()
        print(len(data))
    except Exception as e:
        # Broad catch is deliberate here: the demo reports any failure and carries on.
        print("出现异常—->" + str(e))

HTTP协议GET请求一

# GET request: pass the search keyword in the query string.
import urllib.request

keywd = "hello"
url = "http://www.baidu.com/s?wd=" + keywd
req = urllib.request.Request(url)
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
# Fixed: the original had a stray unary minus ("data=-urllib...") which raises
# TypeError because bytes cannot be negated.
with urllib.request.urlopen(req) as response:
    data = response.read()
with open("/home/urllib/test/3.html", "wb") as fl:
    fl.write(data)

HTTP协议GET请求二

# GET request with a non-ASCII keyword: percent-encode it before building the URL.
import urllib.parse
import urllib.request

keywd = "中国"
url = "http://www.baidu.com/s?wd="
# quote() is documented in urllib.parse; use it from there rather than relying
# on the name happening to be importable from urllib.request.
key_code = urllib.parse.quote(keywd)
url_all = url + key_code
req = urllib.request.Request(url_all)
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
# Fixed: the original had a stray unary minus ("data=-urllib...") which raises
# TypeError because bytes cannot be negated.
with urllib.request.urlopen(req) as response:
    data = response.read()
with open("/home/urllib/test/4.html", "wb") as fl:
    fl.write(data)

HTTP协议POST请求

# POST request: url-encode the form fields and send them as the request body.
import urllib.parse
import urllib.request

url = "http://www.baidu.com/mypost/"
# Request data must be bytes, hence the explicit encode after urlencode().
postdata = urllib.parse.urlencode({
    "user": "testname",
    "passwd": "123456",
}).encode('utf-8')
# Supplying data makes urllib send the request as POST.
req = urllib.request.Request(url, postdata)
# Fixed: the original called "red.add_header", a NameError typo for "req".
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
with urllib.request.urlopen(req) as response:
    data = response.read()
with open("/home/urllib/test/5.html", "wb") as fl:
    fl.write(data)

使用代理服务器

import urllib.request


def use_proxy(proxy_addr, url):
    """Fetch *url* through an HTTP proxy and return the body as text.

    Args:
        proxy_addr: Proxy address in "host:port" form, used for http:// URLs.
        url: The URL to fetch.

    Returns:
        The response body decoded as UTF-8 (a ``str``).

    Note:
        The opener is installed globally via ``install_opener``, so later
        ``urllib.request.urlopen`` calls in this process also use the proxy.
    """
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8')
    return data


proxy_addr = "201.25.210.23:7623"
url = "http://www.baidu.com"
data = use_proxy(proxy_addr, url)
# Fixed: use_proxy() returns str, so the file must be opened in text mode;
# the original opened it with "wb" and write() raised TypeError.
with open("/home/urllib/test/6.html", "w", encoding="utf-8") as fl:
    fl.write(data)

开启DebugLog

# Enable DebugLog: handlers created with debuglevel=1 print the HTTP(S)
# request/response traffic while the page is fetched.
import urllib.request

url = "http://www.baidu.com"
httpd = urllib.request.HTTPHandler(debuglevel=1)
httpsd = urllib.request.HTTPSHandler(debuglevel=1)
opener = urllib.request.build_opener(httpd, httpsd)
urllib.request.install_opener(opener)
# Fixed: the original wrote the response object itself to the file (missing
# .read()), which raises TypeError because write() needs bytes.
with urllib.request.urlopen(url) as response:
    data = response.read()
with open("/home/urllib/test/7.html", "wb") as fl:
    fl.write(data)

URLError异常处理

# URLError handling: print the failure reason when the request cannot be completed.
import urllib.error
import urllib.request

target = "http://blog.csdn.net"
try:
    urllib.request.urlopen(target)
except urllib.error.URLError as err:
    print(err.reason)

# HTTPError handling: print the HTTP status code and the reason.
import urllib.error
import urllib.request

target = "http://blog.csdn.net"
try:
    urllib.request.urlopen(target)
except urllib.error.HTTPError as err:
    print(err.code)
    print(err.reason)

# Combined handling: HTTPError is a subclass of URLError, so it must be
# caught first; the URLError clause then covers the remaining failures.
import urllib.error
import urllib.request

target = "http://blog.csdn.net"
try:
    urllib.request.urlopen(target)
except urllib.error.HTTPError as http_err:
    print(http_err.code)
    print(http_err.reason)
except urllib.error.URLError as url_err:
    print(url_err.reason)

# Recommended approach: catch URLError once and print whichever of
# "code" / "reason" the exception actually carries.
import urllib.error
import urllib.request

target = "http://blog.csdn.net"
try:
    urllib.request.urlopen(target)
except urllib.error.URLError as err:
    for attr in ("code", "reason"):
        if hasattr(err, attr):
            print(getattr(err, attr))

cookie使用

# Cookie example: log in with a POST, keep the cookies in a CookieJar, then
# request a second page through the same cookie-aware opener.
import http.cookiejar
import urllib.parse
import urllib.request

url = "http://bbs.chinaunix.net/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=LvfR9"
postdata = urllib.parse.urlencode({
    "formhash": "11154664",
    "loginsubmit": "true",
    "username": "superli",
    "password": "123456789",
    "referer": "http://bbs.chinaunix.net/",
    "return_type": "",
}).encode('utf-8')
req = urllib.request.Request(url, postdata)
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0")

# HTTPCookieProcessor stores cookies from responses in cjar and sends them
# back on later requests made through this opener.
cjar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
# Installing the opener makes plain urllib.request.urlopen() use it as well.
urllib.request.install_opener(opener)

with opener.open(req) as response:
    data = response.read()
with open("/home/urllib/test/11.html", "wb") as fl:
    fl.write(data)

# NOTE(review): url2 is identical to the login URL; presumably another page
# that requires the login cookies was intended - confirm.
url2 = "http://bbs.chinaunix.net/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=LvfR9"
with urllib.request.urlopen(url2) as response:
    data2 = response.read()
# Fixed: the original wrote "data" (the first response) into 12.html, so the
# second response was fetched but never saved.
with open("/home/urllib/test/12.html", "wb") as fl:
    fl.write(data2)

同时增加headers,代理,cookie,timeout,debuglog

# Combine headers, proxy, cookies, timeout and DebugLog in a single opener.
import http.cookiejar
import urllib.request

# The original snippet used "url" without defining it; define it here so the
# example runs on its own.
url = "http://www.baidu.com"
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
httpd = urllib.request.HTTPHandler(debuglevel=1)
httpsd = urllib.request.HTTPSHandler(debuglevel=1)
cjar = http.cookiejar.CookieJar()
proxy = urllib.request.ProxyHandler({'http': "127.0.0.1:8888"})
# Fixed: the original also passed the bare urllib.request.HTTPHandler class.
# That registers an extra default (debuglevel=0) HTTP handler ahead of httpd,
# which then serves http:// requests without any debug output.
opener = urllib.request.build_opener(
    proxy,
    urllib.request.HTTPCookieProcessor(cjar),
    httpd,
    httpsd,
)
opener.addheaders = [headers]
urllib.request.install_opener(opener)
with urllib.request.urlopen(url, timeout=2) as response:
    data = response.read().decode('utf-8')

示例仅供参考

神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试

相关文章:

版权声明:Python教程2022-11-01发表,共计5772字。
新手QQ群:570568346,欢迎进群讨论 Python51学习