# 复制粘贴,拿来直接跑就行,url之类的都不用改。(python学习群 q: 467604262 )
import http.client
import os
import re
import urllib.parse
import urllib.request
# Request headers. Referer is required (the endpoint answers 403 without it);
# User-Agent makes the request look like it comes from a regular browser.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    "referer": "https://image.baidu.com",
}

# Search endpoint template: {word} is the URL-quoted keyword, {pageNum} the
# offset of the first result on the requested page.
url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word={word}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&cg=girl&pn={pageNum}&rn=30&gsm=1e00000000001e&1490169411926="

PAGE_SIZE = 30       # results per request (matches rn=30 in the template)
MAX_RESULTS = 3000   # stop once this many result slots have been requested
# Offsets start at 0 so that the first page of results is not skipped.
PAGE_OFFSETS = range(0, MAX_RESULTS, PAGE_SIZE)

SAVE_DIR = r"C:\Users\87419\Desktop\Pa"   # directory the images are saved to
URL_LOG = "testpic.txt"                   # every extracted URL is appended here

# Captures the value of a "thumbURL" field up to its first ".jpg". Excluding
# quotes keeps the match inside that one field, so a non-jpg thumbnail can no
# longer swallow the text up to a later ".jpg" in a different field.
THUMB_PATTERN = re.compile(r'thumbURL":"([^"]+?\.jpg)')

# Failures treated as "skip this page/image and carry on": network and file
# errors (OSError, which covers URLError/HTTPError), truncated or malformed
# HTTP responses, and bad URLs / undecodable bodies (ValueError).
NETWORK_ERRORS = (OSError, http.client.HTTPException, ValueError)


def quote_keyword(keyword):
    """Return *keyword* percent-encoded as UTF-8 for use in the query string.

    Nothing is left unescaped (``safe=''``), so a ``/`` in the keyword is
    encoded as well.
    """
    return urllib.parse.quote(keyword, safe='', encoding='utf-8')


def build_page_url(keyword, offset):
    """Return the search URL for an already-quoted *keyword* at result *offset*."""
    return url.format(word=keyword, pageNum=offset)


def fetch_page(page_url):
    """Download *page_url* with the browser-like headers and return its text.

    Raises one of NETWORK_ERRORS when the request fails or the body is not
    valid UTF-8.
    """
    request = urllib.request.Request(page_url, headers=header)
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


def extract_thumb_urls(html):
    """Return every ``.jpg`` thumbnail URL found in *html*, in document order."""
    return THUMB_PATTERN.findall(html)


def download_image(image_url, index):
    """Save *image_url* as ``pic<index>.jpg`` in SAVE_DIR.

    Returns True on success and False (after reporting the failure) when the
    download fails, so one bad image does not abort the whole run.
    """
    target = os.path.join(SAVE_DIR, "pic{num}.jpg".format(num=index))
    try:
        urllib.request.urlretrieve(image_url, target)
    except NETWORK_ERRORS as exc:
        print("图片下载失败:" + image_url, exc)
        return False
    return True


def main():
    """Prompt for a keyword, crawl the result pages and save each thumbnail."""
    keyword = quote_keyword(input("请输入搜索关键字:"))
    os.makedirs(SAVE_DIR, exist_ok=True)

    saved = 0
    with open(URL_LOG, "a", encoding="utf-8") as log:
        for offset in PAGE_OFFSETS:
            try:
                html = fetch_page(build_page_url(keyword, offset))
            except NETWORK_ERRORS as exc:
                # Best effort: report the failing page and move on.
                print("出错了!", exc)
                print("出错页数:" + str(offset))
                continue

            for image_url in extract_thumb_urls(html):
                print(image_url)
                log.write(image_url + "\n")
                # Only successful downloads advance the counter, keeping the
                # file names contiguous and the final total accurate.
                if download_image(image_url, saved):
                    saved += 1

    print("总共爬取图片数为:" + str(saved))


if __name__ == "__main__":
    main()
# 神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试