Python 实现简单的爬虫

893次阅读
没有评论
Python

Python 是一种跨平台的计算机程序设计语言,是一门面向对象的动态类型语言。Python 是自由软件,其参考实现 CPython 的源代码和解释器以 PSF 许可协议(与 GPL 兼容)发布。随着版本的不断更新和语言新功能的添加,Python 越来越多地被用于独立的、大型项目的开发。

快速抓取网页: 使用urllib最基本的抓取功能,将百度首页的内容保存到本地目录下.

import urllib.request

# Fetch the Baidu home page. The body is read exactly once and reused:
# an HTTP response is a stream, so a second read() would return b"" and
# the saved file would be empty.
with urllib.request.urlopen("https://www.baidu.com") as res:
    body = res.read()
print(body.decode("utf-8"))
# Save the raw bytes to a local file.
with open("./test.html", "wb") as f:
    f.write(body)
实现POST请求: 上述的例子是通过请求百度的get请求获得百度,下面使用urllib的post请求.

import urllib.parse
import urllib.request

# Form-encode the payload as bytes; supplying `data` makes urlopen send a POST.
data = urllib.parse.urlencode({"hello": "lyshark"}).encode("utf-8")
print(data)
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen("http://www.baidu.com/post", data=data) as response:
    print(response.read())
设置TIMEOUT时间: 我们需要给请求设置一个超时时间,而不是让程序一直在等待结果.

import urllib.request
import socket
import urllib.error

# A 1-second timeout: a slow server raises instead of blocking forever, so
# the failure is caught and reported rather than crashing the script.
try:
    with urllib.request.urlopen("http://www.baidu.com", timeout=1) as response:
        print(response.read())
except (urllib.error.URLError, socket.timeout) as err:
    print("request failed or timed out:", err)
获取网站状态: 我们可以通过status、getheaders(),getheader(“server”),获取状态码以及头部信息.

import urllib.request

# Inspect the response object: its type, status code and headers.
# Each value is printed explicitly; bare expressions only echo in a REPL.
with urllib.request.urlopen("https://www.python.org") as res:
    print(type(res))                # <class 'http.client.HTTPResponse'>
    print(res.status)               # numeric HTTP status code
    print(res.getheaders())         # list of (name, value) header pairs
    print(res.getheader("server"))  # value of a single header
伪装访问网站: 给请求添加头部信息,从而定制自己请求网站时的头部信息,防止被和谐.

from urllib import request,parse
url = 'http://www.baidu.com'
# Send a browser-like User-Agent. No explicit Host header is set: urllib
# derives it from the URL, and a Host that does not match the URL would be
# routed to the wrong virtual host.
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
}
# Named `form` rather than `dict` so the builtin is not shadowed.
form = {
    'name': 'LyShark'
}
data = parse.urlencode(form).encode('utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
with request.urlopen(req) as response:
    print(response.read().decode('utf-8'))
简单的URL页面拼接:

import re
def Get_Url(target, start, ends):
    """Build a list of page URLs by appending a numeric suffix to ``target``.

    Args:
        target: Base URL; ``"/<i>"`` is appended to it.
        start: First page number (inclusive).
        ends: Last page number (exclusive), as in ``range(start, ends)``.

    Returns:
        A list of ``target + "/" + str(i)`` strings, empty when
        ``start >= ends``.
    """
    return [target + "/" + str(i) for i in range(start, ends)]


if __name__ == "__main__":
    url = Get_Url("https://jq.qq.com/?_wv=1027&k=sqgP9S9Y", 1, 10)
    print(url)
request库的使用:

import re
import requests
# Browser-like User-Agent sent with the request.
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

if __name__ == "__main__":
    ret = requests.get(url="https://jq.qq.com/?_wv=1027&k=sqgP9S9Y", headers=head, timeout=1)
    # Non-greedy capture of every <img src="..."> value; re.S lets "." span newlines.
    all_pic_link = re.findall(r'<img src="(.*?)"', ret.text, re.S)
    print(all_pic_link)
简单实现爬取图片:

import re
import urllib.request
def open_url(url):
    """Fetch ``url`` with a browser-like User-Agent and return the body decoded as UTF-8."""
    req = urllib.request.Request(url)
    req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36')
    # The context manager closes the connection after the body is read.
    with urllib.request.urlopen(req) as page:
        return page.read().decode("utf-8")


def get_img(html):
    """Download every ``<img src="....jpg">`` found in ``html`` to the current directory.

    Each file is saved under the last path component of its URL. Returns None.
    """
    # The dot is escaped so that only a literal ".jpg" suffix matches; an
    # unescaped "." would also accept sources such as ".../fakejpg".
    for each in re.findall(r'<img src="([^"]+\.jpg)"', html):
        filename = each.split("/")[-1]
        print("完整路径:", each)
        print("文件名称:", filename)
        urllib.request.urlretrieve(each, filename, None)


if __name__ == '__main__':
    url = open_url("https://jq.qq.com/?_wv=1027&k=sqgP9S9Y")
    get_img(url)
爬每日CVE漏洞列表:

import re
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the request.
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


def Get_CVE(url):
    """Return the ``href`` value of every ``<a>`` tag on the page at ``url``.

    Anchors without an ``href`` attribute are skipped.
    """
    ret = requests.get(url=url, headers=head, timeout=3)
    bs = BeautifulSoup(ret.text, 'html.parser')
    hrefs = (a.get('href') for a in bs.find_all('a'))
    return [href for href in hrefs if href]


def Get_Number(list):
    """Extract ``<digits>-<rest>`` identifiers from links and prefix them with ``CVE-``.

    Args:
        list: Iterable of link strings (the parameter name is kept for
            compatibility with existing callers).

    Returns:
        A list such as ``["CVE-2019-19781"]``. Entries with no match
        contribute nothing.
    """
    new = []
    for item in list:
        # findall returns a list; format each match individually so the
        # result is "CVE-2019-19781" rather than "CVE-['2019-19781']".
        for match in re.findall(r"[0-9]{1,}-.*", str(item)):
            new.append("CVE-{}".format(match))
    return new


if __name__ == "__main__":
    url = "https://cassandra.cerias.purdue.edu/CVE_changes/today.html"
    cve = Get_CVE(url)
    number = Get_Number(cve)
    for i in number:
        print("今日份的漏洞:", i)
简单爬取ipipgo代理地址: 此处我们就用简单的正则匹配爬取,该方法比较笨拙.

import re
import requests
# Browser-like User-Agent sent with the request.
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://jq.qq.com/?_wv=1027&k=sqgP9S9Y", headers=head, timeout=3)
# NOTE(review): the HTML tags inside the original pattern and replace() calls
# were stripped when this article was extracted; "<td>...</td>" is a
# reconstruction — confirm against the markup of the page being scraped.
cells = re.findall(r'<td>(.*?)</td>', ret.text)
# Cells are consumed five at a time (IP, port, type, lifetime, timestamp),
# for at most 20 rows, and never past the end of the cells actually found.
for offset in range(0, min(len(cells), 100) - 4, 5):
    IP, Port, Type, times, year = cells[offset:offset + 5]
    print("IP地址:{} 端口号:{} 类型:{} 生存周期:{} 时间:{}".format(IP, Port, Type, times, year))
BeautifulSoup 定位技巧: 使用bs库需要安装,三个依赖包 pip install requests bs4 lxml

from bs4 import BeautifulSoup
import requests
# Browser-like User-Agent sent with the request.
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://jq.qq.com/?_wv=1027&k=sqgP9S9Y", headers=head, timeout=3)
ret.encoding = "utf-8"  # change this if the decoded text comes out garbled
bs = BeautifulSoup(ret.text, "lxml")

# Every lookup below takes element [0]; it raises IndexError when the
# selector matches nothing on the fetched page.

# All <link> tags under <head>; print the href of the first one.
print(bs.select('head link')[0]['href'])

# All <a> tags with class c_b_p_desc_readmore; print the first href.
print(bs.find_all('a', class_='c_b_p_desc_readmore')[0]['href'])

# <a> tags with id blog_nav_admin and class menu; print the first href.
print(bs.find_all('a', id='blog_nav_admin', class_='menu')[0]['href'])
print(bs.find_all('a', class_='menu')[0].attrs['href'])

# <link> tags inside the div whose id is page_begin_html.
print(bs.select('div[id="page_begin_html"] link')[0]['href'])
print(bs.select('ul[id="navList"] .menu')[0]['href'])

# Direct-child chain: body > div#page_begin_html > link, first match.
print(bs.select('body > div[id="page_begin_html"] > link')[0])

# Text content of a tag, and lookup by exact attribute value.
print(bs.select('title')[0].get_text())
print(bs.select('a[href="https://www.cnblogs.com/LyShark/archive/2019/12/04.html"]'))

# Descendant chains: div -> nested div -> a / span.
print(bs.select('div[id="header"] div[id="blogTitle"] a[id="lnkBloglogo"]'))
print(bs.select('body div[id="header"] div[class="blogStats"] span[id="stats_post_count"]'))
stripped_strings方法的简单应用: 提取出house-name标签下面的所有字符串

from bs4 import BeautifulSoup
import requests
import html5lib
# Browser-like User-Agent sent with the request.
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://gz.centanet.com/ershoufang/", headers=head, timeout=3)
text = ret.content.decode('utf-8')
bs = BeautifulSoup(text, "html5lib")
ret = bs.select('div[class="section"] div[class="house-item clearfix"] p[class="house-name"]')
for i in ret:
    # i.get_text() would return all the text together with its original whitespace;
    # stripped_strings yields each non-empty string with whitespace trimmed.
    house = list(i.stripped_strings)
    print(house)
实现爬取中国天气网:

from bs4 import BeautifulSoup
import requests
import html5lib
# Browser-like User-Agent sent with the request.
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
# NOTE(review): the fetch and parse statements were fused into one invalid
# line during extraction; they are reconstructed here following the other
# snippets in this article (timeout=3, html5lib parser) — confirm.
ret = requests.get(url="http://www.weather.com.cn/textFC/shandong.shtml", headers=head, timeout=3)
text = ret.content.decode('utf-8')
bs = BeautifulSoup(text, "html5lib")
# Keep the selected conMidtab block (index 1) and search for rows inside it.
# The original discarded this result and searched the whole document instead.
div = bs.find_all('div', class_='conMidtab')[1]
tr = div.find_all('tr')[2:]  # rows from the third <tr> onwards
for i in tr:
    td = i.find_all('td')
    city_td = td[0]  # first cell of the row
    # stripped_strings yields all descendant non-tag strings, skipping empty ones
    city = list(city_td.stripped_strings)[0]
    temp = td[-5]    # temperature cell, counted from the end of the row
    temperature = list(temp.stripped_strings)[0]
    print('城市:{} 温度:{}'.format(city, temperature))
使用bs4库爬取ipipgo代理: 使用库的方式爬取,啪啪啪,三下五除二搞定.

import re
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the request.
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://www.shenlongip.com/wt/", headers=head, timeout=3)
bs = BeautifulSoup(ret.text, "lxml")
ret = bs.select('table[id="ip_list"] tr[class="odd"]')
# One list of cell strings per table row.
ip = [list(row.stripped_strings) for row in ret]
# Open the log once instead of once per row, so the handle is closed reliably.
with open("save.log", 'a+', encoding='utf-8') as log:
    # At most 50 rows, but never more than were actually scraped.
    for fields in ip[:50]:
        if len(fields) < 2:
            continue  # row has no IP/port pair
        address = "http://{}:{}".format(fields[0], fields[1])
        print(address, file=log)
        print("代理地址(已保存) {}".format(address))
Request使用代理IP地址

import re
from time import sleep
import requests
# Browser-like User-Agent sent with the request.
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
proxy = {"http": "http://127.0.0.1:9999"}
# Without credentials: "http": "http://ip:port"
# With credentials:    "https": "https://username:password@ip:port"

with open("save.log", "r", encoding="utf-8") as file:
    for i in file:
        data = i.strip()  # drop the trailing newline and surrounding whitespace
        if not data:
            continue
        # requests selects the proxy by the scheme of the target URL; the URL
        # below is https, so the "https" key must be set for the proxy to be used.
        proxy.update(http=data, https=data)
        try:
            ret = requests.get(url="https://www.cnblogs.com/LyShark/", headers=head, timeout=3, proxies=proxy)
        except requests.RequestException:
            # A dead proxy raises (timeout / connection error); count it as offline.
            online = False
        else:
            online = ret.status_code == 200
        if online:
            print("代理:{} 访问完成".format(proxy["http"]))
        else:
            print("代理:{} 不在线,失败".format(proxy["http"]))
        sleep(1)
Request代理下载文件

import requests
# Browser-like User-Agent sent with the request.
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
# The download URL is https, and requests picks the proxy by URL scheme, so
# the proxy is registered for both schemes.
proxy = {
    "http": "http://117.69.200.46:9999",
    "https": "http://117.69.200.46:9999",
}
url = "https://nmap.org/dist/nmap-7.80-win32.zip"
# stream=True downloads the body in chunks instead of loading it into memory;
# both the response and the output file are closed when the block exits.
with requests.get(url=url, headers=head, stream=True, proxies=proxy) as ret, open("nmap.zip", "wb") as fp:
    ret.raise_for_status()  # do not save an HTTP error page as the zip file
    for chunk in ret.iter_content(chunk_size=4096):
        if chunk:
            print("本次保存长度:{} ".format(len(chunk)))
            fp.write(chunk)

简单爬取子域名

对于新手小白想更轻松的学好Python基础,Python爬虫,web开发、大数据,数据分析,人工智能等技术,这里给大家分享系统教学资源,架下我尉(同英): 2763177065 【教程/工具/方法/解疑】

import requests
import json
def GetSubDomain(domain):
    """Query the Baidu related-sites endpoint for ``domain`` and log its sub-domains.

    Prints the number of results, writes one sub-domain per line to
    ``domain.log`` (overwriting it) and prints each result item.
    """
    url = "http://ce.baidu.com/index/getRelatedSites?site_address={}".format(domain)
    ret = requests.get(url=url)
    obj = json.loads(ret.text)
    # A missing or null "data" field is treated as "no results".
    sites = obj.get("data") or []
    print("子域名个数:{}".format(len(sites)))
    with open("domain.log", "w") as fp:
        for item in sites:
            fp.write(item.get("domain"))
            fp.write("\n")  # real newline, so each sub-domain is on its own line
            print(item)


GetSubDomain("qq.com")

from bs4 import BeautifulSoup
import requests,os
# Mobile-browser User-Agent sent with every request.
header = {"User-Agent": "Mozilla/5.0 (iPhone; U; cpu like Mac OS X) AppleWebKit/420.1 (KHTML,like Gecko) Version/3.0 Mobile/4A93 Safari/419.3"}


def get_url(name, start_page, end_page):
    """Collect post titles and links from a cnblogs user's listing pages.

    Args:
        name: Blog (user) name used in the listing URL.
        start_page: First listing page (inclusive).
        end_page: Last listing page (inclusive).

    Returns:
        ``(title, value)``: two parallel lists of post titles and post URLs.
        At most 10 posts are taken from each listing page.
    """
    title = []
    value = []
    for page in range(start_page, end_page + 1):
        url = "https://www.cnblogs.com/{}/default.html?page={}".format(name, page)
        response = requests.get(url, headers=header, timeout=5)
        text = response.content.decode("utf-8")
        bs = BeautifulSoup(text, "lxml")
        ret = bs.select('div[class="day"] div[class="postTitle"] a')
        # Slice rather than index 0..9 so a short page cannot raise IndexError.
        for link in ret[:10]:
            x = link.get_text().replace("\n", "")
            y = link.get('href').replace("\n", "")
            title.append(x)
            value.append(y)
            print("[+] 文章路径: —> 地址: {} —> 标题: {}".format(y, x))
    return title, value
def down_page(page_name, url):
    """Save the page at ``url`` and its in-post images under a ``page_name`` directory.

    The HTML is written to ``<page_name>/<page_name>.html``; each image is
    saved under the last path component of its ``src``.
    """
    params = {"enc": "utf-8"}
    response = requests.get(url=url, params=params, headers=header)
    # Decode the raw bytes with the encoding detected from the content.
    # (Re-encoding response.text fails when the server declares no charset.)
    content = response.content.decode(response.apparent_encoding or "utf-8", errors="replace")
    # Create the directory without a shell: the name comes from scraped page
    # titles and must not be interpolated into a shell command.
    os.makedirs(page_name, exist_ok=True)
    # Save the page itself.
    with open(os.path.join(page_name, page_name + ".html"), 'w', encoding='utf-8') as f:
        f.write(content)
    # Save the images referenced in the post body.
    bs = BeautifulSoup(content, "lxml")
    for img_tag in bs.select('div[id="cnblogs_post_body"] div[class="left-9-code"] img'):
        src = img_tag.get("src")
        if not src:
            continue  # <img> without a src attribute
        src_name = src.split("/")[-1]
        print("[+] —> 正在准备下载图片: {} —> 地址: {}".format(src_name, src))
        img = requests.get(url=src, stream=True)
        with open(os.path.join(page_name, src_name), 'wb') as f:
            for chunk in img.iter_content(chunk_size=1024):
                f.write(chunk)
if __name__ == '__main__':
    # NOTE(review): get_url() requires (name, start_page, end_page); the call
    # as published passed only two arguments and would raise TypeError.
    # Pages 1..2 is the presumed intent — confirm.
    title, value = get_url("lyshark", 1, 2)
    for page_title, page_url in zip(title, value):
        print(page_title)
        # Spaces are removed so the title can be used as a directory name.
        down_page(page_title.replace(" ", ""), page_url)
Selenium 自动化测试库的使用:

Selenium Test

新闻 我的博客 GitHub

用户:

密码:

hello lyshark p1

hello lyshark p2

通过简单的浏览文件并实现简单的定位.

驱动下载地址: http://chromedriver.storage.googleapis.com/index.html

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 removed the find_element_by_* helpers and the executable_path
# argument; elements are located with find_element(By.<strategy>, value) and
# the driver path is passed through a Service object.
WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
driver = webdriver.Chrome(service=Service(WebPath))
driver.set_window_size(1024, 768)

# Commonly used page properties.
driver.get("http://lyshark.com")
print("当前URL: {}".format(driver.current_url))
print("当前标题: {}".format(driver.title))
print("网页代码: {}".format(driver.page_source))

# Basic find_element lookups.
print(driver.find_element(By.ID, "user"))           # locate by id
print(driver.find_element(By.NAME, "p1").text)      # locate by name attribute
print(driver.find_element(By.CLASS_NAME, "s_ipt"))  # locate by class name

# XPath lookups; a few common forms.
print(driver.find_element(By.XPATH, "//form[@class='fms']//input[@id='user']"))
print(driver.find_element(By.XPATH, "//p[@name='p1']"))
print(driver.find_element(By.XPATH, "//html/body/form/p/input"))
print(driver.find_elements(By.CSS_SELECTOR, ".fms #user"))  # returns a list

# Locate <a> tags by their link text (exact, then partial).
print(driver.find_element(By.LINK_TEXT, "新闻"))
print(driver.find_element(By.PARTIAL_LINK_TEXT, "我"))
简单实现多个标签之间互相切换

# -*- coding:utf-8 -*-

from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 API: find_element(By.<strategy>, value) and a Service object
# replace the removed find_element_by_* helpers and executable_path argument.
WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
driver = webdriver.Chrome(service=Service(WebPath))
driver.set_window_size(1024, 768)
driver.get("https://www.baidu.com")
driver.find_element(By.ID, "kw").send_keys("lyshark")  # type the keyword into the input with id "kw"
driver.find_element(By.ID, "su").click()               # click the search button (id "su")
time.sleep(1)

# XPath: inside the div with id "1", click the <a> whose text contains "-".
driver.find_element(By.XPATH, "//div[@id='1']//a[contains(text(),'-')]").click()
time.sleep(1)
handle = driver.current_window_handle  # handle of the current window
handle_all = driver.window_handles     # handles of all open windows
print(handle_all)
driver.switch_to.window(handle_all[0])  # switch back to the first window
time.sleep(1)
driver.find_element(By.ID, "kw").clear()  # then clear the search box
通过xpath定位标签并自动输入内容,发送登录请求到后端,写法如下.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 API: find_element(By.<strategy>, value) and a Service object
# replace the removed find_element_by_* helpers and executable_path argument.
WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
driver = webdriver.Chrome(service=Service(WebPath))
driver.set_window_size(1024, 768)
driver.get("http://lyshark.com")

# Locate the user-name input via XPath and type the user name.
driver.find_element(By.XPATH, "//form[@class='fms']/p//input[@id='user']").send_keys("lyshark")

# Locate the password input once, clear its default value, then type the password.
password_input = driver.find_element(By.XPATH, "//form[@class='fms']/p//input[@id='pass']")
password_input.clear()
password_input.send_keys("123123")

# Submit the form; click() on the submit button or submit() both work.
driver.find_element(By.XPATH, "//form[@class='fms']/input[@type='submit']").click()
通过类库实现模拟键盘鼠标操作记录.

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 API: find_element(By.<strategy>, value) and a Service object
# replace the removed find_element_by_* helpers and executable_path argument.
WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
driver = webdriver.Chrome(service=Service(WebPath))
driver.set_window_size(1024, 768)
driver.get("https://www.baidu.com")

# ------------------------------------------------------------------------
# ActionChains provides the common mouse operations:
#   perform():         execute all queued actions
#   context_click():   right click
#   double_click():    double click
#   drag_and_drop():   drag and drop
#   move_to_element(): hover over an element

# Locate the element to hover over.
above = driver.find_element(By.LINK_TEXT, "更多产品")

# Hover over the located element.
ActionChains(driver).move_to_element(above).perform()

# ------------------------------------------------------------------------
# webdriver.common.keys.Keys provides keyboard events; common ones:
#   send_keys(Keys.BACK_SPACE)    Backspace
#   send_keys(Keys.SPACE)         Space
#   send_keys(Keys.TAB)           Tab
#   send_keys(Keys.ESCAPE)        Esc
#   send_keys(Keys.ENTER)         Enter
#   send_keys(Keys.CONTROL, 'a')  select all (Ctrl+A)
#   send_keys(Keys.CONTROL, 'c')  copy (Ctrl+C)
#   send_keys(Keys.CONTROL, 'x')  cut (Ctrl+X)
#   send_keys(Keys.CONTROL, 'v')  paste (Ctrl+V)
#   send_keys(Keys.F1)            F1

# Type into the search box.
driver.find_element(By.ID, "kw").send_keys("seleniumm")

# Delete the extra trailing "m".
driver.find_element(By.ID, "kw").send_keys(Keys.BACK_SPACE)

# Type a space followed by more text.
driver.find_element(By.ID, "kw").send_keys(Keys.SPACE)
driver.find_element(By.ID, "kw").send_keys("从入门到入土")

# Ctrl+A: select the whole input.
driver.find_element(By.ID, "kw").send_keys(Keys.CONTROL, 'a')

# Ctrl+X: cut the input.
driver.find_element(By.ID, "kw").send_keys(Keys.CONTROL, 'x')

# Ctrl+V: paste it back.
driver.find_element(By.ID, "kw").send_keys(Keys.CONTROL, 'v')

# Press Enter on the search button instead of clicking it.
driver.find_element(By.ID, "su").send_keys(Keys.ENTER)
通过selenium模块配合自动按键即可实现简单的博客园自动爬行工具,用于备份非常不错.

from selenium import webdriver
from bs4 import BeautifulSoup
import requests,os,time,lxml
import win32api,win32con
header = {“User-Agent”:“Mozilla/5.0 (iPhone; U; cpu like Mac OS X) AppleWebKit/420.1 (KHTML,end_page):
value = []
for x in range(start_page,10):
y = ret[item].get(‘href’).replace(”n","")
value.append(y)
print("[+] 爬行地址: {} “.format(y))
return value
if name == “main”:
value = get_url(“csnd”,2)
WebPath = “C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe”
driver = webdriver.Chrome(executable_path=WebPath)
driver.set_window_size(1024,768)
for item in range(0,len(value)):
print(”[-] —> 开始保存:{}".format(value[item]))
driver.get(value[item])
# 按下ctrl+s
win32api.keybd_event(0x11,0)
win32api.keybd_event(0x53,win32con.KEYEVENTF_KEYUP,0)
win32api.keybd_event(0x11,0)
# 按下回车
time.sleep(1)
win32api.keybd_event(0x0D,0)
win32api.keybd_event(0x0D,0)
HTMLParser 的使用:

from html.parser import HTMLParser


# MyParser subclasses the standard-library HTMLParser.
class MyParser(HTMLParser):
    """Collect the text that follows each ``<h3 class="tb-main-title">`` start tag.

    Attributes:
        re: List of collected, whitespace-stripped text fragments.
        flg: 1 while the next text node should be captured, otherwise 0.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance state: a class-level list would be shared by every
        # parser instance and accumulate results across them.
        self.re = []
        self.flg = 0

    def handle_starttag(self, tag, attrs):
        """Arm the capture flag when the target tag and class attribute are seen."""
        if tag == 'h3':  # target tag
            for attr in attrs:
                # attrs is a list of (name, value) pairs
                if attr[0] == 'class' and attr[1] == 'tb-main-title':
                    self.flg = 1
                    break

    def handle_data(self, data):
        """Store the text node that follows a matched start tag, then disarm."""
        if self.flg == 1:
            self.re.append(data.strip())
            self.flg = 0  # reset so only the first text node after the tag is kept

my = MyParser()
# HTMLParser's method is the lowercase feed(). `html` is expected to hold the
# page source to parse; it is not defined within this snippet.
my.feed(html)

总结

以上是python教程入门学习为你收集整理的Python 实现简单的爬虫全部内容,希望文章能够帮你解决Python 实现简单的爬虫所遇到的程序开发问题。

神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试

相关文章:

版权声明:Python教程2022-10-28发表,共计15226字。
新手QQ群:570568346,欢迎进群讨论 Python51学习