python 爬虫案例（数据2000+）

744次阅读

python

# Fetch and parse a single results page only.
import requests
import re
import pandas
from bs4 import BeautifulSoup

# Request headers sent along with the page request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3732.400 QQBrowser/10.5.3819.400',
    'Host': 'www.caibow.com',
}

url = 'https://www.caibow.com/dlt/kj/p1/'
res = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(res.text, 'html.parser')

# Print the issue number of every draw entry on the page.
for entry in soup.select('.pd_10_20.bb_das '):
    for title in entry.select('.fl.fz_16'):
        # The first run of digits in the title text is the issue number.
        found = re.search(r"\d+", title.text)
        if found:
            print(found.group())
# NOTE(review): the original indentation was lost; the separator is assumed
# to be printed once after the loop -- confirm.
print('—————————————-')

结果：
python

# Print the draw date (YYYY-M-D) of every draw entry on the page.
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
date_re = re.compile(r'\d{4}-\d{1,2}-\d{1,2}')
for spans in soup.select('.pd_10_20.bb_das '):
    for dateNum2 in spans.select('.fr.fz_14'):
        # The pattern is matched against the tag's markup; elements without
        # a date are skipped.
        dates = date_re.findall(str(dateNum2))
        if dates:
            print(dates[0])
# NOTE(review): the original indentation was lost; the separator is assumed
# to be printed once after the loop -- confirm.
print('—————————————-')

python

# Print the rollover prize pool of every draw entry on the page.
# The amount is not written in one uniform notation (decimal amounts vs.
# comma-grouped amounts), so several patterns are tried in priority order.
# Raw strings: "\d" in a normal string literal is an invalid escape sequence.
mn_re = re.compile(r'\d+,\d+,\d+,\d+\w+')   # four comma-separated groups
mnY_re = re.compile(r'\d+\.\d+\w+')         # decimal amount
mn_re2 = re.compile(r'\d+,\d+,\d+\w+')      # three comma-separated groups
for spans in soup.select('.pd_10_20.bb_das '):
    for money in spans.select('.fr.fz_14.lh_30'):
        markup = str(money)
        for pattern in (mn_re, mnY_re, mn_re2):
            found = pattern.search(markup)
            if found:
                print(found.group())
                break
        else:
            # None of the known notations matched.
            print('null')
# NOTE(review): the original indentation was lost; the separator is assumed
# to be printed once after the loop -- confirm.
print('—————————————-')

python

# Collect the red/blue ball numbers of every draw entry into a list of
# dicts, which is convenient for building a table afterwards.
d = []
ball_class = 'fl all_ball {color}_ball color_white mr10'
number_re = re.compile(r'\d+')
for spans in soup.select('.pd_10_20.bb_das '):
    soup_list_red = spans.find_all('span', class_=ball_class.format(color="red"))
    soup_list_blue = spans.find_all('span', class_=ball_class.format(color="blue"))
    # Take the digits from each span's text rather than from str(result set):
    # the markup contains the class name "mr10", whose digits would otherwise
    # be captured as an extra ball number.
    items = [n for ball in soup_list_red for n in number_re.findall(ball.text)]
    items_blue = [n for ball in soup_list_blue for n in number_re.findall(ball.text)]
    print(items)
    print(items_blue)
    red_ball = ','.join(items)
    blue_ball = ','.join(items_blue)
    print(red_ball)
    print(blue_ball)
    info = {'红球': red_ball, '蓝球': blue_ball}
    d.append(info)
# NOTE(review): the original indentation was lost; the separator is assumed
# to be printed once after the loop -- confirm.
print('—————————————-')

python

#coding:utf-8 ''' Created on 2019年12月21日 @author: liu yan '''

import requests
import re
import pandas
from bs4 import BeautifulSoup

# Request headers sent along with every page request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3732.400 QQBrowser/10.5.3819.400',
    'Host': 'www.caibow.com',
}

def getsoup(url):
    """Download *url* and return the response body parsed as HTML.

    The request is sent with the module-level ``headers`` and a 10 second
    timeout; the text is parsed with the built-in ``html.parser``.
    """
    response = requests.get(url, headers=headers, timeout=10)
    return BeautifulSoup(response.text, 'html.parser')

# Patterns are compiled once at import time instead of on every call, and are
# raw strings: "\d" in a normal string literal is an invalid escape sequence.
_NUMBER_RE = re.compile(r'\d+')
_DATE_RE = re.compile(r'\d{4}-\d{1,2}-\d{1,2}')
# Prize-pool notations, tried in priority order.
_POOL_RES = (
    re.compile(r'\d+,\d+,\d+,\d+\w+'),  # four comma-separated groups
    re.compile(r'\d+\.\d+\w+'),         # decimal amount
    re.compile(r'\d+,\d+,\d+\w+'),      # three comma-separated groups
)
_BALL_CLASS = 'fl all_ball {color}_ball color_white mr10'


def _ball_numbers(spans, color):
    """Return the comma-joined numbers of the *color* ball spans in *spans*."""
    balls = spans.find_all('span', class_=_BALL_CLASS.format(color=color))
    # Read digits from each span's text, not from the markup: the markup
    # contains the class name "mr10", whose digits would otherwise be
    # captured as an extra ball number.
    return ','.join(n for ball in balls for n in _NUMBER_RE.findall(ball.text))


def get_one_data(spans):
    """Extract the fields of one draw entry into a dict.

    Args:
        spans: The element of a single draw entry, as yielded by
            ``soup.select('.pd_10_20.bb_das ')``. It must provide
            ``select()`` and ``find_all()``.

    Returns:
        A dict with the keys '红球号码' and '蓝球号码' (comma-joined ball
        numbers, possibly empty), plus '大乐透期数', '日期' and
        '奖池滚存(元)' when the corresponding elements are present. The
        prize pool is the string 'null' when its element matches none of
        the known notations.
    """
    info = {}

    # Issue number: first run of digits in the title text.
    for title in spans.select('.fl.fz_16'):
        num = _NUMBER_RE.findall(title.text)
        if num:
            info['大乐透期数'] = num[0]

    # Draw date: matched against the tag's markup; elements without a date
    # are skipped.
    for tag in spans.select('.fr.fz_14'):
        found = _DATE_RE.search(str(tag))
        if found:
            info['日期'] = found.group()

    # Rollover prize pool: first notation that matches wins.
    for money in spans.select('.fr.fz_14.lh_30'):
        markup = str(money)
        hits = (pattern.search(markup) for pattern in _POOL_RES)
        info['奖池滚存(元)'] = next((hit.group() for hit in hits if hit), 'null')

    # Ball numbers.
    info['红球号码'] = _ball_numbers(spans, 'red')
    info['蓝球号码'] = _ball_numbers(spans, 'blue')

    return info

# The list dlt collects every scraped draw; each element is a dict.
dlt = []
# Result pages 1 through 129.
for i in range(1, 130):
    url = 'https://www.caibow.com/dlt/kj/p{num}/'.format(num=i)
    soup = getsoup(url)
    dlt.extend(get_one_data(spans) for spans in soup.select('.pd_10_20.bb_das '))

# Show one sample record; guarded so a short result list does not raise
# IndexError.
if len(dlt) > 2:
    print(dlt[2])
print(len(dlt))

order = ['大乐透期数', '日期', '奖池滚存(元)', '红球号码', '蓝球号码']
# Passing columns= fixes the column order and, unlike df[order], does not
# raise KeyError when a column is missing from every record or when nothing
# was scraped.
df = pandas.DataFrame(dlt, columns=order)
df.to_excel('Dlt.xlsx')

python
python

神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试

发表于：Python爬虫

2022-10-28

# Python爬虫

复制链接

赏

python 爬虫案例（数据2000+）

python爬取“大乐透”往期数据

网页分析

构造请求头，获取soup

获取期数标题

取日期

取奖池滚存

取红蓝球号

完整代码

相关文章：

HTTP代理设置详解：一步步配置指南

什么是Socks5代理IP及其优势

Socks5代理配置教程及注意事项

什么是代理服务器IP：如何选择合适的

国外代理服务器的优势及选择建议

如何找到可靠的免费代理服务器

在线代理服务器的使用与推荐

HTTP代理服务器的设置及应用实例

静态代理IP怎么填写：步骤与示例

海外静态IP的代理选择与配置

如何找到可靠的免费代理服务器

海外静态IP的代理选择与配置

HTTP代理服务器的设置及应用实例

国外代理服务器的优势及选择建议

动态与静态代理IP的区别解析

什么是代理服务器IP：如何选择合适的

HTTP代理设置详解：一步步配置指南

静态代理IP怎么填写：步骤与示例

Socks5代理配置教程及注意事项

什么是Socks5代理IP及其优势