Python 爬取“大乐透”往期数据
网页分析
构造请求头,获取soup
# Fetch and parse a single results page only (page 1).
import requests
import re
import pandas
from bs4 import BeautifulSoup

# Request headers sent with the page request: a desktop-browser user agent
# plus an explicit Host header for the target site.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3732.400 QQBrowser/10.5.3819.400'
    ),
    'Host': 'www.caibow.com',
}

url = 'https://www.caibow.com/dlt/kj/p1/'
# Give up after 10 seconds instead of waiting indefinitely.
res = requests.get(url, headers=headers, timeout=10)
# Parse the returned HTML text into a searchable tree.
soup = BeautifulSoup(res.text, 'html.parser')
获取期数标题
# Print the issue number of every draw block found on the parsed page.
# NOTE(review): the source was flattened onto one line, so the nesting of the
# separator print is reconstructed (one separator per draw block) - confirm.
for draw_block in soup.select('.pd_10_20.bb_das '):
    for title_tag in draw_block.select('.fl.fz_16'):
        # The first run of digits in the title text is the issue number.
        found = re.search(r"\d+", title_tag.text)
        if found is not None:
            print(found.group())
    print('—————————————-')
结果:
取日期
# Print the draw date (YYYY-M-D) of every draw block found on the parsed page.
# A raw string is used so that "\d" reaches the regex engine as written
# instead of being treated as an (invalid) string escape sequence.
date_re = re.compile(r'\d{4}-\d{1,2}-\d{1,2}')

# NOTE(review): the source was flattened onto one line, so the nesting of the
# separator print is reconstructed (one separator per draw block) - confirm.
for spans in soup.select('.pd_10_20.bb_das '):
    for dateNum2 in spans.select('.fr.fz_14'):
        # Search the tag's markup; only the first date found is reported.
        dates = date_re.search(str(dateNum2))
        if dates:
            print(dates.group())
    print('—————————————-')
取奖池滚存
# Print the rolled-over jackpot of every draw block found on the parsed page.
# The amount is not published in one uniform unit/format, so several patterns
# are tried in a fixed order and the first one that matches wins.
mn_re = re.compile(r'\d+,\d+,\d+,\d+\w+')   # four comma-separated groups + suffix
mnY_re = re.compile(r'\d+\.\d+\w+')         # decimal number + suffix
mn_re2 = re.compile(r'\d+,\d+,\d+\w+')      # three comma-separated groups + suffix

# NOTE(review): the source was flattened onto one line, so the nesting of the
# separator print is reconstructed (one separator per draw block) - confirm.
for spans in soup.select('.pd_10_20.bb_das '):
    for money in spans.select('.fr.fz_14.lh_30'):
        markup = str(money)
        for pattern in (mn_re, mnY_re, mn_re2):
            hit = pattern.search(markup)
            if hit:
                print(hit.group())
                break
        else:
            # No known amount format was found in this element.
            print('null')
    print('—————————————-')
取红蓝球号
# Collect the red and blue ball numbers of every draw block as a list of
# dicts, which is convenient for building a table later.
info = {}
d = []

# Raw strings keep "\d" intact for the regex engine.
pattern_red = re.compile(
    r'<span class="fl all_ball red_ball color_white mr10">(\d+)</span>')
pattern_blue = re.compile(
    r'<span class="fl all_ball blue_ball color_white mr10">(\d+)</span>')

for spans in soup.select('.pd_10_20.bb_das '):
    soup_list_red = spans.find_all(
        'span', class_='fl all_ball red_ball color_white mr10')
    soup_list_blue = spans.find_all(
        'span', class_='fl all_ball blue_ball color_white mr10')

    # The ball numbers are read back out of the stringified tag lists.
    items = pattern_red.findall(str(soup_list_red))
    items_blue = pattern_blue.findall(str(soup_list_blue))
    print(items)
    print(items_blue)

    red_ball = ','.join(items)
    blue_ball = ','.join(items_blue)
    print(red_ball)
    print(blue_ball)

    # A fresh dict per draw so that entries in d do not alias each other.
    info = {'红球': red_ball, '蓝球': blue_ball}
    d.append(info)
    print('—————————————-')
完整代码
#coding:utf-8
'''
Created on 2019-12-21

@author: liu yan
'''
import requests
import re
import pandas
from bs4 import BeautifulSoup

# Request headers sent with every page request: a desktop-browser user agent
# plus an explicit Host header for the target site.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3732.400 QQBrowser/10.5.3819.400'
    ),
    'Host': 'www.caibow.com',
}
def getsoup(url):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``headers`` and a 10 second
    timeout.
    """
    response = requests.get(url, headers=headers, timeout=10)
    return BeautifulSoup(response.text, 'html.parser')
# Patterns are compiled once at import time rather than on every call, and
# written as raw strings so "\d" / "\w" are not treated as string escapes.
_ISSUE_RE = re.compile(r"\d+")
_DATE_RE = re.compile(r'\d{4}-\d{1,2}-\d{1,2}')
# The jackpot is not published in one uniform format; the patterns are tried
# in this order and the first one that matches wins.
_JACKPOT_RES = (
    re.compile(r'\d+,\d+,\d+,\d+\w+'),  # four comma-separated groups + suffix
    re.compile(r'\d+\.\d+\w+'),         # decimal number + suffix
    re.compile(r'\d+,\d+,\d+\w+'),      # three comma-separated groups + suffix
)
_RED_BALL_RE = re.compile(
    r'<span class="fl all_ball red_ball color_white mr10">(\d+)</span>')
_BLUE_BALL_RE = re.compile(
    r'<span class="fl all_ball blue_ball color_white mr10">(\d+)</span>')


def _first_jackpot(markup):
    """Return the first jackpot amount found in *markup*, or ``'null'``."""
    for pattern in _JACKPOT_RES:
        hit = pattern.search(markup)
        if hit:
            return hit.group()
    return 'null'


def get_one_data(spans):
    """Extract the fields of one lottery draw from its container element.

    Args:
        spans: The container element of a single draw. It must provide
            ``select(css)`` and ``find_all(name, class_=...)`` the way a
            BeautifulSoup tag does.

    Returns:
        dict: Always contains '红球号码' and '蓝球号码' (comma-joined ball
        numbers, empty string when none are found). '大乐透期数' and '日期'
        are present only when a matching value is found. '奖池滚存(元)' is
        present when a jackpot element exists, and is ``'null'`` if none of
        the known amount formats matches.
    """
    info = {}

    # Issue number: first run of digits in the title text. When several
    # title elements match, the last one with digits wins.
    for title_tag in spans.select('.fl.fz_16'):
        digits = _ISSUE_RE.findall(title_tag.text)
        if digits:
            info['大乐透期数'] = digits[0]

    # Draw date, searched in the element's markup.
    for date_tag in spans.select('.fr.fz_14'):
        found = _DATE_RE.search(str(date_tag))
        if found:
            info['日期'] = found.group()

    # Rolled-over jackpot.
    for money_tag in spans.select('.fr.fz_14.lh_30'):
        info['奖池滚存(元)'] = _first_jackpot(str(money_tag))

    # Ball numbers are read back out of the stringified tag lists.
    red_tags = spans.find_all(
        'span', class_='fl all_ball red_ball color_white mr10')
    blue_tags = spans.find_all(
        'span', class_='fl all_ball blue_ball color_white mr10')
    info['红球号码'] = ','.join(_RED_BALL_RE.findall(str(red_tags)))
    info['蓝球号码'] = ','.join(_BLUE_BALL_RE.findall(str(blue_tags)))

    return info
# Scrape every results page and export all draws to an Excel workbook.
# dlt accumulates one dict per draw, as returned by get_one_data().
dlt = []
for i in range(1, 130):
    url = 'https://www.caibow.com/dlt/kj/p{num}/'.format(num=i)
    soup = getsoup(url)
    for spans in soup.select('.pd_10_20.bb_das '):
        dlt.append(get_one_data(spans))

# Show a sample record, but only when it exists: indexing unconditionally
# would raise IndexError if fewer than three draws were scraped.
if len(dlt) > 2:
    print(dlt[2])
print(len(dlt))

order = ['大乐透期数', '日期', '奖池滚存(元)', '红球号码', '蓝球号码']
# reindex() puts the columns in the desired order and leaves a column empty
# when no record supplied it, where plain df[order] would raise KeyError.
df = pandas.DataFrame(dlt).reindex(columns=order)
df.to_excel('Dlt.xlsx')
神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试