python 爬取数据

770次阅读
没有评论
python

目标地址:xxxx

技术选型:python

软件包管理工具:pipenv

编辑器:jupyter

分析目标地址:

gplId 表示项目 ID,是目标地址中的可变参数

结果收集方式:

数据库

代码实现

导入相关模块

from urllib.parse import urlencode

from bs4 import  BeautifulSoup

import pandas as pd

import requests

import os,sys

# 网页提取函数

def get_one_page(i):
    """Fetch the target page and return its HTML text.

    Args:
        i: Page index supplied by the caller. It is not used when building
            the request; the query parameters below are fixed placeholders.

    Returns:
        The response body decoded as UTF-8 when the server answers with
        HTTP 200; ``None`` for any other status code or when the request
        itself fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    paras = {
        'a': 'x',
        'b': 'x',
        'c': 'x',
        'd': 'x',
        'e': 'x'
    }
    url = 'xx?' + urlencode(paras)

    try:
        # A timeout keeps one stalled connection from blocking the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Qualified through the `requests` module: a bare `RequestException`
        # is not imported in this file and would raise NameError here.
        print('爬取失败')
        return None

    if response.status_code != 200:
        return None

    response.encoding = 'utf-8'
    return response.text

# 使用 BeautifulSoup 解析页面,然后提取表格

def parse_one_page(html):
    """Extract the first ``.ttable`` table of a page as a DataFrame.

    Args:
        html: HTML text of the page, or ``None``/empty when the fetch failed.

    Returns:
        A DataFrame with the columns renamed and the ``option``,
        ``image_count`` and ``image`` columns removed. An empty DataFrame is
        returned when ``html`` is empty or contains no ``.ttable`` element,
        so callers can test ``df.empty`` instead of handling an exception.

    Raises:
        KeyError: If a column scheduled for removal is absent after renaming.
    """
    from io import StringIO

    if not html:
        return pd.DataFrame()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    matches = soup.select('.ttable')
    if not matches:
        return pd.DataFrame()

    # read_html expects a file-like object; passing literal HTML as a plain
    # string is deprecated in recent pandas releases.
    tbl = pd.read_html(StringIO(matches[0].prettify()), header=1)[0]

    # NOTE(review): every 'xx' key is a redacted placeholder for the real
    # column header on the page. As written the duplicate keys collapse into
    # a single entry; substitute the real headers before running.
    tbl.rename(columns={'序号': 'serial_number',
                        'xx': 'option',
                        'xx': 'fanwei',
                        'xx': 'company_name',
                        'xx': 'shigong_date',
                        'xx': 'order_no',
                        'xx': 'miaomu_name',
                        'xx': 'type',
                        'xx': 'spec',
                        'xx': 'count',
                        'xx': 'dead_count',
                        'xx': 'zhongzhi_midu',
                        'xx': 'mianji',
                        'xx': 'unit',
                        'xx': 'danjia',
                        'xx': 'xiaoji',
                        'xx': 'zhongzhididian',
                        'xx': 'chuhuonongchang',
                        'xx': 'remark',
                        'xx': 'image_count',
                        'xx': 'image'
                        }, inplace=True)

    # These columns are not stored in the database table.
    return tbl.drop(columns=['option', 'image_count', 'image'])

# 创建表结构

import pymysql

# 创建表结构

def generate_mysql():
    """Create the ``miaomu`` table in the ``miao_mu_data`` database if absent.

    Connects with the credentials hard-coded below, executes a single
    ``CREATE TABLE IF NOT EXISTS`` statement and closes the connection.
    """
    # The fragments are wrapped in parentheses so that adjacent literals are
    # concatenated into one statement; without them only the first fragment
    # would be assigned to `sql`.
    sql = (
        'CREATE TABLE IF NOT EXISTS miaomu ('
        'serial_number INT(20) NOT NULL AUTO_INCREMENT,'
        'fanwei varchar(50) ,'
        'company_name VARCHAR(50) ,'
        'shigong_date varchar(50),'
        'order_no varchar(50),'
        'miaomu_name varchar(50),'
        'type varchar(50),'
        'spec varchar(50),'
        'count varchar(50),'
        'dead_count varchar(50),'
        'zhongzhi_midu varchar(50),'
        'mianji varchar(50),'
        'unit varchar(50),'
        'danjia varchar(50),'
        'xiaoji varchar(50),'
        'zhongzhididian varchar(50),'
        'chuhuonongchang varchar(50),'
        'remark varchar(50),'
        'PRIMARY KEY (serial_number))'
    )

    conn = pymysql.connect(
        host='xxxx',
        user='root',
        password='xxxx',
        port=3307,
        charset='utf8',
        db='miao_mu_data')
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql)
    finally:
        # Release the connection even when the statement fails.
        conn.close()

# 存储到数据库

from sqlalchemy import create_engine

# 存储到数据库

def write_to_sql(tbl, db='miao_mu_data'):
    """Append a DataFrame to the ``miaomu`` table.

    Args:
        tbl: DataFrame whose rows are appended; its index is not written.
        db: Name of the database substituted into the connection URL.

    Write errors are printed rather than raised, so one failed batch does
    not abort the caller.
    """
    engine = create_engine('mysql+pymysql://root:密码@ip:3307/{0}?charset=utf8'.format(db))
    try:
        tbl.to_sql('miaomu', con=engine, if_exists='append', index=False)
    except Exception as e:
        print(e)
    finally:
        # A new engine is built on every call; dispose of it so its
        # connection pool does not accumulate over repeated calls.
        engine.dispose()

# 主函数

import time

# 主函数

def main(page):
    """Crawl pages ``1 .. page - 1`` and store each non-empty table.

    Args:
        page: Exclusive upper bound of the page indices to visit.

    Prints each page index as it is processed and the total elapsed time
    once the loop finishes.
    """
    start_time = time.time()

    for i in range(1, page):
        print(i)
        # Pass the loop index, not the upper bound `page`.
        html = get_one_page(i)

        # A failed fetch yields None; skip parsing instead of crashing.
        if html is not None:
            df = parse_one_page(html)
            if not df.empty:
                write_to_sql(df)

        # Pause between requests.
        time.sleep(3)

    endtime = time.time() - start_time
    print('程序运行了%.2f秒' % endtime)

if __name__ == "__main__":
    # main() iterates range(1, page), so this visits pages 1 through 99999.
    main(100000)

    # To create the table schema, uncomment the call below.
    # generate_mysql()

神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试

相关文章:

版权声明:Python教程2022-10-27发表,共计3324字。
新手QQ群:570568346,欢迎进群讨论 Python51学习