简单爬虫-python简单爬虫

628次阅读
没有评论
简单爬虫-python简单爬虫

import urllib.request
import uuid

import pymysql
from pyquery import PyQuery as pq

# Open the MySQL connection and cursor used by the rest of the script.
# NOTE(review): host and credentials are hard-coded; consider loading them
# from configuration or the environment before running outside a local test box.
conn = pymysql.connect(
    host="127.0.0.1",
    user="root",
    passwd="123456",
    db="test",
    port=3306,
    charset="utf8",
)
cur = conn.cursor()

# Fetch the data: read every row of the `user` table and print it.
cur.execute("select * from user")
users = cur.fetchall()
for user in users:
    print(user)

# Fetch the raw page source.
def get_content(page):
    """Download one listing page and return its HTML as text.

    Args:
        page: Page number; converted with ``str()`` and appended to the
            ``page=`` query parameter of the listing URL.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    url = (
        "https://saudi.souq.com/sa-en/mobile-phone-accessories/l/"
        "?rpp=32&_=1550499488459&sortby=sr&section=2&page=" + str(page)
    )
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8")

def get(html):
    """Select the product link elements from a listing page.

    Args:
        html: Markup of a listing page, as returned by ``get_content``.

    Returns:
        The PyQuery selection of elements that carry all three classes
        ``img-link``, ``quickViewAction`` and ``sPrimaryLink``.
    """
    doc = pq(html)
    return doc(".img-link.quickViewAction.sPrimaryLink")

# Multi-page processing: walk the listing pages, store every product in the
# `shop` table, and append each product link to job.txt.

# The statement is constant, so it is built once outside the loops.
sql = (
    "insert into shop (product_id, product_name,product_link,product_seller,"
    "product_price,product_sales,product_stock,product_image) "
    "values (%s, %s, %s, %s,%s, %s, %s, %s)"
)

for page in range(1, 3000):
    print("正在爬取第" + str(page) + "页数据…")

    # Fetch the listing page source. A page that cannot be downloaded or
    # decoded is reported and skipped so that it does not end the whole crawl.
    try:
        html = get_content(page)
    except (OSError, UnicodeDecodeError) as exc:
        print("failed to fetch listing page", page, repr(exc))
        continue

    for link_element in get(html):
        product_link = pq(link_element).attr("href")
        if not product_link:
            # Without an href there is nothing to request and nothing to log.
            continue

        # Guard against product pages that do not respond, so that a single
        # bad request cannot stop the program.
        try:
            doc = pq(url=product_link)
            title = doc(".product-title>h1").text()
            price = doc(".price.is.sk-clr1").text()
            stock = doc(".txtcolor-alert.xleft>span").text()
            shop_name = doc(".unit-seller-link>a>b").text()
            sales = doc(".show-for-medium.bold-text").text()
            image = doc(".img-bucket>img").attr("src")
        except Exception as exc:
            # Best effort: report the failure and move on to the next product.
            print("failed to scrape", product_link, repr(exc))
        else:
            product_id = str(uuid.uuid1())
            try:
                count = cur.execute(
                    sql,
                    [
                        product_id,
                        title,
                        product_link,
                        shop_name,
                        price,
                        sales,
                        stock,
                        image,
                    ],
                )
            except pymysql.MySQLError as exc:
                print("failed to insert", product_link, repr(exc))
            else:
                # Check whether the insert succeeded before committing.
                if count > 0:
                    print("添加数据成功!\n")
                    # Commit the transaction.
                    conn.commit()

        # The link is recorded whether or not scraping/inserting succeeded.
        with open("job.txt", "a", encoding="utf-8") as f:
            f.write(product_link + "\n")

# Close the database resources: cursor first, then the connection.
cur.close()
conn.close()

 

神龙|纯净稳定代理IP免费测试 · 天启|企业级代理IP免费测试 · IPIPGO|全球住宅代理IP免费测试

相关文章:

版权声明:Python教程 2022-11-01 发表,共计 1567 字。
新手QQ群:570568346,欢迎进群讨论 Python51学习