Contents
I. Scraping homepage images from xiachufang.com
II. An example: scraping qianmu.org
1. Single-threaded
2. Multi-threaded
3. Integrating Redis for a simple distributed crawler
I. Scraping homepage images from xiachufang.com
# encoding: utf-8
""" @author: sunxianpeng @file: 58spider.py @time: 2019/10/25 19:19 """ import os import requests from requests.exceptions import RequestException from bs4 import BeautifulSoup from urllib.parse import urlparse class Main(): def __init__(self): pass
def reqest_url(self, url): try: response = requests.get(url) except RequestException as e: print("request is error!", e) return response
def get_img_labels(self, imgs): img_list = [] for img in imgs: # 判断img是否包含data-src属性 if img.has_attr('data-src'): # 包含则取data-src属性 img_list.append(img.attrs['data-src']) else: # 不包含则取src属性 img_list.append(img.attrs["src"]) return img_list
def dir_judge_or_create(self, dir_path): if not os.path.isdir(dir_path): os.mkdir(dir_path)
def save_img(self, img_req, img_path): with open(img_path, "wb") as f: # 每次写入1024字节 for chunk in img_req.iter_content(1024): f.write(chunk)
if __name__ == '__main__': m = Main() url = r"http://www.xiachufang.com/" html_content = m.reqest_url(url).text soup = BeautifulSoup(html_content) imgs = soup.select('img')# 选取所有的img图片标签 img_list = m.get_img_labels()# 获取包含图片url的属性内容 img_dir = os.path.join(os.curdir, "E:PythonProjectspython_studypython_requestsspiderdataimges") m.dir_judge_or_create(img_dir)
for img_url in img_list: o = urlparse(img_url) # 从url中取出图片名字 img_name = o.path[1:].split('@')[0] # 图片存储路径 img_path = os.path.join(img_dir, img_name) # 有些图片中还包含一层目录,需要创建对应dir,防止报错 m.dir_judge_or_create(os.path.dirname(img_path)) # 构建 图片url 路径 u = '%s://%s/%s' % (o.scheme, o.netloc, img_name) # 二进制格式的图片 print(u) img_req = m.reqest_url(u) m.save_img(img_req, img_path)
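To make the URL-rebuilding step concrete: urlparse splits the image address into scheme, host and path, the leading slash and the "@..." resize suffix are stripped to get the file name, and the pieces are glued back together. Below is a minimal standalone sketch; the CDN host and file name are made-up examples, not taken from the site.

from urllib.parse import urlparse

# hypothetical image url in the style the crawler sees (the "@" suffix carries resize parameters)
img_url = "http://i2.example-cdn.com/0a1b2c3d4e5f.jpg@280w_187h_1e_1c"

o = urlparse(img_url)
# o.scheme -> "http", o.netloc -> "i2.example-cdn.com", o.path -> "/0a1b2c3d4e5f.jpg@280w_187h_1e_1c"
img_name = o.path[1:].split('@')[0]                # "0a1b2c3d4e5f.jpg"
u = '%s://%s/%s' % (o.scheme, o.netloc, img_name)  # "http://i2.example-cdn.com/0a1b2c3d4e5f.jpg"
print(img_name, u)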
II. An example: scraping qianmu.org
1. Single-threaded
# encoding: utf-8
""" @author: sunxianpeng @file: qianmu_spider.py @time: 2019/10/26 13:32 """ import requests from requests.exceptions import RequestException import lxml from lxml import etree
class Main(): def __init__(self): pass def reqest_url(self,url): headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 'AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/67.0.3396.62 Safari/537.36'} # print("headers = ", headers) response = None try: response = requests.get(url) # body = response.text # 获取网页内容 except RequestException as e: print("request is error!", e) return response
def get_selector(self,html_content): selector = None try: selector = etree.HTML(html_content) except Exception as e: print("get selector is error!", e) return selector
def analyze_html(self,selector): data = {} keys = [] values = [] data["collage_name"] = selector.xpath('//div[@id="wikiContent"]/h1/text()')[0] # 处理单元格内有换行 table = selector.xpath('//div[@id="wikiContent"]/div[@class="infobox"]//table') if table : table = table[0] cols_k = table.xpath('.//td[1]') cols_v = table.xpath('.//td[2]') for j in range(len(cols_k)): col_k = cols_k[j] col_v = cols_v[j] keys.append(''.join(col_k.xpath('./p//text()'))) values.append(''.join(col_v.xpath('./p//text()'))) # 合并两个列表组成字典,将zip后得到的字典 添加到data字典中 data.update(zip(keys, values)) return data
def process_entrance(self,selector): data = {} links = selector.xpath('//tbody//tr[@height=19][position()>1]/td/a/@href') for i in range(len(links)): link = str(links[i]) if not link.startswith("http://www.qianmu.org"): link = "http://www.qianmu.org/%s" % link selector = self.get_selector(self.reqest_url(link).text) try: data = self.analyze_html(selector) print(data) except Exception as e: # 此处可以查看相对应的信息,解决表格非标准的形式问题,本次就不处理,直接跳过 print(link) continue return data
if __name__ == '__main__': m = Main() url = "http://www.qianmu.org/ranking/1528.htm" req = m.reqest_url(url) selector = m.get_selector(m.reqest_url(url).text) data = m.process_entrance(selector)
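The heart of the single-threaded version is analyze_html: it reads the college name from the h1, then pairs the first and second td of each infobox row into a dict. The sketch below runs the same XPath logic on an invented HTML fragment, purely to illustrate the structure the code expects.

from lxml import etree

# invented fragment that mimics the infobox layout analyze_html expects
html = """
<div id="wikiContent">
  <h1>Example University</h1>
  <div class="infobox">
    <table>
      <tr><td><p>Country</p></td><td><p>USA</p></td></tr>
      <tr><td><p>Founded</p></td><td><p>1890</p></td></tr>
    </table>
  </div>
</div>
"""
selector = etree.HTML(html)
data = {"college_name": selector.xpath('//div[@id="wikiContent"]/h1/text()')[0]}
table = selector.xpath('//div[@id="wikiContent"]/div[@class="infobox"]//table')[0]
# td[1] is the label cell of each row, td[2] the value cell; join all text nodes per cell
keys = [''.join(td.xpath('./p//text()')) for td in table.xpath('.//td[1]')]
values = [''.join(td.xpath('./p//text()')) for td in table.xpath('.//td[2]')]
data.update(zip(keys, values))
print(data)  # {'college_name': 'Example University', 'Country': 'USA', 'Founded': '1890'}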
2. Multi-threaded
# encoding: utf-8
""" @author: sunxianpeng @file: qianmu_spider.py @time: 2019/10/26 13:32 """ import requests from requests.exceptions import RequestException from lxml import etree import threading from queue import Queue import time
class Main(): def __init__(self): pass def reqest_url(self,url): headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 'AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/67.0.3396.62 Safari/537.36'} # print("headers = ", headers) response = None try: response = requests.get(url) # body = response.text # 获取网页内容 except RequestException as e: print("request is error!", e) return response
def get_selector(self,html_content): selector = None try: selector = etree.HTML(html_content) except Exception as e: print("get selector is error!", e) return selector
def analyze_html(self,selector): data = {} keys = [] values = [] data["collage_name"] = selector.xpath('//div[@id="wikiContent"]/h1/text()')[0] # 处理单元格内有换行 table = selector.xpath('//div[@id="wikiContent"]/div[@class="infobox"]//table') if table : table = table[0] cols_k = table.xpath('.//td[1]') cols_v = table.xpath('.//td[2]') for j in range(len(cols_k)): col_k = cols_k[j] col_v = cols_v[j] keys.append(''.join(col_k.xpath('./p//text()'))) values.append(''.join(col_v.xpath('./p//text()'))) # 合并两个列表组成字典,将zip后得到的字典 添加到data字典中 data.update(zip(keys, values)) return data
def download(self,link_queue): """ """ while True: # 阻塞,直到从队列取到一个链接 link = link_queue.get() # 取不出链接,或者说取出的是None if link is None: break if not link.startswith("http://www.qianmu.org"): link = "http://www.qianmu.org/%s" % link try: selector = self.get_selector(self.reqest_url(link).text) data = self.analyze_html(selector) print(data) except Exception as e: # 此处可以查看相对应的信息,解决表格非标准的形式问题,本次就不处理,直接跳过 print(link) continue link_queue.task_done() print('remaining queue: %s',link_queue.qsize())
if __name__ == '__main__': start_time = time.time() m = Main() url = "http://www.qianmu.org/ranking/1528.htm" link_queue = Queue() req = m.reqest_url(url) selector = m.get_selector(m.reqest_url(url).text) links = selector.xpath('//tbody//tr[@height=19][position()>1]/td/a/@href') for i in range(len(links)): link = str(links[i]) link_queue.put(link) # 多线程 threads = [] thread_num = 10 # 启动线程,并将线程对象放入一个列表保存 for i in range(thread_num): t = threading.Thread(target=m.download(link_queue)) t.start() threads.append(t) #阻塞队列,直到队列被清空,此时线程未退出 link_queue.join() # 向队列发送n个None,来通知线程退出 for i in range(thread_num): link_queue.put(None) # 退出线程 for t in threads: # 堵塞主线程,直到所有的线程退出 t.join()
used_time = time.time() – start_time print("download finished !!, used time : %s" % used_time)
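The threading part stands on three details that are easy to get wrong: the target must be the function object itself (passing the result of m.download(link_queue) would run the crawl in the main thread), every get() needs a matching task_done() so queue.join() can return, and one None sentinel per thread ends the workers. A stripped-down sketch of just that skeleton, with a dummy work step standing in for the crawl:

import threading
from queue import Queue

def worker(q):
    while True:
        item = q.get()
        if item is None:                 # sentinel: exit the loop
            break
        try:
            print("processing", item)    # the real crawl would go here
        finally:
            q.task_done()                # always mark the item done, even on failure

q = Queue()
for n in range(20):
    q.put(n)

thread_num = 4
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(thread_num)]
for t in threads:
    t.start()

q.join()                                 # wait until every queued item is marked done
for _ in range(thread_num):
    q.put(None)                          # one sentinel per worker
for t in threads:
    t.join()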
3. Integrating Redis for a simple distributed crawler
# encoding: utf-8
""" @author: sunxianpeng @file: qianmu_spider.py @time: 2019/10/26 13:32 """ import requests from requests.exceptions import RequestException from lxml import etree import threading from queue import Queue import time from redis import Redis import signal class Main(): def __init__(self): pass def reqest_url(self,url): headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 'AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/67.0.3396.62 Safari/537.36'} # print("headers = ", headers) response = None try: response = requests.get(url) # body = response.text # 获取网页内容 except RequestException as e: print("request is error!", e) return response
def get_selector(self,html_content): selector = None try: selector = etree.HTML(html_content) except Exception as e: print("get selector is error!", e) return selector
def analyze_html(self,selector): data = {} keys = [] values = [] data["collage_name"] = selector.xpath('//div[@id="wikiContent"]/h1/text()')[0] # 处理单元格内有换行 table = selector.xpath('//div[@id="wikiContent"]/div[@class="infobox"]//table') if table : table = table[0] cols_k = table.xpath('.//td[1]') cols_v = table.xpath('.//td[2]') for j in range(len(cols_k)): col_k = cols_k[j] col_v = cols_v[j] keys.append(''.join(col_k.xpath('./p//text()'))) values.append(''.join(col_v.xpath('./p//text()'))) # 合并两个列表组成字典,将zip后得到的字典 添加到data字典中 data.update(zip(keys, values)) return data
def download(self,r): """ """ while thread_on: # 阻塞,直到从队列取到一个链接 link = r.lpop("qianmu.queue") if link: if not link.startswith("http://www.qianmu.org"): link = "http://www.qianmu.org/%s" % link try: selector = self.get_selector(self.reqest_url(link).text) data = self.analyze_html(selector) print(data) except Exception as e: # 此处可以查看相对应的信息,解决表格非标准的形式问题,本次就不处理,直接跳过 print(link) continue print('remaining queue: %s', r.llen("qianmu.queue")) time.sleep(0.2) print("Thread-%s exit now" % i )
# def sigint_handler(self,signum,frame): def sigint_handler(self): print("received Ctrl+C, wait for exit gracefully !!") global thread_on thread_on = False
if __name__ == '__main__': start_time = time.time() m = Main() r = Redis()
url = "http://www.qianmu.org/ranking/1528.htm"
req = m.reqest_url(url) selector = m.get_selector(m.reqest_url(url).text) links = selector.xpath('//tbody//tr[@height=19][position()>1]/td/a/@href') for i in range(len(links)): link = str(links[i]) # 判断当前link是否已经抓取过,没有则放入队列 if r.sadd("qianmu.ifexists",link): r.rpush("qianmu.queue",link) # 多线程 threads = [] thread_num = 10 thread_on = True # 线程是否开启 # 启动线程,并将线程对象放入一个列表保存 for i in range(thread_num): t = threading.Thread(target=m.download(r), args=(i+1,)) t.start() threads.append(t) signal.signal(signal.SIGINT,m.sigint_handler) # 退出线程 for t in threads: # 堵塞主线程,直到所有的线程退出 t.join()
used_time = time.time() – start_time print("download finished !!, used time : %s" % used_time)
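The Redis part boils down to two structures: a set (qianmu.ifexists) that deduplicates URLs, because sadd returns 1 only the first time a member is added, and a list (qianmu.queue) used as a work queue via rpush/lpop that any number of worker processes or machines can share. A minimal sketch of that pattern, assuming a Redis server is reachable on localhost:6379; the link values are hypothetical placeholders.

from redis import Redis

# decode_responses=True returns str instead of bytes from lpop
r = Redis(host="localhost", port=6379, decode_responses=True)

links = [
    "http://www.qianmu.org/example-university",   # hypothetical detail-page URLs
    "http://www.qianmu.org/example-university",   # duplicate on purpose
    "http://www.qianmu.org/another-university",
]

# producer side: only enqueue links that were not seen before
for link in links:
    if r.sadd("qianmu.ifexists", link):   # 1 -> new member, 0 -> already seen
        r.rpush("qianmu.queue", link)

print(r.llen("qianmu.queue"))             # 2: the duplicate was filtered out

# consumer side: each worker (possibly on another machine) pops from the same list
while True:
    link = r.lpop("qianmu.queue")         # None once the queue is empty
    if link is None:
        break
    print("would crawl", link)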