- NetEase News
- web scraper
- Python
The comments are fairly detailed, so here is the complete code. Feedback and corrections are welcome.
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from time import sleep
from lxml import etree
import os
import requests
import csv
```
```python
# Create a headless browser instance
chrome_options = Options()
# Run without a visible window
chrome_options.add_argument('--headless')
# Needed when running on Windows
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(options=chrome_options)
# Set a 10-second implicit wait
browser.implicitly_wait(10)

# Use the headless browser to render the dynamic JS
def start_get(url, news_type):
    browser.get(url)
    sleep(1)
    # Scroll to the bottom of the page
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    sleep(1)
    # Click "load more" if the button is present
    more_btn = browser.find_elements(By.CSS_SELECTOR, '.load_more_btn')
    if more_btn:
        try:
            more_btn[0].click()
        except Exception as e:
            print(e)
            print('continuing...')
    sleep(1)
    # Scroll to the bottom again
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    # Grab the rendered page source and parse it
    source = browser.page_source
    parse_page(source, news_type)
```
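A note on the waits: the fixed `sleep(1)` pauses are fragile on slow connections. Below is a sketch of a more robust variant using Selenium's explicit waits (same `.load_more_btn` selector as above; the `click_load_more` helper name is mine, not from the original):

```python
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def click_load_more(browser, timeout=10):
    # Wait until the "load more" button is clickable, then click it;
    # return False if it never appears within `timeout` seconds.
    try:
        btn = WebDriverWait(browser, timeout).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '.load_more_btn'))
        )
        btn.click()
        return True
    except TimeoutException:
        return False
```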
```python
# Parse a news list page
def parse_page(html, news_type):
    tree = etree.HTML(html)
    new_lst = tree.xpath('//div[@class="news_title"]//a')
    for one_new in new_lst:
        title = one_new.xpath('./text()')[0]
        link = one_new.xpath('./@href')[0]
        try:
            write_in(title, link, news_type)
        except Exception as e:
            print(e)
```
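To sanity-check the list-page XPath without launching a browser, you can run it against a hand-written snippet (the HTML below is a simplified stand-in I made up, not the real NetEase markup):

```python
from lxml import etree

sample = '''
<div class="news_title"><h3><a href="https://news.163.com/a.html">标题一</a></h3></div>
<div class="news_title"><h3><a href="https://news.163.com/b.html">标题二</a></h3></div>
'''
tree = etree.HTML(sample)
for a in tree.xpath('//div[@class="news_title"]//a'):
    print(a.xpath('./text()')[0], '->', a.xpath('./@href')[0])
# 标题一 -> https://news.163.com/a.html
# 标题二 -> https://news.163.com/b.html
```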
```python
# Write one article out to the CSV file
def write_in(title, link, news_type):
    alist = []
    print('Writing article: {}'.format(title))
    browser.get(link)
    sleep(1)
    # Scroll to the bottom of the article page
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    # Grab the rendered page source
    source = browser.page_source
    tree = etree.HTML(source)

    alist.append(news_type)
    alist.append(title)

    con_link = link
    alist.append(con_link)

    # Article body: concatenate the text of every <p> in the post
    content_lst = tree.xpath('//div[@class="post_text"]//p')
    con = ''
    if content_lst:
        for one_content in content_lst:
            if one_content.text:
                con = con + '\n' + one_content.text.strip()
    alist.append(con)

    # Publication time and the 来源 (source) line
    post_time_source = tree.xpath('//div[@class="post_time_source"]')[0].text
    if "来源:" in post_time_source:
        post_time = post_time_source.split("来源:")[0]
        alist.append(post_time)
    else:
        post_time = post_time_source
        alist.append(post_time)
    post_source = tree.xpath('//div[@class="post_time_source"]/a[@id="ne_article_source"]')[0].text
    alist.append(post_source)

    # Comment count and number of participants
    tiecount = tree.xpath('//a[@class="js-tiecount js-tielink"]')[0].text
    alist.append(tiecount)
    tiejoincount = tree.xpath('//a[@class="js-tiejoincount js-tielink"]')[0].text
    alist.append(tiejoincount)

    # 1. Open the output file in append mode
    f = open('网易.csv', 'a+', encoding='utf-8', newline='')
    # 2. Build a csv writer on top of the file object
    csv_writer = csv.writer(f)
    # 3. Optionally write a header row first
    # csv_writer.writerow(["type", "title", "link", "content", "time", "source", "comments", "participants"])
    # 4. Write this article's row
    print(alist)
    csv_writer.writerow(alist)
    f.close()
```
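Reopening 网易.csv for every article works because of append mode, but it makes a header row awkward to manage. Here is a sketch of an alternative writer that adds the header only when the file is first created (the column labels are my own, matched to the order of `alist`):

```python
import csv
import os

def append_row(row, path='网易.csv'):
    # Write the header exactly once, when the file does not exist yet
    first_write = not os.path.exists(path)
    with open(path, 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        if first_write:
            writer.writerow(['type', 'title', 'link', 'content',
                             'time', 'source', 'comments', 'participants'])
        writer.writerow(row)
```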
```python
if __name__ == '__main__':
    # Load the NetEase News homepage first so the channel tabs are present
    # (assumed entry URL: https://news.163.com/)
    browser.get('https://news.163.com/')
    urls = browser.find_elements(By.XPATH, '//a[@ne-role="tab-nav"]')
    # Read href and tab text now: the elements go stale once the
    # browser navigates away inside start_get()
    links = []
    types = []
    for one in urls:
        links.append(one.get_attribute('href'))
        types.append(one.text)

    for one in range(len(links)):
        if links[one] and 'http' in links[one]:
            # headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}
            if not os.path.exists('new'):
                os.mkdir('new')
            news_type = types[one]
            start_get(links[one], news_type)

    browser.quit()
```
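If any channel raises partway through, the headless Chrome process stays alive. Wrapping the loop in try/finally guarantees cleanup; a sketch of the same main block:

```python
if __name__ == '__main__':
    try:
        browser.get('https://news.163.com/')
        tabs = browser.find_elements(By.XPATH, '//a[@ne-role="tab-nav"]')
        # Capture href and text before navigating away
        channels = [(a.get_attribute('href'), a.text) for a in tabs]
        for link, news_type in channels:
            if link and 'http' in link:
                start_get(link, news_type)
    finally:
        # Always shut Chrome down, even on errors
        browser.quit()
```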
The results are as follows:
Note: this post is for technical discussion only and must not be used for commercial purposes; the author accepts no responsibility for misuse.