spider爬取腾讯网:娱乐版块内容

387次阅读
没有评论
spider爬取腾讯网:娱乐版块内容

1.准备工作(依赖包):redis和mysql请自行准备,此案例涉及到爬取数据的保存和去重(通过redis)

<dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> <version>4.5.6</version> </dependency>

<dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>4.12</version> </dependency>

<dependency> <groupId>com.alibaba</groupId> <artifactId>fastjson</artifactId> <version>1.2.47</version> </dependency>

<dependency> <groupId>org.mybatis</groupId> <artifactId>mybatis</artifactId> <version>3.4.6</version> </dependency>

<dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <version>5.1.38</version> </dependency>

<dependency> <groupId>log4j</groupId> <artifactId>log4j</artifactId> <version>1.2.12</version> </dependency>

<dependency> <groupId>org.slf4j</groupId> <artifactId>slf4j-api</artifactId> <version>1.7.25</version> </dependency>

<dependency> <groupId>org.slf4j</groupId> <artifactId>slf4j-log4j12</artifactId> <version>1.7.25</version> <scope>test</scope> </dependency>

<dependency> <groupId>redis.clients</groupId> <artifactId>jedis</artifactId> <version>3.0.1</version> </dependency>

2.配置文件

    2.1.db.properties

# JDBC connection settings; mybatis.xml loads this file and substitutes
# ${driver}, ${url}, ${username} and ${password} into its data source.
# Each key must be on its own line - a .properties file holds one key=value pair per line.
driver=com.mysql.jdbc.Driver
url=jdbc:mysql:///spider?useUnicode=true&characterEncoding=UTF-8
username=root
password=admin

    2.2.log4j.properties

# Global logging configuration
log4j.rootLogger=ERROR, stdout
# MyBatis logging configuration...
log4j.logger.com.test.spider=TRACE
# Console output...
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n

   2.3.mybatis.xml

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<configuration>
    <!-- Property source: supplies the ${driver}/${url}/${username}/${password} placeholders below -->
    <properties resource="db.properties"/>

    <environments default="development">
        <environment id="development">
            <!-- Transaction manager -->
            <transactionManager type="JDBC"/>
            <!-- Data source -->
            <dataSource type="POOLED">
                <property name="driver" value="${driver}"/>
                <property name="url" value="${url}"/>
                <property name="username" value="${username}"/>
                <property name="password" value="${password}"/>
            </dataSource>
        </environment>
    </environments>

    <mappers>
        <!-- Classpath location of the mapper XML for com.test.spider.mapper.NewsMapper -->
        <mapper resource="com/test/spider/mapper/NewsMapper.xml"/>
    </mappers>
</configuration>

3.实体类News:com.test.spider.bean;

package com.test.spider.bean;

import java.util.Date;

public class News { private Integer id; private String title; private String intro; private String source; private String vurl; private Date publishTime; @Override public String toString() { return "News{" + "id=" + id + ", title='" + title + ''' + ", intro='" + intro + ''' + ", source='" + source + ''' + ", vurl='" + vurl + ''' + ", publishTime=" + publishTime + '}'; }

public Integer getId() { return id; }

public void setId(Integer id) { this.id = id; }

public String getTitle() { return title; }

public void setTitle(String title) { this.title = title; }

public String getIntro() { return intro; }

public void setIntro(String intro) { this.intro = intro; }

public String getSource() { return source; }

public void setSource(String source) { this.source = source; }

public String getVurl() { return vurl; }

public void setVurl(String vurl) { this.vurl = vurl; }

public Date getPublishTime() { return publishTime; }

public void setPublishTime(Date publishTime) { this.publishTime = publishTime; } }

4.NewsMapper接口:com.test.spider.mapper;

package com.test.spider.mapper;

import com.test.spider.bean.News; import org.apache.ibatis.annotations.Param; import java.util.List;

/**
 * MyBatis mapper interface for persisting {@link News} records; its statements are
 * defined in the mapper XML whose namespace is this interface's fully-qualified name.
 */
public interface NewsMapper {

    /**
     * Inserts the given records using the {@code batchSave} statement.
     *
     * @param list the records to insert, exposed to the mapper XML under the name {@code news}
     * @return the value MyBatis reports for the insert (presumably the affected row count)
     */
    Integer batchSave(@Param("news") List<News> list);
}

5.NewsMapper.xml配置文件:com.test.spider.mapper;

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- Statements for the NewsMapper interface; the namespace is that interface's fully-qualified name. -->
<mapper namespace="com.test.spider.mapper.NewsMapper">
    <!--
        Multi-row insert: the foreach emits one value tuple per element of the "news"
        collection, separated by commas. An empty collection would leave "VALUES" with
        no tuples, so callers should only invoke this with at least one element.
    -->
    <insert id="batchSave">
        insert into news(title,intro,source,vurl,publish_time) VALUES
        <foreach collection="news" separator="," item="item">
            (#{item.title},#{item.intro},#{item.source},#{item.vurl},#{item.publishTime})
        </foreach>
    </insert>
</mapper>

6.MybatisUtil工具类:com.test.spider.util;

package com.test.spider.util;

import org.apache.ibatis.io.Resources; import org.apache.ibatis.session.SqlSession; import org.apache.ibatis.session.SqlSessionFactory; import org.apache.ibatis.session.SqlSessionFactoryBuilder;

import java.io.IOException; import java.io.InputStream;

/**
 * Builds one {@link SqlSessionFactory} from the classpath resource {@code mybatis.xml}
 * when the class is initialized, and hands out sessions from it.
 */
public class MybatisUtil {

    /** Classpath name of the MyBatis configuration file. */
    private static final String CONFIG_RESOURCE = "mybatis.xml";

    private static final SqlSessionFactory factory;

    static {
        // Fail fast with the real cause attached: continuing with a null stream would only
        // surface later as an unrelated-looking error from the builder.
        try (InputStream in = Resources.getResourceAsStream(CONFIG_RESOURCE)) {
            factory = new SqlSessionFactoryBuilder().build(in);
        } catch (IOException e) {
            throw new IllegalStateException(
                    "Cannot load MyBatis configuration: " + CONFIG_RESOURCE, e);
        }
    }

    /**
     * Opens a new session from the shared factory.
     *
     * @return a new {@link SqlSession}; the caller is responsible for committing and closing it
     */
    public static SqlSession getSession() {
        return factory.openSession();
    }
}

7.EntertainmentSpider主要功能:com.test.spider;

package com.test.spider;

import com.alibaba.fastjson.JSON; import com.sun.org.apache.bcel.internal.generic.NEW; import com.test.spider.bean.News; import com.test.spider.mapper.NewsMapper; import com.test.spider.util.MybatisUtil; import jdk.nashorn.internal.scripts.JD; import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.apache.ibatis.session.SqlSession; import org.junit.Test; import redis.clients.jedis.Jedis;

import java.io.IOException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map;

/** * Created by thinkpad on 2019/9/18. */ public class EntertainmentSpider {

public void main() throws Exception { Integer page = 0; while(true){ //1.确定Url String indexUrl = "https://pacaio.match.qq.com/irs/rcd?cid=146&token=49cbb2154853ef1a74ff4e53723372ce&ext=ent&page="+page+"&callback=__jp7"; //2.发送请求,获取数据 //获取httpclient对象 CloseableHttpClient httpClient = HttpClients.createDefault(); //创建请求方式对象 HttpGet get = new HttpGet(indexUrl); //执行请求操作 CloseableHttpResponse response = httpClient.execute(get); if(response.getStatusLine().getStatusCode() == 200){ HttpEntity entity = response.getEntity(); String res = EntityUtils.toString(entity, "utf-8"); //3.解析数据 res = toJsonString(res); //截取字符串 //转为map对象 Map<String, Object> map = jsonToMap(res); //页面没有数据使,结束循环 if(Integer.parseInt(map.get("datanum").toString()) == 0){ break; } //转换为新闻对象 List<News> list = mapToBean(((List<Map>) map.get("data"))); //4.保存数据 //保存新闻数据 if(list.size()>0){ Integer count = saveNews(list); } System.out.println(page); } page++; } System.out.println("执行结束…."); }

//截取结果字符串 public String toJsonString(String src){ int start = src.indexOf("{"); int end = src.lastIndexOf("}")+1; return src.substring(start,end); }

//json字符串转换为java对象 public Map<String,Object> jsonToMap(String src){ return JSON.parseObject(src, HashMap.class); }

//新闻数据转换为新闻对象 public List<News> mapToBean(List<Map> src) throws ParseException {

SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

List<News> list = new ArrayList<News>();

for (Map map : src) { //获取url的值 String url = map.get("vurl").toString();

if(!checkUrl(url)){ continue; }

News news = new News(); news.setTitle(map.get("title").toString()); news.setIntro(map.get("intro").toString()); news.setSource(map.get("source").toString()); news.setVurl(url); String publish_time = map.get("publish_time").toString(); news.setPublishTime(format.parse(publish_time)); list.add(news); } return list; }

//保存新闻的数据 public int saveNews(List<News> list){

SqlSession session = MybatisUtil.getSession(); NewsMapper mapper = session.getMapper(NewsMapper.class); Integer res = mapper.batchSave(list); session.commit(); session.close(); return res; }

//判断url是否已经在redis中保存 public Boolean checkUrl(String url){ Jedis jedis = new Jedis("localhost",6379); Long count = jedis.sadd("bigdata:0701:spider:news:url", url); jedis.close(); return count>0?true:false; } }

8.App(测试类):com.test.spider;

package com.test.spider;

import com.test.spider.EntertainmentSpider;

import java.util.Timer; import java.util.TimerTask;

public class App {

public static void main(String[] args) {

final EntertainmentSpider spider = new EntertainmentSpider(); //创建定时器 Timer timer = new Timer(); timer.schedule( new TimerTask() { @Override public void run() { try { spider.main();

} catch (Exception e) { e.printStackTrace(); } } }, 0, 1200000L ); } }

 

神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试

相关文章:

版权声明:Python教程2022-10-28发表,共计7975字。
新手QQ群:570568346,欢迎进群讨论 Python51学习