爬虫简单案例

515次阅读
没有评论
爬虫简单案例

1.原生jdk发送get请求

/**
 * Demo: send an HTTP GET request using only the JDK's {@code HttpURLConnection}
 * and print the response body to standard output.
 */
public class JDKRequest {

    /**
     * Fetches the Baidu home page and prints its body.
     *
     * @param args unused
     * @throws Exception if the URL is malformed or any I/O step fails
     */
    public static void main(String[] args) throws Exception {
        // 1. The URL to fetch.
        String indexUrl = "http://www.baidu.com";

        // 2. Wrap it in a URL object and open the connection.
        URL url = new URL(indexUrl);
        HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
        urlConnection.setRequestMethod("GET");

        StringBuilder stringBuilder = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream)
        // even when reading fails part-way through.
        // NOTE(review): the charset is hard-coded; confirm it matches the
        // Content-Type actually returned by the server.
        try (InputStream in = urlConnection.getInputStream();
             BufferedReader bufferedReader =
                     new BufferedReader(new InputStreamReader(in, "GB2312"))) {
            String line;
            // readLine() strips line terminators, so the page is collected
            // as one long line.
            while ((line = bufferedReader.readLine()) != null) {
                stringBuilder.append(line);
            }
        } finally {
            urlConnection.disconnect();
        }

        System.out.println(stringBuilder.toString());
    }
}

2.原生jdk发送post请求

/**
 * Demo: send an HTTP POST request with a form-encoded body using only the
 * JDK's {@code HttpURLConnection}, then print the response body.
 */
public class JDKPost {

    /**
     * Posts a fixed username/password pair and prints the response body.
     *
     * @param args unused
     * @throws Exception if the URL is malformed or any I/O step fails
     */
    public static void main(String[] args) throws Exception {
        // The URL to post to.
        String indexUrl = "https://www.douban.com/";

        URL url = new URL(indexUrl);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("POST");
        // A request body can only be written after output is enabled.
        connection.setDoOutput(true);

        StringBuilder stringBuilder = new StringBuilder();
        try {
            // Write the request parameters; closing the stream flushes the body.
            // The charset is explicit so the bytes do not depend on the
            // platform default.
            try (OutputStream out = connection.getOutputStream()) {
                out.write("username=liang&password=123".getBytes("UTF-8"));
            }

            // Read the response line by line; readLine() strips line
            // terminators, so the body is collected as one long line.
            try (InputStream in = connection.getInputStream();
                 BufferedReader bufferedReader =
                         new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    stringBuilder.append(line);
                }
            }
        } finally {
            connection.disconnect();
        }

        System.out.println(stringBuilder.toString());
    }
}

3.使用HttpClient发送请求

HttpClient 是专为 Java 发送 HTTP 请求而设计的库。如果要使用 HttpClient,需要先引入 Maven 依赖: <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> <version>4.5.6</version> </dependency>

3.1、httpClient发送 get请求

//模拟 httpClient发送get请求 public class HttpClientGet { public static void main(String[] args) throws Exception { //确定首页url String indexUrl = "http://www.baidu.com"; //发送请求,获取数据 //创建HttpClient对象 CloseableHttpClient httpClient = HttpClients.createDefault(); //创建http方法对象 HttpGet httpGet = new HttpGet(indexUrl); //设置请求头 httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"); //发送请求,获取响应的数据 CloseableHttpResponse response = httpClient.execute(httpGet);

//获取请求的响应码 if(response.getStatusLine().getStatusCode()==200){ Header[] headers = response.getHeaders("Content-Type"); //System.out.println(headers[0].getValue()); //获取响应体 HttpEntity httpEntity = response.getEntity(); System.out.println(EntityUtils.toString(httpEntity,"GB2312")); } } }

3.2、httpClient发送 post请求

// 模拟httpClient发送post请求 public class HttpClientPost { public static void main(String[] args) throws Exception{ //1.确定URL String indexURl = "https://www.chsi.com.cn/"; //2.发送请求,获取数据 //创建请求方法对象 HttpPost httpPost = new HttpPost(indexURl); //设置请求头 httpPost.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"); //设置请求体 List<BasicNameValuePair> list = new ArrayList(); list.add(new BasicNameValuePair("username","liang")); list.add(new BasicNameValuePair("password","123")); HttpEntity requestEntity = new UrlEncodedFormEntity(list); httpPost.setEntity(requestEntity); //创建请求对象 CloseableHttpClient httpClient = HttpClients.createDefault(); //发送请求获取数据 CloseableHttpResponse response = httpClient.execute(httpPost); //判断是否请求成功 System.out.println(response.getStatusLine().getStatusCode()); if(response.getStatusLine().getStatusCode()==200){ HttpEntity entity = response.getEntity(); System.out.println(EntityUtils.toString(entity,"UTF-8")); } } }

jsoup 是一款用 Java 解析 HTML 文档的工具。如果要使用 jsoup,需要先引入依赖:

<dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.3</version> </dependency>

jsoup在使用之前必须先获取到document对象

如何获取document对象

// 通过jsoup获取document的方式 public class JsoupDocument { public static void main(String[] args) throws Exception { //1. 确定首页URL String indexUrl = "http://www.itcast.cn"; //2. 发送请求获取数据 //3. 解析数据: //3.1 获取document的方式一: 最常用的方式 String html = "<!DOCTYPE html>n" + "<html lang="en">n" + "<head>n" + " <meta charset="UTF-8">n" + " <title>获取document的方式一</title>n" + "</head>n" + "<body>n" + "n" + "</body>n" + "</html>"; Document document1 = Jsoup.parse(html); String title = document1.title(); System.out.println(title); //3.2 获取document的方式二: 最简单的方式 Document document2 = Jsoup.connect(indexUrl).get(); //System.out.println(document2); //3.3 获取本地的HTML文件,转换document对象 //Document document3 = Jsoup.parse(new File(""), "UTF-8"); //3.4 指定一个HTML的片段, 获取document对象 html = "<a>获取document的第四种方式</a>"; Document document4 = Jsoup.parseBodyFragment(html); //Document document4 = Jsoup.parse(html); System.out.println(document4.text()); } }

案例:模拟登录慢慢买网站,获取某登录用户的积分值

// 模拟登陆 public class ManManSpider {

public static void main(String[] args) throws Exception { //1. 确定首页URL String loginUrl = "http://home.manmanbuy.com/login.aspx";

//2. 发送请求, 获取数据 //2.1 创建httpClient对象 CloseableHttpClient httpClient = HttpClients.createDefault();

//2.2 设置请求方式 HttpPost httpPost = new HttpPost(loginUrl); //2.3 封装请求参数 List<BasicNameValuePair> list = new ArrayList<BasicNameValuePair>(); list.add(new BasicNameValuePair("__VIEWSTATE","/wEPDwULLTIwNjQ3Mzk2NDFkGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYBBQlhdXRvTG9naW4voj01ABewCkGpFHsMsZvOn9mEZg==")); list.add(new BasicNameValuePair("__EVENTVALIDATION","/wEWBQLW+t7HAwLB2tiHDgLKw6LdBQKWuuO2AgKC3IeGDJ4BlQgowBQGYQvtxzS54yrOdnbC")); list.add(new BasicNameValuePair("txtUser","itcast")); list.add(new BasicNameValuePair("txtPass","www.itcast.cn")); list.add(new BasicNameValuePair("btnLogin","登陆"));

HttpEntity entity = new UrlEncodedFormEntity(list); httpPost.setEntity(entity);

//2.4 封装请求头: referer httpPost.setHeader("Referer","http://home.manmanbuy.com/login.aspx");

//2.5 发送请求, 获取响应对象 CloseableHttpResponse response = httpClient.execute(httpPost); //2.6 获取数据 //2.6.1 :状态码 int statusCode = response.getStatusLine().getStatusCode(); if(statusCode==302){ //登陆成功, 获取重定向URL Header[] locations = response.getHeaders("Location"); String reUrl = locations[0].getValue(); reUrl = "http://home.manmanbuy.com"+reUrl;

Header[] cookies = response.getHeaders("Set-Cookie"); //System.out.println(headers.length);

httpClient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet(reUrl); //封装登陆成功的cookie标识信息 httpGet.setHeader("Cookie",cookies[0].getValue()+" "+cookies[1].getValue());

//重定向后的response的对象 response = httpClient.execute(httpGet); //重定向后的页面的数据 String html = EntityUtils.toString(response.getEntity(), "UTF-8");

//解析HTML的数据 Document document = Jsoup.parse(html); Elements jiFenEl = document.select("#aspnetForm > div.udivright > div:nth-child(2) > table > tbody > tr > td:nth-child(1) > table:nth-child(2) > tbody > tr > td:nth-child(2) > div:nth-child(1) > font"); System.out.println(jiFenEl.text()); } } }

 

神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试

相关文章:

版权声明:Python教程2022-10-28发表,共计6500字。
新手QQ群:570568346,欢迎进群讨论 Python51学习