本文最后更新于 1926 天前,其中的信息可能已经有所发展或是发生改变。
Github
https://github.com/EchoGroot/fourth_spring_simfyspider.git
目的
运用
spring mvc:Web MVC 框架,完成从 HTTP 请求到 Java 处理方法的映射,由三部分组成:1. Controller 2. Model 3. View
hibernate-jpa / spring-data-jpa:ORM : 对象关系映射
spring :无侵入、轻量级、无缝集成其他框架
spring boot:极度简化spring配置,快速上手开发,提供诸多产品级功能
搭建
利用url传参,并开发持久层,将爬取的数据存储到数据库
分层
web : 接收请求、转换、页面展示等相关功能
service : 服务层、提供查询、抓取等功能实现
dal : 数据访问层,数据持久化相关功能
domain: 领域模型层,模型对象
common:通用工具类
结构
LinkStore 管理 未访问及已访问的队列
PageVisitor 访问URL得到页面
PageParser 解析网页,得到内容
PageRepository 页面仓储服务,保存数据及文件
SimfySpider 爬虫,组装组件
App 测试类
运行
运行程序 访问 http://localhost:8080/crawl?seed=http://news.cqjtu.edu.cn/list.jsp?urltype=tree.TreeTempUrl%26wbtreeid=1021
注意:参数 seed 中的 URL 不能直接包含 & 等 URL 保留字符,需要先进行 URL 编码(例如 & 写成 %26),否则 & 之后的内容会被解析成另一个请求参数
结果
配置数据源
配置jpa
# Data source: which database to connect to
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/cqjtu?useUnicode=true&characterEncoding=utf-8&serverTimezone=UTC
spring.datasource.username=root
spring.datasource.password=********
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
# JPA settings
# Hibernate DDL strategy: "update" adjusts the existing schema to match the entities
# ("create" would recreate the tables on every start)
spring.jpa.hibernate.ddl-auto=update
# Equivalent of <property name="hibernate.dialect">org.hibernate.dialect.MySQL5Dialect</property>.
# Spring Boot has no "spring.jpa.hibernate.dialect" key; the dialect is configured through
# spring.jpa.database-platform (or spring.jpa.properties.hibernate.dialect).
spring.jpa.database-platform=org.hibernate.dialect.MySQL5Dialect
# Whether to print the generated SQL
spring.jpa.show-sql=true
# Log level
logging.level.org.springframework.boot=INFO
代码
CharsetDetector
/*
* Copyright (C) 2014 hu
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package com.fourth.spring.simfyspider.common;
import org.mozilla.universalchardet.UniversalDetector;
import java.io.UnsupportedEncodingException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Automatic character-set detection for raw page bytes.
 *
 * <p>Detection first looks for a charset declared in an HTML {@code <meta>} tag or
 * signalled by a byte-order mark (logic borrowed from Nutch) and falls back to
 * {@link UniversalDetector} when that yields nothing usable.
 *
 * @author hu
 */
public class CharsetDetector {

    /** Charset name returned when detection fails. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /** Only this many leading bytes are scanned for a meta declaration. */
    private static final int CHUNK_SIZE = 2000;

    /** Matches {@code <meta http-equiv="content-type" ...>}; group 1 is the attribute text. */
    private static final Pattern META_PATTERN = Pattern.compile(
            "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
            Pattern.CASE_INSENSITIVE);

    /** Extracts the charset name from the attribute text of a content-type meta tag. */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(
            "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);

    /** Matches the HTML5 form {@code <meta charset="...">}; group 1 is the charset name. */
    private static final Pattern CHARSET_PATTERN_HTML5 = Pattern.compile(
            "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
            Pattern.CASE_INSENSITIVE);

    /**
     * Page-encoding detection borrowed from Nutch: inspects the first {@link #CHUNK_SIZE}
     * bytes for a meta charset declaration, then for a byte-order mark.
     *
     * @param content raw page bytes, must not be null
     * @return the declared charset name, or null when none is found
     */
    private static String guessEncodingByNutch(byte[] content) {
        int length = Math.min(content.length, CHUNK_SIZE);
        String head;
        try {
            // Decode only the leading chunk; the meta declaration lives in the document head.
            head = new String(content, 0, length, "ascii");
        } catch (UnsupportedEncodingException e) {
            return null;
        }

        String encoding = null;
        Matcher metaMatcher = META_PATTERN.matcher(head);
        if (metaMatcher.find()) {
            Matcher charsetMatcher = CHARSET_PATTERN.matcher(metaMatcher.group(1));
            if (charsetMatcher.find()) {
                encoding = charsetMatcher.group(1);
            }
        }
        if (encoding == null) {
            Matcher html5Matcher = CHARSET_PATTERN_HTML5.matcher(head);
            if (html5Matcher.find()) {
                encoding = html5Matcher.group(1);
            }
        }
        if (encoding == null) {
            // No declaration found: fall back to byte-order marks.
            if (length >= 3 && content[0] == (byte) 0xEF
                    && content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) {
                encoding = "UTF-8";
            } else if (length >= 2) {
                if (content[0] == (byte) 0xFF && content[1] == (byte) 0xFE) {
                    encoding = "UTF-16LE";
                } else if (content[0] == (byte) 0xFE && content[1] == (byte) 0xFF) {
                    encoding = "UTF-16BE";
                }
            }
        }
        return encoding;
    }

    /**
     * Reports whether the running JVM can decode the named charset.
     *
     * @param name charset name, may be null
     * @return true only if the name is legal and supported
     */
    private static boolean isSupported(String name) {
        if (name == null) {
            return false;
        }
        try {
            return java.nio.charset.Charset.isSupported(name);
        } catch (IllegalArgumentException e) {
            // Illegal charset names are reported by exception rather than by false.
            return false;
        }
    }

    /**
     * Guesses the charset of a byte array with {@link UniversalDetector}.
     *
     * @param bytes bytes to inspect; null is treated as "nothing detected"
     * @return the detected charset name, or UTF-8 when detection fails
     */
    public static String guessEncodingByMozilla(byte[] bytes) {
        if (bytes == null) {
            return DEFAULT_ENCODING;
        }
        UniversalDetector detector = new UniversalDetector(null);
        detector.handleData(bytes, 0, bytes.length);
        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        detector.reset();
        return encoding != null ? encoding : DEFAULT_ENCODING;
    }

    /**
     * Guesses the charset of a byte array, preferring a charset declared in the page itself
     * and falling back to statistical detection.
     *
     * @param content bytes to inspect; null or empty yields UTF-8
     * @return a charset name supported by this JVM; UTF-8 when detection fails
     */
    public static String guessEncoding(byte[] content) {
        if (content == null || content.length == 0) {
            return DEFAULT_ENCODING;
        }
        String declared;
        try {
            declared = guessEncodingByNutch(content);
        } catch (RuntimeException ex) {
            declared = null;
        }
        // A page may declare a charset this JVM cannot decode; do not hand that to callers.
        if (isSupported(declared)) {
            return declared;
        }
        String detected = guessEncodingByMozilla(content);
        return isSupported(detected) ? detected : DEFAULT_ENCODING;
    }
}
NewsRepository
package com.fourth.spring.simfyspider.dal;
import com.fourth.spring.simfyspider.domain.News;
import com.fourth.spring.simfyspider.domain.Page;
import org.springframework.data.jpa.repository.JpaRepository;
/**
 * Spring Data JPA repository for {@link News} entities.
 *
 * <p>The second type argument is the entity's identifier type; it is {@code String}
 * because the {@code @Id} field of {@link News} ({@code newsId}) is a {@code String}.
 */
public interface NewsRepository extends JpaRepository<News, String> {
}
News
package com.fourth.spring.simfyspider.domain;
import javax.persistence.*;
/**
 * JPA entity for one crawled news item, stored in table {@code ss_news}.
 *
 * <p>NOTE(review): the identifier is a {@code String} combined with IDENTITY generation;
 * confirm the underlying column type supports this.
 */
@Entity
@Table(name = "ss_news")
public class News {

    /** Primary key, generated by the database. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private String newsId;

    private String title;
    private String url;
    private String content;
    private String summary;

    public String getNewsId() {
        return this.newsId;
    }

    public void setNewsId(String newsId) {
        this.newsId = newsId;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getContent() {
        return this.content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getSummary() {
        return this.summary;
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    /**
     * Renders the title, url and content fields; the id and summary are not included.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("News{");
        text.append("title='").append(title).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append(", content='").append(content).append('\'');
        text.append('}');
        return text.toString();
    }
}
Page
package com.fourth.spring.simfyspider.domain;
import com.fourth.spring.simfyspider.common.CharsetDetector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.UnsupportedEncodingException;
/**
 * Wrapper around a fetched page: the raw bytes, the URL they came from and the content
 * type, plus a lazily decoded HTML string and a lazily parsed jsoup document.
 *
 * @author Administrator
 */
public class Page {
    private byte[] content;
    private String url;
    /** Cached parse result of {@link #getHtml()}; null until first requested. */
    private Document doc;
    /** Cached decoded source; null until first requested or set explicitly. */
    private String html;
    private String contentType;
    /** Charset name used to decode {@link #content}; detected on demand. */
    private String charset;

    /**
     * @param content     raw response bytes
     * @param url         URL the bytes were fetched from
     * @param contentType content type reported for the response
     */
    public Page(byte[] content, String url, String contentType) {
        this.content = content;
        this.url = url;
        this.contentType = contentType;
    }

    public byte[] getContent() {
        return content;
    }

    /**
     * Replaces the raw bytes and discards everything derived from the previous bytes.
     */
    public void setContent(byte[] content) {
        this.content = content;
        this.charset = null;
        this.html = null;
        this.doc = null;
    }

    public String getUrl() {
        return url;
    }

    /**
     * Replaces the URL; the cached document is dropped because it was parsed with the old
     * URL as its base.
     */
    public void setUrl(String url) {
        this.url = url;
        this.doc = null;
    }

    /**
     * Overrides the decoded source; the cached document is dropped so it is re-parsed.
     */
    public void setHtml(String html) {
        this.html = html;
        this.doc = null;
    }

    public String getContentType() {
        return contentType;
    }

    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    /**
     * Returns the page source as a string, decoding the raw bytes on first use.
     *
     * <p>The charset is guessed from the content; if the guessed charset cannot be used by
     * this JVM the bytes are decoded as UTF-8 instead.
     *
     * @return the page source, or null when there is no content
     */
    public String getHtml() {
        if (html != null) {
            return html;
        }
        if (content == null) {
            return null;
        }
        if (charset == null) {
            charset = CharsetDetector.guessEncoding(content);
        }
        try {
            html = new String(content, charset);
        } catch (UnsupportedEncodingException ex) {
            // Better a best-effort decode than no page at all.
            ex.printStackTrace();
            charset = "UTF-8";
            html = new String(content, java.nio.charset.StandardCharsets.UTF_8);
        }
        return html;
    }

    /**
     * Returns the parsed document, parsing {@link #getHtml()} on first use with the page URL
     * as base URI.
     *
     * @return the parsed document, or null when there is no source or parsing fails
     */
    public Document getDoc() {
        if (doc != null) {
            return doc;
        }
        String source = getHtml();
        if (source == null) {
            return null;
        }
        try {
            doc = Jsoup.parse(source, url == null ? "" : url);
            return doc;
        } catch (RuntimeException ex) {
            ex.printStackTrace();
            return null;
        }
    }
}
LinkStore
package com.fourth.spring.simfyspider.service.spider;
import org.springframework.stereotype.Component;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;
/**
 * Keeps track of crawl progress: a set of visited URLs and a FIFO queue of URLs that
 * still have to be visited. Not synchronized.
 *
 * @author Administrator
 */
@Component
public class LinkStore {
    private final Set<String> visitedLinks = new HashSet<>();
    private final Queue<String> unVisitedLinkQueue = new LinkedList<>();
    /** Mirrors the queue's contents so duplicate checks do not scan the queue. */
    private final Set<String> queuedLinks = new HashSet<>();

    /**
     * Queues a URL for a later visit. Null, empty, already visited and already queued URLs
     * are ignored.
     *
     * @param url URL to queue
     */
    public void addUnVisitLink(String url) {
        System.out.println("添加未访问"+url);
        if (url == null || url.isEmpty() || isVisited(url)) {
            return;
        }
        if (queuedLinks.add(url)) {
            unVisitedLinkQueue.add(url);
        }
    }

    /**
     * Records a URL as visited. Null is ignored.
     *
     * @param url URL that has been visited
     */
    public void addVisited(String url) {
        System.out.println("添加已访问"+url);
        if (url != null) {
            visitedLinks.add(url);
        }
    }

    /**
     * Tells whether a URL, or the same URL with a trailing slash appended, has been visited.
     *
     * @param url URL to test; null is never visited
     * @return true if the URL counts as visited
     */
    public boolean isVisited(String url) {
        if (url == null) {
            return false;
        }
        return visitedLinks.contains(url) || visitedLinks.contains(url.concat("/"));
    }

    /**
     * @return true when no URL is waiting to be visited
     */
    public boolean isUnVisitedEmpty() {
        return unVisitedLinkQueue.isEmpty();
    }

    /**
     * Removes and returns the next URL to visit.
     *
     * @return the oldest queued URL, or null when the queue is empty
     */
    public String getFirst() {
        String next = unVisitedLinkQueue.poll();
        if (next != null) {
            queuedLinks.remove(next);
        }
        return next;
    }
}
PageParser
package com.fourth.spring.simfyspider.service.spider;
import com.fourth.spring.simfyspider.domain.Page;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;
import java.util.HashSet;
import java.util.Set;
/**
 * Extracts links and target elements from a {@link Page} using CSS selectors.
 *
 * @author Administrator
 */
@Component
public class PageParser {

    /**
     * Collects the absolute {@code href} of every element matched by the selector.
     *
     * @param page        page to search; a null page or a page without a document yields no links
     * @param cssSelector selector for the link elements
     * @return the distinct, non-empty absolute URLs found; never null
     */
    public Set<String> getLinks(Page page, String cssSelector) {
        System.out.println("开始解析获得连接");
        Set<String> links = new HashSet<>();
        for (Element anchor : select(page, cssSelector)) {
            String link = anchor.absUrl("href");
            // absUrl yields an empty string when the href is missing or cannot be resolved.
            if (link != null && !link.isEmpty()) {
                links.add(link);
            }
        }
        return links;
    }

    /**
     * Collects every element matched by any of the given selectors, in selector order.
     *
     * @param page         page to search; a null page or a page without a document yields nothing
     * @param cssSelectors selectors to apply
     * @return all matched elements; never null
     */
    public Elements getTargetElements(Page page, String... cssSelectors) {
        Elements result = new Elements();
        if (cssSelectors == null) {
            return result;
        }
        for (String selector : cssSelectors) {
            result.addAll(select(page, selector));
        }
        return result;
    }

    /** Runs one selector against the page's document, returning an empty result when that is impossible. */
    private static Elements select(Page page, String cssSelector) {
        if (page == null || cssSelector == null || cssSelector.trim().isEmpty()
                || page.getDoc() == null) {
            return new Elements();
        }
        return page.getDoc().select(cssSelector);
    }
}
PageVisitor
package com.fourth.spring.simfyspider.service.spider;
import com.fourth.spring.simfyspider.domain.Page;
import org.apache.http.Header;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.IOException;
/**
 * Fetches a URL over HTTP and wraps the response body in a {@link Page}.
 *
 * @author Administrator
 */
@Component
public class PageVisitor {

    /**
     * Issues a GET request for the URL. A new HTTP client is built for every call and both
     * the client and the response are closed before returning.
     *
     * @param url URL to fetch
     * @return the fetched page, or null when the URL is blank or malformed, the status code
     *         is not 200, the response has no body, or an I/O error occurs
     */
    public Page visit(String url) {
        if (url == null || url.trim().isEmpty()) {
            return null;
        }
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(50000)
                .setSocketTimeout(50000)
                .setConnectionRequestTimeout(10000)
                .build();
        System.out.println("url:"+url);
        try (CloseableHttpClient client = HttpClients.custom()
                     .setDefaultRequestConfig(config)
                     .setRetryHandler(new DefaultHttpRequestRetryHandler())
                     // A browser-like User-Agent keeps some sites from answering 403.
                     .setUserAgent("Mozilla/5.0(Windows;U;Windows NT 5.1;en-US;rv:0.9.4)")
                     .build();
             CloseableHttpResponse response = client.execute(RequestBuilder.get(url).build())) {
            if (response.getStatusLine().getStatusCode() != 200 || response.getEntity() == null) {
                System.out.println("请求"+url+"失败");
                return null;
            }
            byte[] responseBody = EntityUtils.toByteArray(response.getEntity());
            // The header may be absent; pass its value (not its name) on to the page.
            Header contentType = response.getEntity().getContentType();
            String contentTypeValue = contentType == null ? null : contentType.getValue();
            return new Page(responseBody, url, contentTypeValue);
        } catch (IOException | IllegalArgumentException e) {
            // IllegalArgumentException covers URLs that cannot be turned into a request URI.
            e.printStackTrace();
            return null;
        }
    }
}
SimfySpider
package com.fourth.spring.simfyspider.service.spider;
import com.fourth.spring.simfyspider.dal.NewsRepository;
import com.fourth.spring.simfyspider.domain.News;
import com.fourth.spring.simfyspider.domain.Page;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
/**
 * The crawler: wires the visitor, parser and repository together. Starting from a seed
 * URL it follows pagination links, extracts the news items of every list page, fetches each
 * item's content page and stores the results.
 */
@Service
public class SimfySpider {
    /** Selector for the pagination links of a list page. */
    private static final String PAGINATION_SELECTOR = ".pagebar .nowrap a";
    /** Selector for the news entries of a list page. */
    private static final String NEWS_ITEM_SELECTOR = ".right-title";
    /** Selector for the element whose text is stored as a news item's content. */
    private static final String CONTENT_SELECTOR = ".article-title";

    @Autowired
    PageVisitor visitor;
    @Autowired
    PageParser parser;
    @Autowired
    NewsRepository repo;

    /**
     * Crawls all list pages reachable from the seed and saves the news items found on each.
     * Pages that cannot be fetched or parsed are skipped.
     *
     * @param seed URL of the first list page
     */
    public void crawling(String seed) {
        // A fresh store per crawl so state never leaks between requests.
        LinkStore linkStore = new LinkStore();
        System.out.println("seed"+seed);
        linkStore.addUnVisitLink(seed);
        while (!linkStore.isUnVisitedEmpty()) {
            String url = linkStore.getFirst();
            // Mark before extracting links, otherwise a page linking to itself is queued again.
            linkStore.addVisited(url);

            Page page = visitor.visit(url);
            if (page == null || page.getDoc() == null) {
                continue;
            }

            Set<String> links = parser.getLinks(page, PAGINATION_SELECTOR);
            for (String link : links) {
                if (link != null && !link.isEmpty() && !linkStore.isVisited(link)) {
                    linkStore.addUnVisitLink(link);
                }
            }

            // Save only this page's items; earlier pages were saved in earlier iterations.
            List<News> pageNews = extractNews(page);
            if (!pageNews.isEmpty()) {
                repo.saveAll(pageNews);
            }
        }
    }

    /** Builds one News per list entry that contains a link, including its fetched content. */
    private List<News> extractNews(Page listPage) {
        List<News> found = new ArrayList<>();
        Elements items = parser.getTargetElements(listPage, NEWS_ITEM_SELECTOR);
        System.out.println("elements.text()"+items.text());
        for (Element item : items) {
            Element anchor = item.select("a").first();
            if (anchor == null) {
                continue;
            }
            System.out.println("element.text()"+anchor.text());
            News news = new News();
            news.setTitle(anchor.text());
            news.setUrl(anchor.absUrl("href"));
            news.setContent(fetchContent(news.getUrl()));
            found.add(news);
            System.out.println(news);
        }
        return found;
    }

    /** Fetches an item's content page and returns the first content match's text, or null. */
    private String fetchContent(String contentUrl) {
        if (contentUrl == null || contentUrl.isEmpty()) {
            return null;
        }
        Page contentPage = visitor.visit(contentUrl);
        if (contentPage == null || contentPage.getDoc() == null) {
            return null;
        }
        Elements matches = parser.getTargetElements(contentPage, CONTENT_SELECTOR);
        if (matches == null || matches.isEmpty()) {
            return null;
        }
        return matches.first().text();
    }
}
CrawlingController
package com.fourth.spring.simfyspider.web;
import com.fourth.spring.simfyspider.service.spider.SimfySpider;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.stereotype.Repository;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.ResponseBody;
/**
 * Web entry point that starts a crawl from the URL given in the {@code seed} request
 * parameter.
 */
@Controller
public class CrawlingController {
    @Autowired
    private SimfySpider simfySpider;

    /**
     * Runs a crawl starting at the seed and returns once it has finished.
     *
     * @param seed URL to start from; must be a non-empty http or https URL
     * @return {@code "success"} after the crawl, or a failure message when the seed is rejected
     */
    @RequestMapping("crawl")
    @ResponseBody
    public String crawl(String seed) {
        // The seed comes straight from the request; reject anything that is not a web URL.
        if (!isHttpUrl(seed)) {
            return "fail: seed must be a non-empty http(s) URL";
        }
        simfySpider.crawling(seed.trim());
        return "success";
    }

    /** Tells whether the value is non-blank and starts with http:// or https://. */
    private static boolean isHttpUrl(String value) {
        if (value == null) {
            return false;
        }
        String candidate = value.trim().toLowerCase(java.util.Locale.ROOT);
        return candidate.startsWith("http://") || candidate.startsWith("https://");
    }
}