Java爬虫 web版
本文最后更新于 1584 天前,其中的信息可能已经有所发展或是发生改变。

Github

https://github.com/EchoGroot/fourth_spring_simfyspider.git

目的

运用
spring mvc:Web MVC 框架,完成从 HTTP 请求到 Java 处理逻辑的映射,由三部分组成:1. Controller 2. Model 3. View
hibernate-jpa / spring-data-jpa:ORM : 对象关系映射
spring :无侵入、轻量级、无缝集成其他框架
spring boot:极度简化spring配置,快速上手开发,提供诸多产品级功能
搭建
利用url传参,并开发持久层,将爬取的数据存储到数据库

分层

web : 接收请求、转换、页面展示等相关功能
service : 服务层、提供查询、抓取等功能实现
dal : 数据访问层,数据持久化相关功能
domain: 领域模型层,模型对象
common:通用工具类

结构

LinkStore 管理 未访问及已访问的队列
PageVisitor 访问URL得到页面
PageParser 解析网页,得到内容
PageRepository 页面仓储服务,保存数据及文件
SimfySpider 爬虫,组装组件
App 测试类

运行

运行程序 访问 http://localhost:8080/crawl?seed=http://news.cqjtu.edu.cn/list.jsp?urltype=tree.TreeTempUrl%26wbtreeid=1021
注意:参数 seed 中的 URL 不能直接包含 & 等 URL 保留字符,需要先做 URL 编码(例如 & 写成 %26),否则 & 之后的内容会被当作当前请求的另一个参数,而不是 seed 的一部分

结果

配置数据源
配置jpa

# Data source configuration: which database to use
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/cqjtu?useUnicode=true&characterEncoding=utf-8&serverTimezone=UTC
spring.datasource.username=root
spring.datasource.password=********
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
# JPA configuration
# Controls Hibernate's DDL generation strategy ("create" would create the schema on every start; "update" is used here)
spring.jpa.hibernate.ddl-auto=update
#<property name="hibernate.dialect">org.hibernate.dialect.MySQL5Dialect</property>
# NOTE(review): "spring.jpa.hibernate.dialect" does not look like a key Spring Boot binds; the documented
# ways to set the dialect are "spring.jpa.database-platform" or "spring.jpa.properties.hibernate.dialect".
# Confirm whether this line has any effect before relying on it.
spring.jpa.hibernate.dialect=org.hibernate.dialect.MySQL5Dialect
# Whether to print SQL statements
spring.jpa.show-sql=true
# Logging level configuration
logging.level.org.springframework.boot=INFO

代码

CharsetDetector

/*
 * Copyright (C) 2014 hu
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package com.fourth.spring.simfyspider.common;


import org.mozilla.universalchardet.UniversalDetector;

import java.io.UnsupportedEncodingException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Automatic character-set detection for raw page bytes.
 *
 * <p>Detection first looks for an encoding announced by the document itself
 * (an HTML {@code <meta>} declaration, then a byte-order mark) and falls back
 * to statistical detection with {@link UniversalDetector}.
 *
 * @author hu
 */
public class CharsetDetector {

    /** Number of leading bytes inspected for a declared encoding (approach borrowed from Nutch). */
    private static final int CHUNK_SIZE = 2000;

    /** Encoding reported when statistical detection yields nothing. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /** Matches {@code <meta http-equiv="content-type" ...>}; group 1 is the attribute text. */
    private static final Pattern META_PATTERN = Pattern.compile(
            "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
            Pattern.CASE_INSENSITIVE);
    /** Extracts the charset name from the attribute text captured by {@link #META_PATTERN}. */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(
            "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
    /** Matches the HTML5 short form {@code <meta charset="...">}; group 1 is the charset name. */
    private static final Pattern CHARSET_PATTERN_HTML5 = Pattern.compile(
            "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
            Pattern.CASE_INSENSITIVE);

    /**
     * Looks for an encoding declared by the document itself (approach borrowed from Nutch).
     *
     * <p>Only the first {@link #CHUNK_SIZE} bytes are examined. A declared
     * charset name is accepted only when this JVM supports it, so callers can
     * pass the result straight to {@code new String(bytes, name)}.
     *
     * @param content raw page bytes; must not be {@code null}
     * @return the declared encoding, or {@code null} when none was found
     */
    private static String guessEncodingByNutch(byte[] content) {
        int length = Math.min(content.length, CHUNK_SIZE);

        String head;
        try {
            // Decode just the head of the document: the declaration is expected
            // near the top, and decoding the whole page only to run two regexes
            // over it is wasted work on large responses.
            head = new String(content, 0, length, "ascii");
        } catch (UnsupportedEncodingException e) {
            return null;
        }

        String encoding = null;
        Matcher metaMatcher = META_PATTERN.matcher(head);
        if (metaMatcher.find()) {
            Matcher charsetMatcher = CHARSET_PATTERN.matcher(metaMatcher.group(1));
            if (charsetMatcher.find()) {
                encoding = supportedOrNull(charsetMatcher.group(1));
            }
        }
        if (encoding == null) {
            Matcher html5Matcher = CHARSET_PATTERN_HTML5.matcher(head);
            if (html5Matcher.find()) {
                encoding = supportedOrNull(html5Matcher.group(1));
            }
        }
        if (encoding == null) {
            // No usable declaration: fall back to the byte-order mark, if any.
            if (length >= 3 && content[0] == (byte) 0xEF
                    && content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) {
                encoding = "UTF-8";
            } else if (length >= 2) {
                if (content[0] == (byte) 0xFF && content[1] == (byte) 0xFE) {
                    encoding = "UTF-16LE";
                } else if (content[0] == (byte) 0xFE
                        && content[1] == (byte) 0xFF) {
                    encoding = "UTF-16BE";
                }
            }
        }

        return encoding;
    }

    /**
     * Returns {@code name} when the JVM can decode that charset, otherwise {@code null}.
     *
     * @param name charset name taken from the document
     * @return {@code name} if supported, else {@code null}
     */
    private static String supportedOrNull(String name) {
        try {
            return java.nio.charset.Charset.isSupported(name) ? name : null;
        } catch (IllegalArgumentException ex) {
            // Syntactically illegal charset name: treat it as "nothing declared".
            return null;
        }
    }

    /**
     * Guesses the charset of a byte array with {@link UniversalDetector};
     * returns UTF-8 when detection fails.
     *
     * @param bytes the bytes to inspect
     * @return the detected charset name, or UTF-8 when detection fails
     */
    public static String guessEncodingByMozilla(byte[] bytes) {
        UniversalDetector detector = new UniversalDetector(null);
        detector.handleData(bytes, 0, bytes.length);
        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        detector.reset();
        if (encoding == null) {
            encoding = DEFAULT_ENCODING;
        }
        return encoding;
    }

    /**
     * Guesses the charset of a byte array: a declaration inside the document
     * wins, otherwise statistical detection is used; returns UTF-8 when
     * detection fails.
     *
     * @param content the bytes to inspect
     * @return the most likely charset name, or UTF-8 when detection fails
     */
    public static String guessEncoding(byte[] content) {
        String encoding = null;
        try {
            encoding = guessEncodingByNutch(content);
        } catch (Exception ex) {
            // Declaration sniffing is best-effort; fall through to statistical detection.
        }
        if (encoding == null) {
            encoding = guessEncodingByMozilla(content);
        }
        return encoding;
    }
}

NewsRepository

package com.fourth.spring.simfyspider.dal;

import com.fourth.spring.simfyspider.domain.News;
import com.fourth.spring.simfyspider.domain.Page;
import org.springframework.data.jpa.repository.JpaRepository;

/**
 * Spring Data JPA repository for {@link News} entities.
 *
 * <p>The second type argument of {@link JpaRepository} is the type of the
 * entity's {@code @Id} property. {@code News.newsId} is declared as
 * {@code String}, so the ID type here is {@code String} as well; with the
 * previous {@code Integer} the by-id methods took a key type the entity
 * does not have.
 */
public interface NewsRepository extends JpaRepository<News, String> {

}

News

package com.fourth.spring.simfyspider.domain;

import javax.persistence.*;

/**
 * JPA entity for one crawled news item, mapped to the {@code ss_news} table.
 */
@Entity
@Table(name = "ss_news")
public class News {

    // NOTE(review): IDENTITY generation on a String id is unusual — confirm the
    // key column type of ss_news and whether a numeric id was intended.
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private String newsId;
    private String title;
    private String url;
    private String content;
    private String summary;

    /** @return the generated primary key */
    public String getNewsId() {
        return this.newsId;
    }

    /** @param newsId the primary key to assign */
    public void setNewsId(String newsId) {
        this.newsId = newsId;
    }

    /** @return the news title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the news title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the URL of the news item */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL of the news item */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the stored content text */
    public String getContent() {
        return this.content;
    }

    /** @param content the content text to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the stored summary */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the summary to store */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /**
     * Renders title, url and content; the id and summary are not included.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("News{");
        text.append("title='").append(title).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append(", content='").append(content).append('\'');
        text.append('}');
        return text.toString();
    }
}

Page

package com.fourth.spring.simfyspider.domain;

import com.fourth.spring.simfyspider.common.CharsetDetector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.UnsupportedEncodingException;

/**
 * Wraps the content of one fetched page: the raw bytes plus lazily computed
 * HTML text and parsed document.
 *
 * @author Administrator
 */
public class Page {

    private byte[] content;
    private String url;
    private Document doc;
    private String html;
    private String contentType;
    private String charset;

    /**
     * @param content     raw response bytes
     * @param url         address the bytes were fetched from; also used as the base URI when parsing
     * @param contentType content type reported for the response
     */
    public Page(byte[] content, String url, String contentType) {
        this.content = content;
        this.url = url;
        this.contentType = contentType;
    }

    public byte[] getContent() {
        return content;
    }

    /**
     * Replaces the raw bytes and drops everything derived from the old bytes,
     * so the next {@link #getHtml()} / {@link #getDoc()} reflects the new content.
     */
    public void setContent(byte[] content) {
        this.content = content;
        this.charset = null;
        this.html = null;
        this.doc = null;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Overrides the HTML text and drops the cached document parsed from the old text.
     */
    public void setHtml(String html) {
        this.html = html;
        this.doc = null;
    }

    public String getContentType() {
        return contentType;
    }

    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    /**
     * Returns the page source as a string, decoding the raw bytes on first use.
     *
     * <p>The charset is guessed from the content. If the guessed charset
     * cannot be used by this JVM, the bytes are decoded as UTF-8 instead of
     * giving up, so callers do not get {@code null} for a page that has content.
     *
     * @return the page source, or {@code null} when there is no content
     */
    public String getHtml() {
        if (html != null) {
            return html;
        }
        if (content == null) {
            return null;
        }
        if (charset == null) {
            // Guess the character encoding from the content itself.
            charset = CharsetDetector.guessEncoding(content);
        }
        try {
            this.html = new String(content, charset);
        } catch (UnsupportedEncodingException ex) {
            System.err.println("Unsupported charset '" + charset + "' for " + url + ", decoding as UTF-8");
            this.charset = "UTF-8";
            this.html = new String(content, java.nio.charset.StandardCharsets.UTF_8);
        }
        return html;
    }

    /**
     * Returns the parsed document, parsing the HTML on first use.
     *
     * @return the parsed document, or {@code null} when there is no HTML or parsing fails
     */
    public Document getDoc() {
        if (doc != null) {
            return doc;
        }
        String source = getHtml();
        if (source == null) {
            // Nothing to parse; avoid relying on the parser to throw for null input.
            return null;
        }
        try {
            this.doc = Jsoup.parse(source, url);
            return doc;
        } catch (Exception ex) {
            ex.printStackTrace();
            return null;
        }
    }
}

LinkStore

package com.fourth.spring.simfyspider.service.spider;

import org.springframework.stereotype.Component;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;

/**
 * Keeps track of crawl progress: the set of visited links and the FIFO queue
 * of links still to visit. No synchronization is performed.
 *
 * @author Administrator
 */
@Component
public class LinkStore {
	private Set<String> visitedLinks = new HashSet<>();
	private Queue<String> unVisitedLinkQueue = new LinkedList<>();
	/** Mirrors the queue's content so the duplicate check does not scan the list. */
	private Set<String> queuedLinks = new HashSet<>();

	/**
	 * Queues a link for visiting unless it is null, empty, already visited or
	 * already waiting in the queue.
	 *
	 * @param url the link to queue
	 */
	public void addUnVisitLink(String url) {
		System.out.println("添加未访问"+url);
		if (url == null || url.isEmpty() || visitedLinks.contains(url)) {
			return;
		}
		// Set.add reports whether the link was new, replacing an O(n) Queue.contains scan.
		if (queuedLinks.add(url)) {
			unVisitedLinkQueue.add(url);
		}
	}

	/**
	 * Records a link as visited.
	 *
	 * @param url the link that has been processed
	 */
	public void addVisited(String url) {
		System.out.println("添加已访问"+url);

		visitedLinks.add(url);
	}

	/**
	 * Tells whether a link, or the same link with a trailing slash appended,
	 * has been recorded as visited.
	 *
	 * @param url the link to test; {@code null} is never considered visited
	 * @return {@code true} if the link was already visited
	 */
	public boolean isVisited(String url) {
		if (url == null) {
			return false;
		}
		return visitedLinks.contains(url) || visitedLinks.contains(url.concat("/"));
	}

	/**
	 * @return {@code true} when no link is waiting to be visited
	 */
	public boolean isUnVisitedEmpty() {
		return unVisitedLinkQueue.isEmpty();
	}

	/**
	 * Removes and returns the next link to visit.
	 *
	 * @return the head of the queue, or {@code null} when the queue is empty
	 */
	public String getFirst() {
		String next = unVisitedLinkQueue.poll();
		if (next != null) {
			queuedLinks.remove(next);
		}
		return next;
	}
}

PageParser

package com.fourth.spring.simfyspider.service.spider;

import com.fourth.spring.simfyspider.domain.Page;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;

import java.util.HashSet;
import java.util.Set;

/**
 * Extracts links and target elements from a fetched {@link Page}.
 *
 * @author Administrator
 */
@Component
public class PageParser {

	/**
	 * Collects the absolute {@code href} of every element matched by the selector.
	 *
	 * @param page        the page to inspect
	 * @param cssSelector selector for the link elements
	 * @return the distinct absolute links; empty when the page has no parsed
	 *         document. Hrefs that cannot be resolved to an absolute URL are skipped.
	 */
	public Set<String> getLinks(Page page, String cssSelector){
		System.out.println("开始解析获得连接");

		Set<String> links = new HashSet<>();
		for (Element e : select(page, cssSelector)) {
			String link = e.absUrl("href");
			// absUrl returns "" when the href is missing or cannot be made absolute.
			if (!link.isEmpty()) {
				links.add(link);
			}
		}
		return links;
	}

	/**
	 * Collects all elements matched by any of the given selectors.
	 *
	 * @param page         the page to inspect
	 * @param cssSelectors one or more CSS selectors
	 * @return the matched elements in selector order; empty when nothing
	 *         matches or the page has no parsed document
	 */
	public Elements getTargetElements(Page page ,String... cssSelectors){
		Elements result = new Elements();
		if (cssSelectors == null) {
			return result;
		}
		for (String s : cssSelectors) {
			result.addAll(select(page, s));
		}
		return result;
	}

	/**
	 * Runs a selector against the page's document, returning an empty result
	 * instead of failing when the page, its document or the selector is missing.
	 */
	private Elements select(Page page, String cssSelector) {
		if (page == null || cssSelector == null || page.getDoc() == null) {
			return new Elements();
		}
		return page.getDoc().select(cssSelector);
	}

}

PageVisitor

package com.fourth.spring.simfyspider.service.spider;

import com.fourth.spring.simfyspider.domain.Page;
import org.apache.http.Header;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;

import java.io.IOException;

/**
 * Fetches a URL over HTTP and wraps the response body in a {@link Page}.
 *
 * @author Administrator
 */
@Component
public class PageVisitor {

	/**
	 * Visits a web page.
	 *
	 * @param url the address to fetch
	 * @return the fetched page, or {@code null} when the URL is missing or
	 *         malformed, the request fails, the status is not 200, or the
	 *         response has no body
	 */
	public Page visit(String url) {
		if (url == null || url.isEmpty()) {
			return null;
		}

		// 1. Configure the client (timeouts are passed to HttpClient's RequestConfig).
		RequestConfig config = RequestConfig.custom()
				.setConnectTimeout(50000)
				.setSocketTimeout(50000)
				.setConnectionRequestTimeout(10000).build();

		System.out.println("url:"+url);
		// 2. Execute the request. try-with-resources closes the response and
		//    then the client on every path.
		try (CloseableHttpClient client = HttpClients.custom()
					.setDefaultRequestConfig(config)
					.setRetryHandler(new DefaultHttpRequestRetryHandler())
					// Set a User-Agent so sites do not answer with 403.
					.setUserAgent("Mozilla/5.0(Windows;U;Windows NT 5.1;en-US;rv:0.9.4)")
					.build();
			 CloseableHttpResponse response = client.execute(RequestBuilder.get(url).build())) {
			// 3. Interpret the response.
			if (response.getStatusLine().getStatusCode() != 200) {
				System.out.println("请求"+url+"失败");
				return null;
			}
			if (response.getEntity() == null) {
				return null;
			}
			// Convert the response into the internal Page object.
			byte[] responseBody = EntityUtils.toByteArray(response.getEntity());
			Header contentType = response.getEntity().getContentType();
			// getValue() is the actual content type; getName() would only yield
			// the header's name. The header may be absent altogether.
			String contentTypeValue = contentType == null ? null : contentType.getValue();
			return new Page(responseBody, url, contentTypeValue);
		} catch (IOException | IllegalArgumentException e) {
			// IllegalArgumentException covers URLs that cannot be parsed into a URI.
			e.printStackTrace();
		}

		return null;
	}

}

SimfySpider

package com.fourth.spring.simfyspider.service.spider;

import com.fourth.spring.simfyspider.dal.NewsRepository;
import com.fourth.spring.simfyspider.domain.News;
import com.fourth.spring.simfyspider.domain.Page;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

/**
 * The crawler: wires together the page visitor, the parser and the news
 * repository, and walks list pages starting from a seed URL.
 */
@Service
public class SimfySpider {
	@Autowired
	PageVisitor visitor;
	@Autowired
	PageParser parser;
	@Autowired
	NewsRepository repo;

	/**
	 * Crawls list pages breadth-first from {@code seed}, following the
	 * pagination links, and saves the news items found on each page.
	 *
	 * @param seed URL of the first list page; nothing happens when it is {@code null}
	 */
	public void crawling(String seed) {
		LinkStore linkStore = new LinkStore();
		// Put the seed URL into the unvisited queue.
		System.out.println("seed"+seed);
		linkStore.addUnVisitLink(seed);
		while (!linkStore.isUnVisitedEmpty()) {
			String url = linkStore.getFirst();
			// Mark the page as visited before its links are queued; otherwise a
			// pagination link pointing back at this page would re-queue it and
			// the page would be crawled (and its news saved) twice.
			linkStore.addVisited(url);

			Page page = visitor.visit(url);
			if (page == null || page.getDoc() == null) {
				// Fetch or parse failed: skip this page, keep crawling the rest.
				continue;
			}

			// Queue the pagination links that have not been visited yet.
			Set<String> links = parser.getLinks(page, ".pagebar .nowrap a");
			for (String link : links) {
				if (link == null || link.isEmpty()) {
					continue;
				}
				if (!linkStore.isVisited(link)) {
					linkStore.addUnVisitLink(link);
				}
			}

			// Save only this page's items; re-saving an ever-growing list on
			// each iteration repeats work for everything already stored.
			List<News> pageNews = extractNews(page);
			if (!pageNews.isEmpty()) {
				repo.saveAll(pageNews);
			}
		}
	}

	/**
	 * Builds one {@link News} per {@code .right-title} block of a list page.
	 * Blocks without a link are skipped.
	 */
	private List<News> extractNews(Page listPage) {
		List<News> result = new ArrayList<>();
		Elements titleBlocks = parser.getTargetElements(listPage, ".right-title");
		for (Element block : titleBlocks) {
			Element anchor = block.select("a").first();
			if (anchor == null) {
				continue;
			}
			News news = new News();
			news.setTitle(anchor.text());
			news.setUrl(anchor.absUrl("href"));

			String content = fetchContent(news.getUrl());
			if (content != null) {
				news.setContent(content);
			}
			result.add(news);
			System.out.println(news);
		}
		return result;
	}

	/**
	 * Fetches the detail page of a news item and returns the text of its first
	 * {@code .article-title} element, or {@code null} when the page cannot be
	 * fetched or has no such element.
	 */
	private String fetchContent(String articleUrl) {
		if (articleUrl == null || articleUrl.isEmpty()) {
			return null;
		}
		Page contentPage = visitor.visit(articleUrl);
		if (contentPage == null || contentPage.getDoc() == null) {
			return null;
		}
		// NOTE(review): the selector name suggests this grabs the article's
		// title rather than its body — confirm this is the intended "content".
		Elements contentElements = parser.getTargetElements(contentPage, ".article-title");
		if (contentElements.isEmpty()) {
			return null;
		}
		return contentElements.first().text();
	}
}

CrawlingController

package com.fourth.spring.simfyspider.web;

import com.fourth.spring.simfyspider.service.spider.SimfySpider;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.stereotype.Repository;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.ResponseBody;

/**
 * Web entry point that starts a crawl for the URL given in the request.
 */
@Controller
public class CrawlingController {

    @Autowired
    private SimfySpider simfySpider;

    /**
     * Runs the crawler for the given seed and answers once it has finished.
     *
     * @param seed the seed URL, bound from the request parameter of the same name
     * @return the literal text {@code success}, written directly to the response body
     */
    @ResponseBody
    @RequestMapping(value = "crawl") // URL mapping
    public String crawl(String seed) {
        final String outcome = "success";
        this.simfySpider.crawling(seed);
        return outcome;
    }
}
作者:Yuyy
博客:https://yuyy.info
暂无评论

发送评论 编辑评论


				
|´・ω・)ノ
ヾ(≧∇≦*)ゝ
(☆ω☆)
(╯‵□′)╯︵┴─┴
 ̄﹃ ̄
(/ω\)
∠( ᐛ 」∠)_
(๑•̀ㅁ•́ฅ)
→_→
୧(๑•̀⌄•́๑)૭
٩(ˊᗜˋ*)و
(ノ°ο°)ノ
(´இ皿இ`)
⌇●﹏●⌇
(ฅ´ω`ฅ)
(╯°A°)╯︵○○○
φ( ̄∇ ̄o)
ヾ(´・ ・`。)ノ"
( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃
(ó﹏ò。)
Σ(っ °Д °;)っ
( ,,´・ω・)ノ"(´っω・`。)
╮(╯▽╰)╭
o(*////▽////*)q
>﹏<
( ๑´•ω•) "(ㆆᴗㆆ)
😂
😀
😅
😊
🙂
🙃
😌
😍
😘
😜
😝
😏
😒
🙄
😳
😡
😔
😫
😱
😭
💩
👻
🙌
🖕
👍
👫
👬
👭
🌚
🌝
🙈
💊
😶
🙏
🍦
🍉
😣
Source: github.com/k4yt3x/flowerhd
颜文字
Emoji
小恐龙
花!
上一篇
下一篇