`
276833190
  • 浏览: 14064 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

java 数据采集,采集各个网站的小说文章示例

阅读更多

1、首先需要引入 jsoup-1.7.1.jar 包。

 

2、jsoup 的工具类:

package com.wp.util;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

/**
 * Small helpers around jsoup for downloading and parsing HTML pages.
 */
public class JsoupUtil {

	/**
	 * Downloads and parses the page at {@code url}, retrying when an attempt fails.
	 *
	 * <p>Attempts are made until one succeeds or the number of failed attempts exceeds
	 * {@code Constants.url_error_count}. Each attempt uses
	 * {@code Constants.url_ConnectTimeout} as its timeout. When every attempt fails,
	 * the last error is reported on {@code System.err} instead of being dropped.
	 *
	 * @param url absolute URL of the page to fetch
	 * @return the parsed document, or {@code null} when {@code url} is null/empty or
	 *         no attempt succeeded
	 */
	public static Document getDocument(String url) {
		// A missing URL can never succeed, so do not burn the whole retry budget on it.
		if (url == null || url.isEmpty()) {
			return null;
		}

		Exception lastError = null;
		for (int failures = 0; failures <= Constants.url_error_count; failures++) {
			try {
				// NOTE(review): "sdch" is advertised in Accept-Encoding; confirm the
				// HTTP stack can actually decode it if a server ever answers with it.
				Document doc = Jsoup
						.connect(url)
						.timeout(Constants.url_ConnectTimeout)
						.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
						.header("Accept-Encoding", "gzip,deflate,sdch")
						.header("Connection", "keep-alive")
						.followRedirects(true)
						.userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
						.get();
				if (doc != null) {
					return doc;
				}
			} catch (Exception e) {
				// Remember the failure and try again; callers rely on null, not on a throw.
				lastError = e;
			}
		}

		if (lastError != null) {
			System.err.println("Failed to fetch " + url + ": " + lastError);
		}
		return null;
	}

	/**
	 * Parses an HTML string into a jsoup document.
	 *
	 * @param html the HTML markup to parse
	 * @return the document produced by {@code Jsoup.parse(html)}
	 */
	public static Document parseHtml(String html) {
		return Jsoup.parse(html);
	}

}

3、主要采集的类:

 

package com.wp.test;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.wp.util.JsoupUtil;
import com.wp.util.Util;

/**
 * Scrapes a novel: reads the chapter index page, downloads every chapter and appends
 * the chapter titles and text to a single text file.
 */
public class Caiji {

	/** Chapter index page used by {@link #start()}. */
	private static final String DEFAULT_LIST_URL = "http://www.00kxs.com/html/0/596/";

	/** Output file used by {@link #start()}. */
	private static final String DEFAULT_OUTPUT_PATH = "E:/abc.txt";

	public static void main(String[] args) {
		start();
	}

	/**
	 * Scrapes the default novel into the default output file.
	 */
	public static void start() {
		start(DEFAULT_LIST_URL, DEFAULT_OUTPUT_PATH);
	}

	/**
	 * Scrapes the novel whose chapter index is at {@code listUrl} into {@code outputPath}.
	 *
	 * <p>Chapter links are taken from {@code div[id=list] dl dd a} on the index page and
	 * the chapter text from {@code div[id=content]} on each chapter page. A chapter that
	 * cannot be downloaded is written as a title with no body. Errors are printed, not
	 * thrown.
	 *
	 * @param listUrl    URL of the chapter index page
	 * @param outputPath path of the text file to (over)write
	 */
	public static void start(String listUrl, String outputPath) {
		Document doc = JsoupUtil.getDocument(listUrl);
		if (doc == null) {
			// Bail out before opening the file so a failed fetch does not truncate
			// the output of an earlier run.
			System.out.println("没有获取正文");
			return;
		}

		BufferedWriter w = null;
		try {
			// NOTE(review): FileWriter encodes with the platform default charset;
			// confirm that is the encoding wanted for the Chinese text written here.
			w = new BufferedWriter(new FileWriter(new File(outputPath)));

			Elements list_a = doc.select("div[id=list] dl dd a");
			for (Element el : list_a) {
				String url = el.attr("abs:href");
				String name = el.text();
				int chp_num = Util.parseInt(Util.getMatch("第(\\d+)章", name, 1));
				System.out.println(url + "=====" + name + "====" + chp_num);

				Document e_content = JsoupUtil.getDocument(url);
				w.append(name).append("\n\n");
				if (e_content != null) {
					String html = e_content.select("div[id=content]").html();
					w.append(cleanContent(html)).append("\n\n");
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (w != null) {
				try {
					w.close();
				} catch (Exception e) {
					// A failed close can mean buffered text never reached the file.
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Converts a chapter's inner HTML to plain text lines.
	 *
	 * @param html inner HTML of the chapter content element
	 * @return the text with non-breaking spaces turned into spaces, line-break tags
	 *         turned into newlines, and runs of newlines collapsed to one
	 */
	private static String cleanContent(String html) {
		// Handle both the entity and the raw character form of a non-breaking space.
		String text = html.replace("&nbsp;", " ").replace("\u00a0", " ");
		// Accept <br>, <br/> and <br /> so the result does not depend on how the
		// HTML serializer spells the tag.
		text = text.replaceAll("(?i)<br\\s*/?>", "\n");
		return text.replaceAll("\n+", "\n");
	}

}

  

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics