JavaのHTMLパーサー色々

Java HTML

あるページの画像とリンクのURLをすべて取得するサンプルを書いてみる。
大体一緒だから他のサンプルも後で書くか…

import java.io.IOException;
import java.net.MalformedURLException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * jsoup sample: prints the URL of every image and every link on a page.
 */
public class SampleJsoup {

	/**
	 * Downloads the page, then prints image sources followed by link targets,
	 * one per line in the form {@code tag : <url>}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		String url = "ゆーあーるえる";
		try {
			Document page = Jsoup.connect(url).get();
			Elements anchors = page.select("a[href]");
			Elements srcHolders = page.select("[src]");

			// "[src]" matches every element that carries a src attribute;
			// only the <img> ones are reported.
			for (Element candidate : srcHolders) {
				if (!"img".equals(candidate.tagName())) {
					continue;
				}
				report(candidate.tagName(), candidate.attr("abs:src"));
			}

			for (Element anchor : anchors) {
				report(anchor.tagName(), anchor.attr("abs:href"));
			}
		} catch (IOException e) {
			// MalformedURLException is a subclass of IOException, so this
			// single handler covers both failure kinds.
			e.printStackTrace();
		}
	}

	/** Writes one {@code tag : <url>} line to standard output. */
	private static void report(String tag, String address) {
		System.out.println(String.format("%s : <%s>", tag, address));
	}
}
import java.io.InputStream;
import java.net.URL;

import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

/**
 * CyberNeko (NekoHTML) sample: parses a page into a W3C DOM and prints the
 * URL of every link and every image found in it.
 */
public class SampleCyberNekoParser {

	/**
	 * Parses the page and prints link targets followed by image sources,
	 * one per line in the form {@code TAG : <url>}.
	 *
	 * @param args unused
	 * @throws Exception if the page cannot be fetched or parsed
	 */
	public static void main(String[] args) throws Exception{

		URL url = new URL("ゆーあーるえる");
		DOMParser parser = new DOMParser();
		InputStream is = url.openConnection().getInputStream();
		try{
			parser.parse(new InputSource(is));
		}finally{
			// Release the connection even when parsing fails.
			is.close();
		}

		Document doc = parser.getDocument();
		// NOTE(review): tag names are queried in upper case as in the original
		// sample; presumably this matches NekoHTML's default element-name
		// normalisation - confirm against the parser configuration in use.
		NodeList linkList = doc.getElementsByTagName("A");
		NodeList imgList = doc.getElementsByTagName("IMG");

		printUrls(linkList, "href");
		printUrls(imgList, "src");
	}

	/**
	 * Prints {@code TAG : <value>} for each element that actually carries the
	 * given attribute. Elements without it are skipped, because
	 * {@code getAttribute} would return an empty string and produce a
	 * meaningless {@code TAG : <>} line.
	 *
	 * @param elements  elements to inspect
	 * @param attribute name of the attribute holding the URL
	 */
	private static void printUrls(NodeList elements, String attribute) {
		for (int i = 0; i < elements.getLength(); i++) {
			Element elm = (Element) elements.item(i);
			if (!elm.hasAttribute(attribute)) {
				continue;
			}
			System.out.println(String.format("%s : <%s>", elm.getTagName(), elm.getAttribute(attribute)));
		}
	}
}
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;

/**
 * Jericho HTML Parser sample: prints the URL of every link and every image
 * on a page.
 */
public class SampleJerichoHTMLParser {

	/**
	 * Loads the page and prints link targets followed by image sources,
	 * one per line in the form {@code tag : <url>}.
	 *
	 * @param args unused
	 * @throws MalformedURLException if the page address is not a valid URL
	 * @throws IOException if the page cannot be read
	 */
	public static void main(String[] args) throws MalformedURLException, IOException {
		Source source = new Source(new URL("ゆーあーるえる"));
		List<Element> linkList = source.getAllElements(HTMLElementName.A);
		List<Element> imgList = source.getAllElements(HTMLElementName.IMG);

		printUrls(linkList, "href");
		printUrls(imgList, "src");
	}

	/**
	 * Prints {@code tag : <value>} for each element that has the given
	 * attribute. Elements without it are skipped: {@code getAttributeValue}
	 * returns {@code null} in that case, which would otherwise be printed as
	 * the literal text {@code <null>}.
	 *
	 * @param elements  elements to inspect
	 * @param attribute name of the attribute holding the URL
	 */
	private static void printUrls(List<Element> elements, String attribute) {
		for (Element element : elements) {
			String value = element.getAttributeValue(attribute);
			if (value == null) {
				continue;
			}
			System.out.println(String.format("%s : <%s>", element.getName(), value));
		}
	}
}
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

/**
 * HtmlCleaner sample: prints the URL of every link and every image on a page.
 */
public class SampleHTMLCleaner {

	/**
	 * Cleans the page into a tag tree and prints link targets followed by
	 * image sources, one per line in the form {@code tag : <url>}.
	 *
	 * @param args unused
	 * @throws MalformedURLException if the page address is not a valid URL
	 * @throws IOException if the page cannot be read
	 */
	public static void main(String[] args) throws MalformedURLException, IOException {
		CleanerProperties props = new CleanerProperties();
		HtmlCleaner htmlCleaner = new HtmlCleaner(props);

		TagNode tagNode = htmlCleaner.clean(new URL("ゆーあーるえる"));

		// The second argument (true) requests a recursive search of the tree.
		// NOTE(review): the wildcard element type is used because the declared
		// return type of getElementListByName appears to differ between
		// HtmlCleaner releases (raw List vs. List<? extends TagNode>) - confirm
		// against the version on the classpath.
		List<? extends TagNode> linkList = tagNode.getElementListByName("a", true);
		List<? extends TagNode> imgList = tagNode.getElementListByName("img", true);

		printUrls(linkList, "href");
		printUrls(imgList, "src");
	}

	/**
	 * Prints {@code tag : <value>} for each node that has the given attribute.
	 * Nodes without it are skipped: {@code getAttributeByName} returns
	 * {@code null} in that case, which would otherwise be printed as the
	 * literal text {@code <null>}.
	 *
	 * @param nodes     nodes to inspect
	 * @param attribute name of the attribute holding the URL
	 */
	private static void printUrls(List<? extends TagNode> nodes, String attribute) {
		for (TagNode node : nodes) {
			String value = node.getAttributeByName(attribute);
			if (value == null) {
				continue;
			}
			System.out.println(String.format("%s : <%s>", node.getName(), value));
		}
	}
}