imdb-spider/core/src/main/java/ru/bvn13/imdbspider/spider/processor/HtmlProcessor.java

56 lines
1.9 KiB
Java

package ru.bvn13.imdbspider.spider.processor;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException;
import ru.bvn13.imdbspider.exceptions.processor.HtmlToXmlConvertionException;
import ru.bvn13.imdbspider.exceptions.processor.PatternEvaluationException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
/**
* @author boyko_vn at 09.01.2019
*/
public class HtmlProcessor {
public String process(final String html, final String pattern) throws HtmlProcessorException {
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = null;
try {
db = dbf.newDocumentBuilder();
} catch (ParserConfigurationException e) {
throw new HtmlProcessorException(e);
}
Document xml = null;
try {
xml = db.parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)));
} catch (SAXException e) {
throw new HtmlToXmlConvertionException("Html parsing exception", e);
} catch (IOException e) {
throw new HtmlToXmlConvertionException("Html reading exception", e);
}
XPathFactory xpf = XPathFactory.newInstance();
XPath xpath = xpf.newXPath();
String result = null;
try {
result = (String) xpath.evaluate(pattern, xml, XPathConstants.STRING);
} catch (XPathExpressionException e) {
throw new PatternEvaluationException(String.format("Could not evaluate pattern: %s", pattern), e);
}
return result;
}
}