mirror of https://github.com/bvn13/imdb-spider.git
56 lines
1.9 KiB
Java
56 lines
1.9 KiB
Java
package ru.bvn13.imdbspider.spider.processor;
|
|
|
|
|
|
import org.w3c.dom.Document;
|
|
import org.xml.sax.SAXException;
|
|
import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException;
|
|
import ru.bvn13.imdbspider.exceptions.processor.HtmlToXmlConvertionException;
|
|
import ru.bvn13.imdbspider.exceptions.processor.PatternEvaluationException;
|
|
|
|
import javax.xml.parsers.DocumentBuilder;
|
|
import javax.xml.parsers.DocumentBuilderFactory;
|
|
import javax.xml.parsers.ParserConfigurationException;
|
|
import javax.xml.xpath.XPath;
|
|
import javax.xml.xpath.XPathConstants;
|
|
import javax.xml.xpath.XPathExpressionException;
|
|
import javax.xml.xpath.XPathFactory;
|
|
import java.io.ByteArrayInputStream;
|
|
import java.io.IOException;
|
|
import java.nio.charset.StandardCharsets;
|
|
|
|
/**
|
|
* @author boyko_vn at 09.01.2019
|
|
*/
|
|
public class HtmlProcessor {
|
|
|
|
public String process(final String html, final String pattern) throws HtmlProcessorException {
|
|
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
|
|
DocumentBuilder db = null;
|
|
try {
|
|
db = dbf.newDocumentBuilder();
|
|
} catch (ParserConfigurationException e) {
|
|
throw new HtmlProcessorException(e);
|
|
}
|
|
Document xml = null;
|
|
try {
|
|
xml = db.parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)));
|
|
} catch (SAXException e) {
|
|
throw new HtmlToXmlConvertionException("Html parsing exception", e);
|
|
} catch (IOException e) {
|
|
throw new HtmlToXmlConvertionException("Html reading exception", e);
|
|
}
|
|
|
|
XPathFactory xpf = XPathFactory.newInstance();
|
|
XPath xpath = xpf.newXPath();
|
|
String result = null;
|
|
try {
|
|
result = (String) xpath.evaluate(pattern, xml, XPathConstants.STRING);
|
|
} catch (XPathExpressionException e) {
|
|
throw new PatternEvaluationException(String.format("Could not evaluate pattern: %s", pattern), e);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
}
|