imdb-spider/core/src/main/java/ru/bvn13/imdbspider/spider/processor/HtmlProcessor.java

package ru.bvn13.imdbspider.spider.processor;


import org.w3c.dom.Document;
import org.xml.sax.SAXException;
import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException;
import ru.bvn13.imdbspider.exceptions.processor.HtmlToXmlConvertionException;
import ru.bvn13.imdbspider.exceptions.processor.PatternEvaluationException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

/**
 * Evaluates an XPath expression against markup that is parsed with the JAXP XML parser.
 *
 * <p>The input is handed to {@link DocumentBuilder#parse(java.io.InputStream)}, so it has to be
 * well-formed XML (e.g. XHTML); markup the XML parser rejects results in an
 * {@link HtmlToXmlConvertionException}.
 *
 * <p>The parser is configured not to resolve external entities or load external DTDs, because the
 * markup being processed may come from an untrusted source (XXE / SSRF hardening).
 *
 * @author boyko_vn at 09.01.2019
 */
public class HtmlProcessor {

    /** SAX feature: include external general entities. Disabled to prevent XXE. */
    private static final String EXTERNAL_GENERAL_ENTITIES =
            "http://xml.org/sax/features/external-general-entities";

    /** SAX feature: include external parameter entities. Disabled to prevent XXE. */
    private static final String EXTERNAL_PARAMETER_ENTITIES =
            "http://xml.org/sax/features/external-parameter-entities";

    /** Xerces feature: load the external DTD even when not validating. Disabled to avoid remote fetches. */
    private static final String LOAD_EXTERNAL_DTD =
            "http://apache.org/xml/features/nonvalidating/load-external-dtd";

    /**
     * Parses {@code html} as XML and returns the string value of {@code pattern} evaluated against
     * the resulting document.
     *
     * @param html    markup to parse; encoded as UTF-8 before being handed to the parser
     * @param pattern XPath expression to evaluate against the document root
     * @return the XPath string value of the expression (per XPath string conversion, an expression
     *         that selects nothing yields an empty string)
     * @throws HtmlProcessorException if the parser cannot be configured or created; the
     *         parsing/evaluation failures are reported as {@link HtmlToXmlConvertionException}
     *         and {@link PatternEvaluationException} respectively
     */
    public String process(final String html, final String pattern) throws HtmlProcessorException {
        final Document xml = parse(newDocumentBuilder(), html);
        return evaluate(xml, pattern);
    }

    /** Creates a document builder that never resolves external entities or external DTDs. */
    private static DocumentBuilder newDocumentBuilder() throws HtmlProcessorException {
        final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        try {
            // DOCTYPE declarations are still accepted (XHTML pages usually carry one); only the
            // resolution of anything outside the document itself is switched off. As a consequence,
            // entities declared solely in an external DTD are no longer expanded.
            dbf.setFeature(EXTERNAL_GENERAL_ENTITIES, false);
            dbf.setFeature(EXTERNAL_PARAMETER_ENTITIES, false);
            dbf.setFeature(LOAD_EXTERNAL_DTD, false);
            return dbf.newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            throw new HtmlProcessorException(e);
        }
    }

    /** Parses the UTF-8 encoded markup into a DOM document. */
    private static Document parse(final DocumentBuilder db, final String html)
            throws HtmlToXmlConvertionException {
        try {
            return db.parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)));
        } catch (SAXException e) {
            throw new HtmlToXmlConvertionException("Html parsing exception", e);
        } catch (IOException e) {
            throw new HtmlToXmlConvertionException("Html reading exception", e);
        }
    }

    /** Evaluates the XPath expression against the document and returns its string value. */
    private static String evaluate(final Document xml, final String pattern)
            throws PatternEvaluationException {
        final XPath xpath = XPathFactory.newInstance().newXPath();
        try {
            return (String) xpath.evaluate(pattern, xml, XPathConstants.STRING);
        } catch (XPathExpressionException e) {
            throw new PatternEvaluationException(String.format("Could not evaluate pattern: %s", pattern), e);
        }
    }

}