imdb-spider/core/src/main/java/ru/bvn13/imdbspider/spider/extractor/HtmlExtractor.java

88 lines
2.9 KiB
Java

package ru.bvn13.imdbspider.spider.extractor;
import ru.bvn13.imdbspider.exceptions.extractor.HtmlExtractorException;
import ru.bvn13.imdbspider.exceptions.extractor.MalformedUrlException;
import ru.bvn13.imdbspider.exceptions.extractor.ConnectionEstablishingException;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.Map;
/**
 * Downloads the body of a web page over HTTP(S) and returns it as a single string.
 *
 * @author boyko_vn at 09.01.2019
 */
public class HtmlExtractor {

    /** Browser-like User-Agent header value sent with every request. */
    private static final String UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36";

    /** Maximum time to wait for the TCP connection to be established, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 15_000;

    /** Maximum time to wait for data on an established connection, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 30_000;

    /**
     * Builds a URL query string ({@code key1=value1&key2=value2}) from the given parameters.
     * Keys and values are URL-encoded as UTF-8; entries appear in the map's iteration order.
     *
     * @param params parameters to encode; {@code null} or empty yields an empty string,
     *               a {@code null} value is encoded as an empty value
     * @return the encoded query string without a leading {@code ?} or trailing {@code &}
     * @throws UnsupportedEncodingException if UTF-8 is not supported by the runtime
     */
    private static String getParamsString(Map<String, String> params) throws UnsupportedEncodingException {
        if (params == null || params.isEmpty()) {
            return "";
        }
        StringBuilder result = new StringBuilder();
        for (Map.Entry<String, String> entry : params.entrySet()) {
            // Separator goes before every entry but the first, so nothing has to be trimmed afterwards.
            if (result.length() > 0) {
                result.append('&');
            }
            result.append(URLEncoder.encode(entry.getKey(), "UTF-8"));
            result.append('=');
            String value = entry.getValue();
            result.append(value == null ? "" : URLEncoder.encode(value, "UTF-8"));
        }
        return result.toString();
    }

    /**
     * Performs an HTTP GET request and returns the response body.
     * <p>
     * The body is decoded as UTF-8 and read line by line; line terminators are dropped,
     * so the returned string contains all lines concatenated without separators.
     *
     * @param url absolute {@code http} or {@code https} URL of the page to download
     * @return the response body as one string
     * @throws MalformedUrlException if {@code url} cannot be parsed or does not use HTTP(S)
     * @throws ConnectionEstablishingException if the connection cannot be opened or configured,
     *                                         or the response cannot be read (including timeouts)
     * @throws HtmlExtractorException declared common type for the failures listed above
     */
    public String getHtml(String url) throws HtmlExtractorException {
        URL target;
        try {
            target = new URL(url);
        } catch (MalformedURLException e) {
            throw new MalformedUrlException(String.format("Wrong url: %s", url), e);
        }

        // Only HTTP(S) URLs yield an HttpURLConnection; reject anything else here instead of
        // letting the cast below fail with an unchecked ClassCastException.
        String protocol = target.getProtocol();
        if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
            throw new MalformedUrlException(String.format("Wrong url: %s", url),
                    new MalformedURLException("Unsupported protocol: " + protocol));
        }

        HttpURLConnection connection;
        try {
            connection = (HttpURLConnection) target.openConnection();
        } catch (IOException e) {
            throw new ConnectionEstablishingException(String.format("Unable to open connection by utl: %s", url), e);
        }

        // Without explicit timeouts a stalled server would block the caller indefinitely.
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);
        connection.setRequestProperty("Accept", "text/html");
        connection.setRequestProperty("User-Agent", UA);
        try {
            connection.setRequestMethod("GET");
        } catch (ProtocolException e) {
            throw new ConnectionEstablishingException(String.format("Wrong protocol GET for utl: %s", url), e);
        }

        // Explicit charset: the platform default would corrupt non-ASCII text on non-UTF-8 systems.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
            StringBuilder response = new StringBuilder();
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                response.append(inputLine);
            }
            return response.toString();
        } catch (IOException e) {
            throw new ConnectionEstablishingException(String.format("Could not get input stream for utl: %s", url), e);
        }
    }
}