From 4205b7ad2788e37b567353ce1feecf7e0adbd45c Mon Sep 17 00:00:00 2001 From: Vyacheslav Boyko Date: Wed, 9 Jan 2019 17:57:37 +0300 Subject: [PATCH] start working on project --- .gitignore | 5 + core/pom.xml | 43 +++++++ core/src/main/java/module-info.java | 9 ++ .../java/ru/bvn13/imdbspider/ImdbSpider.java | 78 +++++++++++++ .../exceptions/ImdbSpiderException.java | 26 +++++ .../api/DataTypeNotSupportedException.java | 28 +++++ .../ConnectionEstablishingException.java | 26 +++++ .../extractor/HtmlExtractorException.java | 27 +++++ .../extractor/MalformedUrlException.java | 27 +++++ .../processor/HtmlProcessorException.java | 28 +++++ .../HtmlToXmlConvertionException.java | 26 +++++ .../processor/PatternEvaluationException.java | 26 +++++ .../ru/bvn13/imdbspider/imdb/DataType.java | 10 ++ .../ru/bvn13/imdbspider/imdb/ImdbObject.java | 26 +++++ .../java/ru/bvn13/imdbspider/imdb/Movie.java | 30 +++++ .../bvn13/imdbspider/imdb/MovieDataType.java | 26 +++++ .../ru/bvn13/imdbspider/imdb/MovieList.java | 10 ++ .../imdbspider/spider/api/ApiFactory.java | 17 +++ .../spider/api/v1_0/ApiFactory_1_0.java | 47 ++++++++ .../spider/extractor/HtmlExtractor.java | 87 ++++++++++++++ .../spider/processor/HtmlProcessor.java | 55 +++++++++ .../imdbspider/spider/tasker/Manager.java | 59 ++++++++++ .../bvn13/imdbspider/spider/tasker/Task.java | 76 ++++++++++++ .../imdbspider/spider/tasker/Worker.java | 67 +++++++++++ .../ru/bvn13/imdbspider/runner/AppTest.java | 26 +++++ pom.xml | 108 ++++++++++++++++++ runner/pom.xml | 86 ++++++++++++++ runner/src/main/java/module-info.java | 5 + .../java/ru/bvn13/imdbspider/runner/App.java | 13 +++ .../ru/bvn13/imdbspider/runner/AppTest.java | 17 +++ 30 files changed, 1114 insertions(+) create mode 100644 core/pom.xml create mode 100644 core/src/main/java/module-info.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/ImdbSpider.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/exceptions/ImdbSpiderException.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/exceptions/api/DataTypeNotSupportedException.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/exceptions/extractor/ConnectionEstablishingException.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/exceptions/extractor/HtmlExtractorException.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/exceptions/extractor/MalformedUrlException.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/HtmlProcessorException.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/HtmlToXmlConvertionException.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/PatternEvaluationException.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/imdb/DataType.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/imdb/ImdbObject.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/imdb/Movie.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/imdb/MovieDataType.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/imdb/MovieList.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/spider/api/ApiFactory.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/spider/extractor/HtmlExtractor.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/spider/processor/HtmlProcessor.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Manager.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Task.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Worker.java create mode 100644 core/src/test/java/ru/bvn13/imdbspider/runner/AppTest.java create mode 100644 pom.xml create mode 100644 runner/pom.xml create mode 100644 runner/src/main/java/module-info.java create mode 100644 runner/src/main/java/ru/bvn13/imdbspider/runner/App.java create mode 100644 runner/src/test/java/ru/bvn13/imdbspider/runner/AppTest.java diff --git a/.gitignore b/.gitignore index a1c2a23..a043496 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,8 @@ # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml hs_err_pid* + +#intellij +*.ipr +*.iws +*.iml diff --git a/core/pom.xml b/core/pom.xml new file mode 100644 index 0000000..6d37707 --- /dev/null +++ b/core/pom.xml @@ -0,0 +1,43 @@ + + + + + parent + ru.bvn13.imdbspider + 1.0-SNAPSHOT + + + 4.0.0 + + imdb-spider-core + + IMDB-SPIDER :: CORE + + jar + + + UTF-8 + + + + + + + + + + + + + + + + junit + junit + 4.11 + test + + + + diff --git a/core/src/main/java/module-info.java b/core/src/main/java/module-info.java new file mode 100644 index 0000000..001f792 --- /dev/null +++ b/core/src/main/java/module-info.java @@ -0,0 +1,9 @@ +module imdb.spider.core { + //exports ru.bvn13.imdbspider; + exports ru.bvn13.imdbspider.imdb; + exports ru.bvn13.imdbspider.spider.tasker; + exports ru.bvn13.imdbspider.exceptions; + + requires java.xml; + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/ImdbSpider.java b/core/src/main/java/ru/bvn13/imdbspider/ImdbSpider.java new file mode 100644 index 0000000..ab8b288 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/ImdbSpider.java @@ -0,0 +1,78 @@ +package ru.bvn13.imdbspider; + +import ru.bvn13.imdbspider.exceptions.api.DataTypeNotSupportedException; +import ru.bvn13.imdbspider.imdb.Movie; +import ru.bvn13.imdbspider.imdb.MovieDataType; +import ru.bvn13.imdbspider.spider.api.v1_0.ApiFactory_1_0; +import ru.bvn13.imdbspider.spider.tasker.Manager; +import ru.bvn13.imdbspider.spider.tasker.Task; +import ru.bvn13.imdbspider.spider.api.ApiFactory; + +import java.net.URLEncoder; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.List; +import java.util.concurrent.ExecutionException; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class ImdbSpider { + + private static final String URL_MAIN = "https://www.imdb.com/"; + private static final String URL_SEARCH_TITLE = "https://www.imdb.com/find?ref_=nv_sr_fn&q={{title}}&s=tt"; + + private Manager manager; + + private ApiFactory apiFactory; + + public static ImdbSpider withApi_1_0() { + return new ImdbSpider(new ApiFactory_1_0()); + } + + + public ImdbSpider(ApiFactory apiFactory) { + + manager = new Manager(); + + } + + public List searchMovieByTitle(String title) { + return searchMovieByTitle(title, 10); + } + + public List searchMovieByTitle(String title, int maxCount) { + return searchMovieByTitle(title, maxCount, EnumSet.of(MovieDataType.TITLE)); + } + + public List searchMovieByTitle(String title, int maxCount, EnumSet dataTypes) { + + String url = URL_SEARCH_TITLE.replace("{{title}}", URLEncoder.encode(title, Charset.forName("utf-8"))); + + List tasks = new ArrayList<>(); + + for (MovieDataType mdt : MovieDataType.values()) { + if (dataTypes.contains(mdt)) { + try { + tasks.add(apiFactory.taskByDataType(mdt)); + } catch (DataTypeNotSupportedException e) { + //do nothing + e.printStackTrace(); + } + } + } + + try { + tasks = manager.processTasks(tasks); + } catch (ExecutionException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + + return null; + + } + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/exceptions/ImdbSpiderException.java b/core/src/main/java/ru/bvn13/imdbspider/exceptions/ImdbSpiderException.java new file mode 100644 index 0000000..76f20f3 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/exceptions/ImdbSpiderException.java @@ -0,0 +1,26 @@ +package ru.bvn13.imdbspider.exceptions; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class ImdbSpiderException extends Exception { + + public ImdbSpiderException() { + } + + public ImdbSpiderException(String message) { + super(message); + } + + public ImdbSpiderException(String message, Throwable cause) { + super(message, cause); + } + + public ImdbSpiderException(Throwable cause) { + super(cause); + } + + public ImdbSpiderException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/exceptions/api/DataTypeNotSupportedException.java b/core/src/main/java/ru/bvn13/imdbspider/exceptions/api/DataTypeNotSupportedException.java new file mode 100644 index 0000000..ea0ca6b --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/exceptions/api/DataTypeNotSupportedException.java @@ -0,0 +1,28 @@ +package ru.bvn13.imdbspider.exceptions.api; + +import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class DataTypeNotSupportedException extends ImdbSpiderException { + + public DataTypeNotSupportedException() { + } + + public DataTypeNotSupportedException(String message) { + super(message); + } + + public DataTypeNotSupportedException(String message, Throwable cause) { + super(message, cause); + } + + public DataTypeNotSupportedException(Throwable cause) { + super(cause); + } + + public DataTypeNotSupportedException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/exceptions/extractor/ConnectionEstablishingException.java b/core/src/main/java/ru/bvn13/imdbspider/exceptions/extractor/ConnectionEstablishingException.java new file mode 100644 index 0000000..33cc838 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/exceptions/extractor/ConnectionEstablishingException.java @@ -0,0 +1,26 @@ +package ru.bvn13.imdbspider.exceptions.extractor; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class ConnectionEstablishingException extends HtmlExtractorException { + + public ConnectionEstablishingException() { + } + + public ConnectionEstablishingException(String message) { + super(message); + } + + public ConnectionEstablishingException(String message, Throwable cause) { + super(message, cause); + } + + public ConnectionEstablishingException(Throwable cause) { + super(cause); + } + + public ConnectionEstablishingException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/exceptions/extractor/HtmlExtractorException.java b/core/src/main/java/ru/bvn13/imdbspider/exceptions/extractor/HtmlExtractorException.java new file mode 100644 index 0000000..4d826b8 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/exceptions/extractor/HtmlExtractorException.java @@ -0,0 +1,27 @@ +package ru.bvn13.imdbspider.exceptions.extractor; + +import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class HtmlExtractorException extends ImdbSpiderException { + public HtmlExtractorException() { + } + + public HtmlExtractorException(String message) { + super(message); + } + + public HtmlExtractorException(String message, Throwable cause) { + super(message, cause); + } + + public HtmlExtractorException(Throwable cause) { + super(cause); + } + + public HtmlExtractorException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/exceptions/extractor/MalformedUrlException.java b/core/src/main/java/ru/bvn13/imdbspider/exceptions/extractor/MalformedUrlException.java new file mode 100644 index 0000000..90fd899 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/exceptions/extractor/MalformedUrlException.java @@ -0,0 +1,27 @@ +package ru.bvn13.imdbspider.exceptions.extractor; + + +/** + * @author boyko_vn at 09.01.2019 + */ +public class MalformedUrlException extends HtmlExtractorException { + + public MalformedUrlException() { + } + + public MalformedUrlException(String message) { + super(message); + } + + public MalformedUrlException(String message, Throwable cause) { + super(message, cause); + } + + public MalformedUrlException(Throwable cause) { + super(cause); + } + + public MalformedUrlException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/HtmlProcessorException.java b/core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/HtmlProcessorException.java new file mode 100644 index 0000000..6912140 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/HtmlProcessorException.java @@ -0,0 +1,28 @@ +package ru.bvn13.imdbspider.exceptions.processor; + +import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class HtmlProcessorException extends ImdbSpiderException { + + public HtmlProcessorException() { + } + + public HtmlProcessorException(String message) { + super(message); + } + + public HtmlProcessorException(String message, Throwable cause) { + super(message, cause); + } + + public HtmlProcessorException(Throwable cause) { + super(cause); + } + + public HtmlProcessorException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/HtmlToXmlConvertionException.java b/core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/HtmlToXmlConvertionException.java new file mode 100644 index 0000000..ea2ca47 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/HtmlToXmlConvertionException.java @@ -0,0 +1,26 @@ +package ru.bvn13.imdbspider.exceptions.processor; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class HtmlToXmlConvertionException extends HtmlProcessorException { + + public HtmlToXmlConvertionException() { + } + + public HtmlToXmlConvertionException(String message) { + super(message); + } + + public HtmlToXmlConvertionException(String message, Throwable cause) { + super(message, cause); + } + + public HtmlToXmlConvertionException(Throwable cause) { + super(cause); + } + + public HtmlToXmlConvertionException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/PatternEvaluationException.java b/core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/PatternEvaluationException.java new file mode 100644 index 0000000..c66d336 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/PatternEvaluationException.java @@ -0,0 +1,26 @@ +package ru.bvn13.imdbspider.exceptions.processor; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class PatternEvaluationException extends HtmlProcessorException { + + public PatternEvaluationException() { + } + + public PatternEvaluationException(String message) { + super(message); + } + + public PatternEvaluationException(String message, Throwable cause) { + super(message, cause); + } + + public PatternEvaluationException(Throwable cause) { + super(cause); + } + + public PatternEvaluationException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/imdb/DataType.java b/core/src/main/java/ru/bvn13/imdbspider/imdb/DataType.java new file mode 100644 index 0000000..8e910d1 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/imdb/DataType.java @@ -0,0 +1,10 @@ +package ru.bvn13.imdbspider.imdb; + +/** + * @author boyko_vn at 09.01.2019 + */ +public interface DataType { + + String get(); + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/imdb/ImdbObject.java b/core/src/main/java/ru/bvn13/imdbspider/imdb/ImdbObject.java new file mode 100644 index 0000000..9db0a44 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/imdb/ImdbObject.java @@ -0,0 +1,26 @@ +package ru.bvn13.imdbspider.imdb; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class ImdbObject { + + private int id; + private String url; + + public int getId() { + return id; + } + + public void setId(int id) { + this.id = id; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/imdb/Movie.java b/core/src/main/java/ru/bvn13/imdbspider/imdb/Movie.java new file mode 100644 index 0000000..4c2b039 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/imdb/Movie.java @@ -0,0 +1,30 @@ +package ru.bvn13.imdbspider.imdb; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class Movie extends ImdbObject { + + private String title; + private Map akas = new ConcurrentHashMap<>(50); + + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public Map getAkas() { + return akas; + } + + public void setAkas(Map akas) { + this.akas = akas; + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieDataType.java b/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieDataType.java new file mode 100644 index 0000000..7d0b8cb --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieDataType.java @@ -0,0 +1,26 @@ +package ru.bvn13.imdbspider.imdb; + +import java.util.EnumSet; + +/** + * @author boyko_vn at 09.01.2019 + */ +public enum MovieDataType implements DataType { + + TITLE("title") + + ; + + private String value; + + MovieDataType(String v) { + value = v; + } + + public static final EnumSet ALL_DATA = EnumSet.allOf(MovieDataType.class); + + @Override + public String get() { + return value; + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieList.java b/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieList.java new file mode 100644 index 0000000..08d3849 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieList.java @@ -0,0 +1,10 @@ +package ru.bvn13.imdbspider.imdb; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class MovieList extends ImdbObject { + + + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/api/ApiFactory.java b/core/src/main/java/ru/bvn13/imdbspider/spider/api/ApiFactory.java new file mode 100644 index 0000000..9ccd0ca --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/api/ApiFactory.java @@ -0,0 +1,17 @@ +package ru.bvn13.imdbspider.spider.api; + +import ru.bvn13.imdbspider.exceptions.api.DataTypeNotSupportedException; +import ru.bvn13.imdbspider.imdb.DataType; +import ru.bvn13.imdbspider.imdb.ImdbObject; +import ru.bvn13.imdbspider.spider.tasker.Task; + +/** + * @author boyko_vn at 09.01.2019 + */ +public interface ApiFactory { + + Task taskByDataType(DataType dataType) throws DataTypeNotSupportedException; + + void fulfillImdbObject(ImdbObject imdbObject, Task task); + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java new file mode 100644 index 0000000..d5da9b2 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java @@ -0,0 +1,47 @@ +package ru.bvn13.imdbspider.spider.api.v1_0; + +import ru.bvn13.imdbspider.exceptions.api.DataTypeNotSupportedException; +import ru.bvn13.imdbspider.imdb.DataType; +import ru.bvn13.imdbspider.imdb.ImdbObject; +import ru.bvn13.imdbspider.imdb.Movie; +import ru.bvn13.imdbspider.imdb.MovieDataType; +import ru.bvn13.imdbspider.spider.api.ApiFactory; +import ru.bvn13.imdbspider.spider.tasker.Task; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class ApiFactory_1_0 implements ApiFactory { + + @Override + public Task taskByDataType(DataType dataType) throws DataTypeNotSupportedException { + if (dataType instanceof MovieDataType) { + return taskByMovieDataType((MovieDataType) dataType); + } else { + throw new DataTypeNotSupportedException(String.format("DataType %s not supported by API v1_0!", dataType.getClass().getName())); + } + } + + @Override + public void fulfillImdbObject(ImdbObject imdbObject, Task task) { + if (imdbObject instanceof Movie) { + if (task.getDataType() instanceof MovieDataType) { + fulfillMovie((Movie) imdbObject, task); + } + } + } + + private Task taskByMovieDataType(MovieDataType movieDataType) { + switch (movieDataType) { + case TITLE: return new Task(); + default: return null; + } + } + + private void fulfillMovie(Movie movie, Task task) { + switch ((MovieDataType) task.getDataType()) { + case TITLE: movie.setTitle(task.getResult()); break; + } + } + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/extractor/HtmlExtractor.java b/core/src/main/java/ru/bvn13/imdbspider/spider/extractor/HtmlExtractor.java new file mode 100644 index 0000000..d0a42a5 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/extractor/HtmlExtractor.java @@ -0,0 +1,87 @@ +package ru.bvn13.imdbspider.spider.extractor; + +import ru.bvn13.imdbspider.exceptions.extractor.HtmlExtractorException; +import ru.bvn13.imdbspider.exceptions.extractor.MalformedUrlException; +import ru.bvn13.imdbspider.exceptions.extractor.ConnectionEstablishingException; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.net.*; +import java.util.Map; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class HtmlExtractor { + + private static final String UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"; + + private static String getParamsString(Map params) throws UnsupportedEncodingException { + StringBuilder result = new StringBuilder(); + + for (Map.Entry entry : params.entrySet()) { + result.append(URLEncoder.encode(entry.getKey(), "UTF-8")); + result.append("="); + result.append(URLEncoder.encode(entry.getValue(), "UTF-8")); + result.append("&"); + } + + String resultString = result.toString(); + return resultString.length() > 0 + ? resultString.substring(0, resultString.length() - 1) + : resultString; + } + + public String getHtml(String url) throws HtmlExtractorException { + + URL obj = null; + + try { + obj = new URL(url); + } catch (MalformedURLException e) { + throw new MalformedUrlException(String.format("Wrong url: %s", url), e); + } + + HttpURLConnection connection = null; + try { + connection = (HttpURLConnection) obj.openConnection(); + } catch (IOException e) { + throw new ConnectionEstablishingException(String.format("Unable to open connection by utl: %s", url), e); + } + + connection.setRequestProperty("Accept", "text/html"); + + try { + connection.setRequestMethod("GET"); + } catch (ProtocolException e) { + throw new ConnectionEstablishingException(String.format("Wrong protocol GET for utl: %s", url), e); + } + + BufferedReader in = null; + try { + String inputLine; + StringBuilder response = new StringBuilder(); + in = new BufferedReader(new InputStreamReader(connection.getInputStream())); + + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + } + + return response.toString(); + } catch (IOException e) { + throw new ConnectionEstablishingException(String.format("Could not get input stream for utl: %s", url), e); + } finally { + try { + if (in != null) { + in.close(); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + + } + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/processor/HtmlProcessor.java b/core/src/main/java/ru/bvn13/imdbspider/spider/processor/HtmlProcessor.java new file mode 100644 index 0000000..f8af553 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/processor/HtmlProcessor.java @@ -0,0 +1,55 @@ +package ru.bvn13.imdbspider.spider.processor; + + +import org.w3c.dom.Document; +import org.xml.sax.SAXException; +import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException; +import ru.bvn13.imdbspider.exceptions.processor.HtmlToXmlConvertionException; +import ru.bvn13.imdbspider.exceptions.processor.PatternEvaluationException; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpressionException; +import javax.xml.xpath.XPathFactory; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class HtmlProcessor { + + public String process(final String html, final String pattern) throws HtmlProcessorException { + DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); + DocumentBuilder db = null; + try { + db = dbf.newDocumentBuilder(); + } catch (ParserConfigurationException e) { + throw new HtmlProcessorException(e); + } + Document xml = null; + try { + xml = db.parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8))); + } catch (SAXException e) { + throw new HtmlToXmlConvertionException("Html parsing exception", e); + } catch (IOException e) { + throw new HtmlToXmlConvertionException("Html reading exception", e); + } + + XPathFactory xpf = XPathFactory.newInstance(); + XPath xpath = xpf.newXPath(); + String result = null; + try { + result = (String) xpath.evaluate(pattern, xml, XPathConstants.STRING); + } catch (XPathExpressionException e) { + throw new PatternEvaluationException(String.format("Could not evaluate pattern: %s", pattern), e); + } + + return result; + } + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Manager.java b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Manager.java new file mode 100644 index 0000000..ce498a9 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Manager.java @@ -0,0 +1,59 @@ +package ru.bvn13.imdbspider.spider.tasker; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.concurrent.*; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class Manager { + + private ExecutorService executor; + + public Manager() { + this.executor = Executors.newCachedThreadPool(); + } + + + public List processTasks(List allTasks) throws ExecutionException, InterruptedException { + + Map> groupedTasks = new ConcurrentHashMap<>(allTasks.size()); + + for (Task task : allTasks) { + List filteredTasks = null; + + if (groupedTasks.keySet().contains(task.getUrl())) { + filteredTasks = groupedTasks.get(task.getUrl()); + } else { + filteredTasks = new ArrayList<>(); + groupedTasks.put(task.getUrl(), filteredTasks); + } + + filteredTasks.add(task); + } + + List result = Collections.synchronizedList(new ArrayList<>()); + + groupedTasks.entrySet().parallelStream().forEach(stringListEntry -> { + Future> r = executor.submit(new Worker(stringListEntry.getKey(), stringListEntry.getValue())); + while (!r.isDone()) { + Thread.yield(); + } + try { + result.addAll(r.get()); + } catch (InterruptedException e) { + e.printStackTrace(); + } catch (ExecutionException e) { + e.printStackTrace(); + } + }); + + + return result; + + } + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Task.java b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Task.java new file mode 100644 index 0000000..a26ca32 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Task.java @@ -0,0 +1,76 @@ +package ru.bvn13.imdbspider.spider.tasker; + +import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; +import ru.bvn13.imdbspider.imdb.DataType; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class Task { + + private String url; + + private String xpathPattern; + private DataType dataType; + private String result; + + private ImdbSpiderException exception; + + public Task() { + } + + public Task(String xpathPattern) { + this.xpathPattern = xpathPattern; + } + + public Task(String url, String xpathPattern) { + this.url = url; + this.xpathPattern = xpathPattern; + } + + public Task(String url, String xpathPattern, DataType dataType) { + this.url = url; + this.xpathPattern = xpathPattern; + this.dataType = dataType; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getXpathPattern() { + return xpathPattern; + } + + public void setXpathPattern(String xpathPattern) { + this.xpathPattern = xpathPattern; + } + + public DataType getDataType() { + return dataType; + } + + public void setDataType(DataType dataType) { + this.dataType = dataType; + } + + public String getResult() { + return result; + } + + public void setResult(String result) { + this.result = result; + } + + public ImdbSpiderException getException() { + return exception; + } + + public void setException(ImdbSpiderException exception) { + this.exception = exception; + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Worker.java b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Worker.java new file mode 100644 index 0000000..df0d6bf --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Worker.java @@ -0,0 +1,67 @@ +package ru.bvn13.imdbspider.spider.tasker; + +import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; +import ru.bvn13.imdbspider.exceptions.extractor.HtmlExtractorException; +import ru.bvn13.imdbspider.spider.extractor.HtmlExtractor; +import ru.bvn13.imdbspider.spider.processor.HtmlProcessor; + +import java.util.List; +import java.util.concurrent.*; + +/** + * @author boyko_vn at 09.01.2019 + */ +public class Worker implements Callable> { + + private final String url; + private final List tasks; + + private final HtmlExtractor htmlExtractor; + private final HtmlProcessor htmlProcessor; + + private final ExecutorService executor; + + public Worker(String url, List tasks) { + this.url = url; + this.tasks = tasks; + + this.htmlExtractor = new HtmlExtractor(); + this.htmlProcessor = new HtmlProcessor(); + + this.executor = Executors.newCachedThreadPool(); + } + + + @Override + public List call() throws Exception { + Future result = executor.submit(() -> htmlExtractor.getHtml(url)); + while (!result.isDone()) { + Thread.yield(); + } + + final String html; + try { + html = result.get(); + } catch (InterruptedException e) { + throw new ImdbSpiderException("Interrupted", e); + } catch (ExecutionException e) { + throw new HtmlExtractorException("Exception has been occurred", e); + } + + tasks.parallelStream().forEach(task -> { + Future taskResult = executor.submit(() -> htmlProcessor.process(html, task.getXpathPattern())); + while (!taskResult.isDone()) { + Thread.yield(); + } + try { + task.setResult(taskResult.get()); + } catch (InterruptedException e) { + task.setException(new ImdbSpiderException("Interrupted", e)); + } catch (ExecutionException e) { + task.setException(new ImdbSpiderException("Exception has been occurred", e)); + } + }); + + return tasks; + } +} diff --git a/core/src/test/java/ru/bvn13/imdbspider/runner/AppTest.java b/core/src/test/java/ru/bvn13/imdbspider/runner/AppTest.java new file mode 100644 index 0000000..0616c84 --- /dev/null +++ b/core/src/test/java/ru/bvn13/imdbspider/runner/AppTest.java @@ -0,0 +1,26 @@ +package ru.bvn13.imdbspider.runner; + +import static org.junit.Assert.assertTrue; + +import org.junit.BeforeClass; +import org.junit.Test; +import ru.bvn13.imdbspider.ImdbSpider; +import ru.bvn13.imdbspider.imdb.Movie; + +import java.util.List; + + +public class AppTest +{ + private static ImdbSpider spider; + + @BeforeClass + public static void initClass() { + spider = ImdbSpider.withApi_1_0(); + } + + @Test + public void searchTerminatorTest() { + List result = spider.searchMovieByTitle("Терминатор", 5); + } +} diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..86856f6 --- /dev/null +++ b/pom.xml @@ -0,0 +1,108 @@ + + + + 4.0.0 + + ru.bvn13.imdbspider + parent + 1.0-SNAPSHOT + + core + runner + + + IMDB-SPIDER :: PARENT + + http://www.example.com + + pom + + + UTF-8 + 11 + ${java.version} + ${java.version} + + + + + junit + junit + 4.11 + test + + + + org.mockito + mockito-core + 2.20.0 + test + + + + + + + + + + maven-clean-plugin + 3.1.0 + + + + maven-resources-plugin + 3.0.2 + + + maven-compiler-plugin + 3.8.0 + + ${java.version} + + + + org.ow2.asm + asm + 6.2 + + + + + maven-surefire-plugin + 2.22.1 + + + org.ow2.asm + asm + 6.2 + + + + + maven-jar-plugin + 3.0.2 + + + maven-install-plugin + 2.5.2 + + + maven-deploy-plugin + 2.8.2 + + + + maven-site-plugin + 3.7.1 + + + maven-project-info-reports-plugin + 3.0.0 + + + + + + diff --git a/runner/pom.xml b/runner/pom.xml new file mode 100644 index 0000000..1aee796 --- /dev/null +++ b/runner/pom.xml @@ -0,0 +1,86 @@ + + + + + parent + ru.bvn13.imdbspider + 1.0-SNAPSHOT + + + 4.0.0 + + imdb-spider-runner + + IMDB-SPIDER :: RUNNER + + jar + + + UTF-8 + + + + + + ru.bvn13.imdbspider + imdb-spider-core + 1.0-SNAPSHOT + compile + + + + junit + junit + 4.11 + test + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/runner/src/main/java/module-info.java b/runner/src/main/java/module-info.java new file mode 100644 index 0000000..a059167 --- /dev/null +++ b/runner/src/main/java/module-info.java @@ -0,0 +1,5 @@ +module imdb.spider.runner { + requires imdb.spider.core; + requires java.xml; + +} \ No newline at end of file diff --git a/runner/src/main/java/ru/bvn13/imdbspider/runner/App.java b/runner/src/main/java/ru/bvn13/imdbspider/runner/App.java new file mode 100644 index 0000000..ffdfb8a --- /dev/null +++ b/runner/src/main/java/ru/bvn13/imdbspider/runner/App.java @@ -0,0 +1,13 @@ +package ru.bvn13.imdbspider.runner; + +/** + * Hello world! + * + */ +public class App +{ + public static void main( String[] args ) + { + System.out.println( "Hello World!" ); + } +} diff --git a/runner/src/test/java/ru/bvn13/imdbspider/runner/AppTest.java b/runner/src/test/java/ru/bvn13/imdbspider/runner/AppTest.java new file mode 100644 index 0000000..7884387 --- /dev/null +++ b/runner/src/test/java/ru/bvn13/imdbspider/runner/AppTest.java @@ -0,0 +1,17 @@ +package ru.bvn13.imdbspider.runner; + +import static org.junit.Assert.assertTrue; + +import org.junit.Test; + +public class AppTest +{ + /** + * Rigorous Test :-) + */ + @Test + public void shouldAnswerWithTrue() + { + assertTrue( true ); + } +}