From 50cb82135df4f269271c787dbaa71c8d39edd4cc Mon Sep 17 00:00:00 2001 From: Vyacheslav Boyko Date: Thu, 10 Jan 2019 18:14:57 +0300 Subject: [PATCH] implemented searching movies by title and retrieving main movie data: id, title, year --- LICENSE | 8 +- core/pom.xml | 8 ++ core/src/main/java/module-info.java | 4 + .../java/ru/bvn13/imdbspider/ImdbSpider.java | 51 ++++---- .../composer/ComposerNotFoundException.java | 28 ++++ .../HtmlToXmlConvertionException.java | 26 ---- .../ru/bvn13/imdbspider/imdb/ImdbObject.java | 6 +- .../java/ru/bvn13/imdbspider/imdb/Movie.java | 18 +++ .../bvn13/imdbspider/imdb/MovieDataType.java | 5 +- .../ru/bvn13/imdbspider/imdb/MovieList.java | 13 ++ .../imdbspider/imdb/MovieListDataType.java | 26 ++++ .../imdbspider/spider/api/ApiFactory.java | 2 +- .../spider/api/v1_0/ApiFactory_1_0.java | 120 ++++++++++++++++-- .../spider/composer/ImdbObjectComposer.java | 14 ++ .../composer/ImdbObjectComposerFactory.java | 32 +++++ .../spider/composer/MovieListComposer.java | 35 +++++ .../spider/processor/HtmlProcessor.java | 50 +------- .../spider/processor/JsoupHtmlProcessor.java | 20 +++ .../imdbspider/spider/tasker/Manager.java | 29 +++-- .../bvn13/imdbspider/spider/tasker/Task.java | 108 +++++++++++++--- .../imdbspider/spider/tasker/Worker.java | 45 +++---- .../ru/bvn13/imdbspider/runner/AppTest.java | 8 +- 22 files changed, 482 insertions(+), 174 deletions(-) create mode 100644 core/src/main/java/ru/bvn13/imdbspider/exceptions/composer/ComposerNotFoundException.java delete mode 100644 core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/HtmlToXmlConvertionException.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/imdb/MovieListDataType.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/spider/composer/ImdbObjectComposer.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/spider/composer/ImdbObjectComposerFactory.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/spider/composer/MovieListComposer.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/spider/processor/JsoupHtmlProcessor.java diff --git a/LICENSE b/LICENSE index 261eeb9..70664aa 100644 --- a/LICENSE +++ b/LICENSE @@ -24,12 +24,12 @@ exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation + including but not limited to software source postprocess, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, + not limited to compiled object postprocess, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or @@ -53,7 +53,7 @@ the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, + communication on electronic mailing lists, source postprocess control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise @@ -156,7 +156,7 @@ negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the + cssSelectorResult of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor diff --git a/core/pom.xml b/core/pom.xml index 6d37707..026ac2f 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -32,6 +32,14 @@ + + + org.jsoup + jsoup + 1.11.3 + + + junit junit diff --git a/core/src/main/java/module-info.java b/core/src/main/java/module-info.java index 001f792..3b219c4 100644 --- a/core/src/main/java/module-info.java +++ b/core/src/main/java/module-info.java @@ -3,7 +3,11 @@ module imdb.spider.core { exports ru.bvn13.imdbspider.imdb; exports ru.bvn13.imdbspider.spider.tasker; exports ru.bvn13.imdbspider.exceptions; + exports ru.bvn13.imdbspider.exceptions.api; + exports ru.bvn13.imdbspider.exceptions.extractor; + exports ru.bvn13.imdbspider.exceptions.processor; requires java.xml; + requires org.jsoup; } diff --git a/core/src/main/java/ru/bvn13/imdbspider/ImdbSpider.java b/core/src/main/java/ru/bvn13/imdbspider/ImdbSpider.java index ab8b288..e624ce6 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/ImdbSpider.java +++ b/core/src/main/java/ru/bvn13/imdbspider/ImdbSpider.java @@ -1,9 +1,13 @@ package ru.bvn13.imdbspider; +import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; import ru.bvn13.imdbspider.exceptions.api.DataTypeNotSupportedException; -import ru.bvn13.imdbspider.imdb.Movie; import ru.bvn13.imdbspider.imdb.MovieDataType; +import ru.bvn13.imdbspider.imdb.MovieList; +import ru.bvn13.imdbspider.imdb.MovieListDataType; import ru.bvn13.imdbspider.spider.api.v1_0.ApiFactory_1_0; +import ru.bvn13.imdbspider.spider.composer.ImdbObjectComposerFactory; +import ru.bvn13.imdbspider.spider.composer.MovieListComposer; import ru.bvn13.imdbspider.spider.tasker.Manager; import ru.bvn13.imdbspider.spider.tasker.Task; import ru.bvn13.imdbspider.spider.api.ApiFactory; @@ -20,58 +24,59 @@ import java.util.concurrent.ExecutionException; */ public class ImdbSpider { - private static final String URL_MAIN = "https://www.imdb.com/"; private static final String URL_SEARCH_TITLE = "https://www.imdb.com/find?ref_=nv_sr_fn&q={{title}}&s=tt"; private Manager manager; private ApiFactory apiFactory; + private ImdbObjectComposerFactory imdbObjectComposerFactory; public static ImdbSpider withApi_1_0() { - return new ImdbSpider(new ApiFactory_1_0()); + ApiFactory apiFactory = new ApiFactory_1_0(); + return new ImdbSpider(apiFactory, new ImdbObjectComposerFactory(apiFactory)); } - public ImdbSpider(ApiFactory apiFactory) { + public ImdbSpider(ApiFactory apiFactory, ImdbObjectComposerFactory imdbObjectComposerFactory) { + this.apiFactory = apiFactory; + this.imdbObjectComposerFactory = imdbObjectComposerFactory; manager = new Manager(); - } - public List searchMovieByTitle(String title) { + public MovieList searchMovieByTitle(String title) throws ImdbSpiderException { return searchMovieByTitle(title, 10); } - public List searchMovieByTitle(String title, int maxCount) { + public MovieList searchMovieByTitle(String title, int maxCount) throws ImdbSpiderException { return searchMovieByTitle(title, maxCount, EnumSet.of(MovieDataType.TITLE)); } - public List searchMovieByTitle(String title, int maxCount, EnumSet dataTypes) { + public MovieList searchMovieByTitle(String title, int maxCount, EnumSet dataTypes) throws ImdbSpiderException { String url = URL_SEARCH_TITLE.replace("{{title}}", URLEncoder.encode(title, Charset.forName("utf-8"))); List tasks = new ArrayList<>(); - for (MovieDataType mdt : MovieDataType.values()) { - if (dataTypes.contains(mdt)) { - try { - tasks.add(apiFactory.taskByDataType(mdt)); - } catch (DataTypeNotSupportedException e) { - //do nothing - e.printStackTrace(); - } - } + try { + Task t1 = apiFactory.taskByDataType(MovieListDataType.ELEMENTS); + t1.setUrl(url); + tasks.add(t1); + } catch (DataTypeNotSupportedException e) { + throw e; } try { - tasks = manager.processTasks(tasks); - } catch (ExecutionException e) { - e.printStackTrace(); - } catch (InterruptedException e) { - e.printStackTrace(); + manager.processTasks(tasks); + } catch (ExecutionException | InterruptedException e) { + throw new ImdbSpiderException("Error has been occurred!", e); } - return null; + + MovieListComposer movieListComposer = (MovieListComposer) imdbObjectComposerFactory.getComposer(MovieList.class); + MovieList movieList = movieListComposer.compose(tasks.get(0)); + + return movieList; } diff --git a/core/src/main/java/ru/bvn13/imdbspider/exceptions/composer/ComposerNotFoundException.java b/core/src/main/java/ru/bvn13/imdbspider/exceptions/composer/ComposerNotFoundException.java new file mode 100644 index 0000000..b3d7257 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/exceptions/composer/ComposerNotFoundException.java @@ -0,0 +1,28 @@ +package ru.bvn13.imdbspider.exceptions.composer; + +import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; + +/** + * @author boyko_vn at 10.01.2019 + */ +public class ComposerNotFoundException extends ImdbSpiderException { + + public ComposerNotFoundException() { + } + + public ComposerNotFoundException(String message) { + super(message); + } + + public ComposerNotFoundException(String message, Throwable cause) { + super(message, cause); + } + + public ComposerNotFoundException(Throwable cause) { + super(cause); + } + + public ComposerNotFoundException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/HtmlToXmlConvertionException.java b/core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/HtmlToXmlConvertionException.java deleted file mode 100644 index ea2ca47..0000000 --- a/core/src/main/java/ru/bvn13/imdbspider/exceptions/processor/HtmlToXmlConvertionException.java +++ /dev/null @@ -1,26 +0,0 @@ -package ru.bvn13.imdbspider.exceptions.processor; - -/** - * @author boyko_vn at 09.01.2019 - */ -public class HtmlToXmlConvertionException extends HtmlProcessorException { - - public HtmlToXmlConvertionException() { - } - - public HtmlToXmlConvertionException(String message) { - super(message); - } - - public HtmlToXmlConvertionException(String message, Throwable cause) { - super(message, cause); - } - - public HtmlToXmlConvertionException(Throwable cause) { - super(cause); - } - - public HtmlToXmlConvertionException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { - super(message, cause, enableSuppression, writableStackTrace); - } -} diff --git a/core/src/main/java/ru/bvn13/imdbspider/imdb/ImdbObject.java b/core/src/main/java/ru/bvn13/imdbspider/imdb/ImdbObject.java index 9db0a44..47696fd 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/imdb/ImdbObject.java +++ b/core/src/main/java/ru/bvn13/imdbspider/imdb/ImdbObject.java @@ -5,14 +5,14 @@ package ru.bvn13.imdbspider.imdb; */ public class ImdbObject { - private int id; + private String id; private String url; - public int getId() { + public String getId() { return id; } - public void setId(int id) { + public void setId(String id) { this.id = id; } diff --git a/core/src/main/java/ru/bvn13/imdbspider/imdb/Movie.java b/core/src/main/java/ru/bvn13/imdbspider/imdb/Movie.java index 4c2b039..f496542 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/imdb/Movie.java +++ b/core/src/main/java/ru/bvn13/imdbspider/imdb/Movie.java @@ -9,6 +9,8 @@ import java.util.concurrent.ConcurrentHashMap; public class Movie extends ImdbObject { private String title; + private String originalTitle; + private Integer year; private Map akas = new ConcurrentHashMap<>(50); @@ -20,6 +22,22 @@ public class Movie extends ImdbObject { this.title = title; } + public String getOriginalTitle() { + return originalTitle; + } + + public void setOriginalTitle(String originalTitle) { + this.originalTitle = originalTitle; + } + + public Integer getYear() { + return year; + } + + public void setYear(Integer year) { + this.year = year; + } + public Map getAkas() { return akas; } diff --git a/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieDataType.java b/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieDataType.java index 7d0b8cb..61979a1 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieDataType.java +++ b/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieDataType.java @@ -7,7 +7,10 @@ import java.util.EnumSet; */ public enum MovieDataType implements DataType { - TITLE("title") + ID("id"), + TITLE("title"), + YEAR("year"), + AKAS("akas") ; diff --git a/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieList.java b/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieList.java index 08d3849..da7918a 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieList.java +++ b/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieList.java @@ -1,10 +1,23 @@ package ru.bvn13.imdbspider.imdb; +import java.util.ArrayList; +import java.util.List; + /** * @author boyko_vn at 09.01.2019 */ public class MovieList extends ImdbObject { + List movies; + public List getMovies() { + if (movies == null) { + movies = new ArrayList<>(); + } + return movies; + } + public void setMovies(List movies) { + this.movies = movies; + } } diff --git a/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieListDataType.java b/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieListDataType.java new file mode 100644 index 0000000..0e43684 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieListDataType.java @@ -0,0 +1,26 @@ +package ru.bvn13.imdbspider.imdb; + +import java.util.EnumSet; + +/** + * @author boyko_vn at 10.01.2019 + */ +public enum MovieListDataType implements DataType { + + ELEMENTS("element") + ; + + private String value; + + MovieListDataType(String v) { + value = v; + } + + public static final EnumSet ALL_DATA = EnumSet.allOf(MovieListDataType.class); + + @Override + public String get() { + return value; + } + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/api/ApiFactory.java b/core/src/main/java/ru/bvn13/imdbspider/spider/api/ApiFactory.java index 9ccd0ca..686734d 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/spider/api/ApiFactory.java +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/api/ApiFactory.java @@ -12,6 +12,6 @@ public interface ApiFactory { Task taskByDataType(DataType dataType) throws DataTypeNotSupportedException; - void fulfillImdbObject(ImdbObject imdbObject, Task task); + void fillUpImdbObject(ImdbObject imdbObject, Task task); } diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java index d5da9b2..2cb2c7b 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java @@ -1,47 +1,145 @@ package ru.bvn13.imdbspider.spider.api.v1_0; +import org.jsoup.nodes.Element; import ru.bvn13.imdbspider.exceptions.api.DataTypeNotSupportedException; -import ru.bvn13.imdbspider.imdb.DataType; -import ru.bvn13.imdbspider.imdb.ImdbObject; -import ru.bvn13.imdbspider.imdb.Movie; -import ru.bvn13.imdbspider.imdb.MovieDataType; +import ru.bvn13.imdbspider.imdb.*; import ru.bvn13.imdbspider.spider.api.ApiFactory; import ru.bvn13.imdbspider.spider.tasker.Task; +import java.util.EnumSet; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + /** * @author boyko_vn at 09.01.2019 */ public class ApiFactory_1_0 implements ApiFactory { + private static final String URL_MAIN = "https://www.imdb.com"; + + private final Pattern PATTERN_MOVIE_ID_FROM_MOVIELIST = Pattern.compile("/title/tt(\\d+)/.*"); + + private EnumSet defaultMovieDataType = EnumSet.of(MovieDataType.ID, MovieDataType.TITLE, MovieDataType.YEAR); + @Override public Task taskByDataType(DataType dataType) throws DataTypeNotSupportedException { if (dataType instanceof MovieDataType) { return taskByMovieDataType((MovieDataType) dataType); + } else if (dataType instanceof MovieListDataType) { + return taskByMovieListDataType((MovieListDataType) dataType); } else { - throw new DataTypeNotSupportedException(String.format("DataType %s not supported by API v1_0!", dataType.getClass().getName())); + throw new DataTypeNotSupportedException(String.format("DataType %s is not supported by API v1_0!", dataType.getClass().getName())); } } @Override - public void fulfillImdbObject(ImdbObject imdbObject, Task task) { + public void fillUpImdbObject(ImdbObject imdbObject, Task task) { if (imdbObject instanceof Movie) { if (task.getDataType() instanceof MovieDataType) { - fulfillMovie((Movie) imdbObject, task); + fillUpMovie((Movie) imdbObject, task); + } + } else if (imdbObject instanceof MovieList) { + if (task.getDataType() instanceof MovieListDataType) { + fillUpMovieList((MovieList) imdbObject, task); } } } private Task taskByMovieDataType(MovieDataType movieDataType) { + Task t = new Task(); + t.setDataType(movieDataType); switch (movieDataType) { - case TITLE: return new Task(); - default: return null; + case ID: + t.setPostprocess((task, s) -> { + Matcher matcher = PATTERN_MOVIE_ID_FROM_MOVIELIST.matcher(task.getUrl()); + if (matcher.find()) { + task.setResultType(String.class); + task.setResult(matcher.group(1)); + } + }); + break; + case TITLE: + t.setCssSelector("#title-overview-widget > div.vital > div.title_block > div > div.titleBar > div.title_wrapper > h1"); + t.setPostprocess((task, s) -> { + task.setResultType(String.class); + task.setResult(task.getCssSelectorResult().first().wholeText().trim()); + }); + break; + case YEAR: + t.setCssSelector("#titleYear > a"); + t.setPostprocess((task, s) -> { + task.setResultType(Integer.class); + if (task.getCssSelectorResult().size() > 0) { + try { + task.setResult(Integer.parseInt(task.getCssSelectorResult().first().text().trim())); + } catch (NumberFormatException e) { + task.setResult(-1); + } + } else { + task.setResult(-1); + } + }); + break; } + return t; } - private void fulfillMovie(Movie movie, Task task) { + private Task taskByMovieListDataType(MovieListDataType movieListDataType) { + Task t = new Task(); + t.setDataType(movieListDataType); + switch (movieListDataType) { + case ELEMENTS: + t.setCssSelector("#main > div > div.findSection > table > tbody > tr > td.result_text"); + t.setResultType(List.class); + t.setPostprocess((task, s) -> { + for (Element element : task.getCssSelectorResult()) { + Element link = element.select("a").first(); + if (!defaultMovieDataType.contains(MovieDataType.ID)) { + defaultMovieDataType.add(MovieDataType.ID); + } + Task movieTask = this.taskByMovieDataType(MovieDataType.ID) + .setParentTask(task) + .setUrl(String.format("%s%s", URL_MAIN, link.attr("href"))); + task.getNestedTasks().add(movieTask); + defaultMovieDataType.forEach(movieDataType -> movieTask.getNestedTasks().add(this.taskByMovieDataType(movieDataType) + .setParentTask(movieTask) + .setUrl(String.format("%s%s", URL_MAIN, link.attr("href"))))); + } + }); + break; + } + return t; + } + + private void fillUpMovie(Movie movie, Task task) { switch ((MovieDataType) task.getDataType()) { - case TITLE: movie.setTitle(task.getResult()); break; + case ID: + movie.setUrl(task.getUrl()); + movie.setId((String) task.getResult()); + break; + case TITLE: + movie.setTitle((String) task.getResult()); + break; + case YEAR: + movie.setYear((Integer) task.getResult()); + break; } } + private void fillUpMovieList(MovieList movieList, Task task) { + switch ((MovieListDataType) task.getDataType()) { + case ELEMENTS: + movieList.setUrl(task.getUrl()); + break; + } + } + + public EnumSet getDefaultMovieDataType() { + return defaultMovieDataType; + } + + public void setDefaultMovieDataType(EnumSet defaultMovieDataType) { + this.defaultMovieDataType = defaultMovieDataType; + } } diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/composer/ImdbObjectComposer.java b/core/src/main/java/ru/bvn13/imdbspider/spider/composer/ImdbObjectComposer.java new file mode 100644 index 0000000..b8fb5e2 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/composer/ImdbObjectComposer.java @@ -0,0 +1,14 @@ +package ru.bvn13.imdbspider.spider.composer; + +import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; +import ru.bvn13.imdbspider.imdb.ImdbObject; +import ru.bvn13.imdbspider.spider.tasker.Task; + +/** + * @author boyko_vn at 10.01.2019 + */ +public interface ImdbObjectComposer { + + C compose(Task task) throws ImdbSpiderException; + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/composer/ImdbObjectComposerFactory.java b/core/src/main/java/ru/bvn13/imdbspider/spider/composer/ImdbObjectComposerFactory.java new file mode 100644 index 0000000..5a00c80 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/composer/ImdbObjectComposerFactory.java @@ -0,0 +1,32 @@ +package ru.bvn13.imdbspider.spider.composer; + +import ru.bvn13.imdbspider.exceptions.composer.ComposerNotFoundException; +import ru.bvn13.imdbspider.imdb.ImdbObject; +import ru.bvn13.imdbspider.imdb.MovieList; +import ru.bvn13.imdbspider.spider.api.ApiFactory; + +/** + * @author boyko_vn at 10.01.2019 + */ +public class ImdbObjectComposerFactory { + + private ApiFactory apiFactory; + + public ImdbObjectComposerFactory(ApiFactory apiFactory) { + this.apiFactory = apiFactory; + } + + private MovieListComposer movieListComposer; + + public ImdbObjectComposer getComposer(Class clazz) throws ComposerNotFoundException { + if (clazz.isAssignableFrom(MovieList.class)) { + if (movieListComposer == null) { + movieListComposer = new MovieListComposer(apiFactory); + return movieListComposer; + } + } + + throw new ComposerNotFoundException(String.format("Composer not found: %s", clazz.getClass().getName())); + } + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/composer/MovieListComposer.java b/core/src/main/java/ru/bvn13/imdbspider/spider/composer/MovieListComposer.java new file mode 100644 index 0000000..57ec682 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/composer/MovieListComposer.java @@ -0,0 +1,35 @@ +package ru.bvn13.imdbspider.spider.composer; + +import ru.bvn13.imdbspider.imdb.Movie; +import ru.bvn13.imdbspider.imdb.MovieList; +import ru.bvn13.imdbspider.spider.api.ApiFactory; +import ru.bvn13.imdbspider.spider.tasker.Task; + +/** + * @author boyko_vn at 10.01.2019 + */ +public class MovieListComposer implements ImdbObjectComposer { + + private ApiFactory apiFactory; + + public MovieListComposer(ApiFactory apiFactory) { + this.apiFactory = apiFactory; + } + + @Override + public MovieList compose(Task task) { + MovieList movieList = new MovieList(); + apiFactory.fillUpImdbObject(movieList, task); + + for (Task movieTask : task.getNestedTasks()) { + Movie movie = new Movie(); + movieList.getMovies().add(movie); + apiFactory.fillUpImdbObject(movie, movieTask); + for (Task nestedTask : movieTask.getNestedTasks()) { + apiFactory.fillUpImdbObject(movie, nestedTask); + } + } + return movieList; + } + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/processor/HtmlProcessor.java b/core/src/main/java/ru/bvn13/imdbspider/spider/processor/HtmlProcessor.java index f8af553..837c0f2 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/spider/processor/HtmlProcessor.java +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/processor/HtmlProcessor.java @@ -1,55 +1,13 @@ package ru.bvn13.imdbspider.spider.processor; - -import org.w3c.dom.Document; -import org.xml.sax.SAXException; +import org.jsoup.select.Elements; import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException; -import ru.bvn13.imdbspider.exceptions.processor.HtmlToXmlConvertionException; -import ru.bvn13.imdbspider.exceptions.processor.PatternEvaluationException; - -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.xpath.XPath; -import javax.xml.xpath.XPathConstants; -import javax.xml.xpath.XPathExpressionException; -import javax.xml.xpath.XPathFactory; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.nio.charset.StandardCharsets; /** - * @author boyko_vn at 09.01.2019 + * @author boyko_vn at 10.01.2019 */ -public class HtmlProcessor { +public interface HtmlProcessor { - public String process(final String html, final String pattern) throws HtmlProcessorException { - DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); - DocumentBuilder db = null; - try { - db = dbf.newDocumentBuilder(); - } catch (ParserConfigurationException e) { - throw new HtmlProcessorException(e); - } - Document xml = null; - try { - xml = db.parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8))); - } catch (SAXException e) { - throw new HtmlToXmlConvertionException("Html parsing exception", e); - } catch (IOException e) { - throw new HtmlToXmlConvertionException("Html reading exception", e); - } - - XPathFactory xpf = XPathFactory.newInstance(); - XPath xpath = xpf.newXPath(); - String result = null; - try { - result = (String) xpath.evaluate(pattern, xml, XPathConstants.STRING); - } catch (XPathExpressionException e) { - throw new PatternEvaluationException(String.format("Could not evaluate pattern: %s", pattern), e); - } - - return result; - } + Elements process(final String html, final String pattern) throws HtmlProcessorException; } diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/processor/JsoupHtmlProcessor.java b/core/src/main/java/ru/bvn13/imdbspider/spider/processor/JsoupHtmlProcessor.java new file mode 100644 index 0000000..945bc8d --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/processor/JsoupHtmlProcessor.java @@ -0,0 +1,20 @@ +package ru.bvn13.imdbspider.spider.processor; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.select.Elements; +import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException; + +/** + * @author boyko_vn at 10.01.2019 + */ +public class JsoupHtmlProcessor implements HtmlProcessor { + + @Override + public Elements process(String html, String pattern) throws HtmlProcessorException { + Document doc = Jsoup.parse(html, "UTF-8"); + Elements result = doc.select(pattern); + return result; + } + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Manager.java b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Manager.java index ce498a9..a0d9960 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Manager.java +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Manager.java @@ -1,7 +1,8 @@ package ru.bvn13.imdbspider.spider.tasker; +import ru.bvn13.imdbspider.exceptions.extractor.HtmlExtractorException; + import java.util.ArrayList; -import java.util.Collections; import java.util.List; import java.util.Map; import java.util.concurrent.*; @@ -18,7 +19,7 @@ public class Manager { } - public List processTasks(List allTasks) throws ExecutionException, InterruptedException { + public void processTasks(List allTasks) throws ExecutionException, InterruptedException { Map> groupedTasks = new ConcurrentHashMap<>(allTasks.size()); @@ -35,24 +36,26 @@ public class Manager { filteredTasks.add(task); } - List result = Collections.synchronizedList(new ArrayList<>()); - groupedTasks.entrySet().parallelStream().forEach(stringListEntry -> { - Future> r = executor.submit(new Worker(stringListEntry.getKey(), stringListEntry.getValue())); - while (!r.isDone()) { - Thread.yield(); - } + Worker w = new Worker(stringListEntry.getKey(), stringListEntry.getValue()); try { - result.addAll(r.get()); - } catch (InterruptedException e) { - e.printStackTrace(); - } catch (ExecutionException e) { + w.run(); + } catch (HtmlExtractorException e) { e.printStackTrace(); } }); + List nextTasks = new ArrayList<>(); - return result; + for (Task task : allTasks) { + if (task.hasNextTasks()) { + nextTasks.addAll(task.getNestedTasks()); + } + } + + if (!nextTasks.isEmpty()) { + processTasks(nextTasks); + } } diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Task.java b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Task.java index a26ca32..6e71d42 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Task.java +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Task.java @@ -1,8 +1,13 @@ package ru.bvn13.imdbspider.spider.tasker; +import org.jsoup.select.Elements; import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; import ru.bvn13.imdbspider.imdb.DataType; +import java.util.ArrayList; +import java.util.List; +import java.util.function.BiConsumer; + /** * @author boyko_vn at 09.01.2019 */ @@ -10,27 +15,35 @@ public class Task { private String url; - private String xpathPattern; private DataType dataType; - private String result; + private String cssSelector; + private Elements cssSelectorResult; + + private Class resultType; + private Object result; private ImdbSpiderException exception; + private BiConsumer postprocess; + + private Task parentTask; + private List nestedTasks; + public Task() { } - public Task(String xpathPattern) { - this.xpathPattern = xpathPattern; + public Task(String cssSelector) { + this.cssSelector = cssSelector; } - public Task(String url, String xpathPattern) { + public Task(String url, String cssSelector) { this.url = url; - this.xpathPattern = xpathPattern; + this.cssSelector = cssSelector; } - public Task(String url, String xpathPattern, DataType dataType) { + public Task(String url, String cssSelector, DataType dataType) { this.url = url; - this.xpathPattern = xpathPattern; + this.cssSelector = cssSelector; this.dataType = dataType; } @@ -38,39 +51,96 @@ public class Task { return url; } - public void setUrl(String url) { + public Task setUrl(String url) { this.url = url; + return this; } - public String getXpathPattern() { - return xpathPattern; + public String getCssSelector() { + return cssSelector; } - public void setXpathPattern(String xpathPattern) { - this.xpathPattern = xpathPattern; + public Task setCssSelector(String cssSelector) { + this.cssSelector = cssSelector; + return this; } public DataType getDataType() { return dataType; } - public void setDataType(DataType dataType) { + public Task setDataType(DataType dataType) { this.dataType = dataType; + return this; } - public String getResult() { - return result; + public Elements getCssSelectorResult() { + return cssSelectorResult; } - public void setResult(String result) { - this.result = result; + public Task setCssSelectorResult(Elements cssSelectorResult) { + this.cssSelectorResult = cssSelectorResult; + return this; } public ImdbSpiderException getException() { return exception; } - public void setException(ImdbSpiderException exception) { + public Task setException(ImdbSpiderException exception) { this.exception = exception; + return this; + } + + public BiConsumer getPostprocess() { + return postprocess; + } + + public Task setPostprocess(BiConsumer postprocess) { + this.postprocess = postprocess; + return this; + } + + public Class getResultType() { + return resultType; + } + + public Task setResultType(Class resultType) { + this.resultType = resultType; + return this; + } + + public Object getResult() { + return result; + } + + public Task setResult(Object result) { + this.result = result; + return this; + } + + public boolean hasNextTasks() { + return (nestedTasks != null && !nestedTasks.isEmpty()); + } + + public List getNestedTasks() { + if (nestedTasks == null) { + nestedTasks = new ArrayList<>(); + } + return nestedTasks; + } + + public Task setNestedTasks(List nestedTasks) { + this.nestedTasks = nestedTasks; + return this; + } + + public Task getParentTask() { + return parentTask; + } + + public Task setParentTask(Task parentTask) { + this.parentTask = parentTask; + return this; } } diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Worker.java b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Worker.java index df0d6bf..fd7c3bb 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Worker.java +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Worker.java @@ -2,8 +2,10 @@ package ru.bvn13.imdbspider.spider.tasker; import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; import ru.bvn13.imdbspider.exceptions.extractor.HtmlExtractorException; +import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException; import ru.bvn13.imdbspider.spider.extractor.HtmlExtractor; import ru.bvn13.imdbspider.spider.processor.HtmlProcessor; +import ru.bvn13.imdbspider.spider.processor.JsoupHtmlProcessor; import java.util.List; import java.util.concurrent.*; @@ -11,7 +13,7 @@ import java.util.concurrent.*; /** * @author boyko_vn at 09.01.2019 */ -public class Worker implements Callable> { +public class Worker { private final String url; private final List tasks; @@ -26,42 +28,33 @@ public class Worker implements Callable> { this.tasks = tasks; this.htmlExtractor = new HtmlExtractor(); - this.htmlProcessor = new HtmlProcessor(); + this.htmlProcessor = new JsoupHtmlProcessor(); this.executor = Executors.newCachedThreadPool(); } - @Override - public List call() throws Exception { - Future result = executor.submit(() -> htmlExtractor.getHtml(url)); - while (!result.isDone()) { - Thread.yield(); - } + public Boolean run() throws HtmlExtractorException { - final String html; - try { - html = result.get(); - } catch (InterruptedException e) { - throw new ImdbSpiderException("Interrupted", e); - } catch (ExecutionException e) { - throw new HtmlExtractorException("Exception has been occurred", e); - } + final String html = htmlExtractor.getHtml(url); tasks.parallelStream().forEach(task -> { - Future taskResult = executor.submit(() -> htmlProcessor.process(html, task.getXpathPattern())); - while (!taskResult.isDone()) { - Thread.yield(); - } + try { - task.setResult(taskResult.get()); - } catch (InterruptedException e) { - task.setException(new ImdbSpiderException("Interrupted", e)); - } catch (ExecutionException e) { - task.setException(new ImdbSpiderException("Exception has been occurred", e)); + if (task.getCssSelector() != null && !task.getCssSelector().isEmpty()) { + task.setCssSelectorResult(htmlProcessor.process(html, task.getCssSelector())); + } + + if (task.getPostprocess() != null) { + task.getPostprocess().accept(task, html); + } + } catch (HtmlProcessorException e) { + task.setException(new ImdbSpiderException(e)); + e.printStackTrace(); } + }); - return tasks; + return true; } } diff --git a/core/src/test/java/ru/bvn13/imdbspider/runner/AppTest.java b/core/src/test/java/ru/bvn13/imdbspider/runner/AppTest.java index 0616c84..9908f6a 100644 --- a/core/src/test/java/ru/bvn13/imdbspider/runner/AppTest.java +++ b/core/src/test/java/ru/bvn13/imdbspider/runner/AppTest.java @@ -5,7 +5,9 @@ import static org.junit.Assert.assertTrue; import org.junit.BeforeClass; import org.junit.Test; import ru.bvn13.imdbspider.ImdbSpider; +import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; import ru.bvn13.imdbspider.imdb.Movie; +import ru.bvn13.imdbspider.imdb.MovieList; import java.util.List; @@ -21,6 +23,10 @@ public class AppTest @Test public void searchTerminatorTest() { - List result = spider.searchMovieByTitle("Терминатор", 5); + try { + MovieList result = spider.searchMovieByTitle("test", 5); + } catch (ImdbSpiderException e) { + e.printStackTrace(); + } } }