implemented searching movies by title and retrieving main movie data: id, title, year

master
Vyacheslav Boyko 2019-01-10 18:14:57 +03:00
parent 4205b7ad27
commit 50cb82135d
22 changed files with 482 additions and 174 deletions

View File

@ -24,12 +24,12 @@
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
including but not limited to software source postprocess, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
not limited to compiled object postprocess, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
@ -53,7 +53,7 @@
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
communication on electronic mailing lists, source postprocess control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
@ -156,7 +156,7 @@
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
cssSelectorResult of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor

View File

@ -32,6 +32,14 @@
<!--<version>1.1.6</version>-->
<!--</dependency>-->
<dependency>
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>

View File

@ -3,7 +3,11 @@ module imdb.spider.core {
exports ru.bvn13.imdbspider.imdb;
exports ru.bvn13.imdbspider.spider.tasker;
exports ru.bvn13.imdbspider.exceptions;
exports ru.bvn13.imdbspider.exceptions.api;
exports ru.bvn13.imdbspider.exceptions.extractor;
exports ru.bvn13.imdbspider.exceptions.processor;
requires java.xml;
requires org.jsoup;
}

View File

@ -1,9 +1,13 @@
package ru.bvn13.imdbspider;
import ru.bvn13.imdbspider.exceptions.ImdbSpiderException;
import ru.bvn13.imdbspider.exceptions.api.DataTypeNotSupportedException;
import ru.bvn13.imdbspider.imdb.Movie;
import ru.bvn13.imdbspider.imdb.MovieDataType;
import ru.bvn13.imdbspider.imdb.MovieList;
import ru.bvn13.imdbspider.imdb.MovieListDataType;
import ru.bvn13.imdbspider.spider.api.v1_0.ApiFactory_1_0;
import ru.bvn13.imdbspider.spider.composer.ImdbObjectComposerFactory;
import ru.bvn13.imdbspider.spider.composer.MovieListComposer;
import ru.bvn13.imdbspider.spider.tasker.Manager;
import ru.bvn13.imdbspider.spider.tasker.Task;
import ru.bvn13.imdbspider.spider.api.ApiFactory;
@ -20,58 +24,59 @@ import java.util.concurrent.ExecutionException;
*/
public class ImdbSpider {
private static final String URL_MAIN = "https://www.imdb.com/";
private static final String URL_SEARCH_TITLE = "https://www.imdb.com/find?ref_=nv_sr_fn&q={{title}}&s=tt";
private Manager manager;
private ApiFactory apiFactory;
private ImdbObjectComposerFactory imdbObjectComposerFactory;
public static ImdbSpider withApi_1_0() {
return new ImdbSpider(new ApiFactory_1_0());
ApiFactory apiFactory = new ApiFactory_1_0();
return new ImdbSpider(apiFactory, new ImdbObjectComposerFactory(apiFactory));
}
public ImdbSpider(ApiFactory apiFactory) {
public ImdbSpider(ApiFactory apiFactory, ImdbObjectComposerFactory imdbObjectComposerFactory) {
this.apiFactory = apiFactory;
this.imdbObjectComposerFactory = imdbObjectComposerFactory;
manager = new Manager();
}
public List<Movie> searchMovieByTitle(String title) {
public MovieList searchMovieByTitle(String title) throws ImdbSpiderException {
return searchMovieByTitle(title, 10);
}
public List<Movie> searchMovieByTitle(String title, int maxCount) {
public MovieList searchMovieByTitle(String title, int maxCount) throws ImdbSpiderException {
return searchMovieByTitle(title, maxCount, EnumSet.of(MovieDataType.TITLE));
}
public List<Movie> searchMovieByTitle(String title, int maxCount, EnumSet<MovieDataType> dataTypes) {
public MovieList searchMovieByTitle(String title, int maxCount, EnumSet<MovieDataType> dataTypes) throws ImdbSpiderException {
String url = URL_SEARCH_TITLE.replace("{{title}}", URLEncoder.encode(title, Charset.forName("utf-8")));
List<Task> tasks = new ArrayList<>();
for (MovieDataType mdt : MovieDataType.values()) {
if (dataTypes.contains(mdt)) {
try {
tasks.add(apiFactory.taskByDataType(mdt));
} catch (DataTypeNotSupportedException e) {
//do nothing
e.printStackTrace();
}
}
try {
Task t1 = apiFactory.taskByDataType(MovieListDataType.ELEMENTS);
t1.setUrl(url);
tasks.add(t1);
} catch (DataTypeNotSupportedException e) {
throw e;
}
try {
tasks = manager.processTasks(tasks);
} catch (ExecutionException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
manager.processTasks(tasks);
} catch (ExecutionException | InterruptedException e) {
throw new ImdbSpiderException("Error has been occurred!", e);
}
return null;
MovieListComposer movieListComposer = (MovieListComposer) imdbObjectComposerFactory.getComposer(MovieList.class);
MovieList movieList = movieListComposer.compose(tasks.get(0));
return movieList;
}

View File

@ -0,0 +1,28 @@
package ru.bvn13.imdbspider.exceptions.composer;
import ru.bvn13.imdbspider.exceptions.ImdbSpiderException;
/**
 * Raised when no composer can be supplied for a requested IMDB object type.
 *
 * @author boyko_vn at 10.01.2019
 */
public class ComposerNotFoundException extends ImdbSpiderException {

    /** Creates the exception without a detail message or a cause. */
    public ComposerNotFoundException() {
        super();
    }

    /**
     * @param message detail message describing which composer is missing
     */
    public ComposerNotFoundException(String message) {
        super(message);
    }

    /**
     * @param message detail message describing which composer is missing
     * @param cause   underlying failure, kept for the stack trace
     */
    public ComposerNotFoundException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param cause underlying failure, kept for the stack trace
     */
    public ComposerNotFoundException(Throwable cause) {
        super(cause);
    }

    /**
     * Full-control constructor; every argument is forwarded unchanged to the superclass.
     *
     * @param message            detail message
     * @param cause              underlying failure
     * @param enableSuppression  passed through to the superclass constructor
     * @param writableStackTrace passed through to the superclass constructor
     */
    public ComposerNotFoundException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) {
        super(message, cause, enableSuppression, writableStackTrace);
    }
}

View File

@ -1,26 +0,0 @@
package ru.bvn13.imdbspider.exceptions.processor;
/**
 * Processor failure raised when HTML could not be read or parsed as an XML document.
 *
 * @author boyko_vn at 09.01.2019
 */
public class HtmlToXmlConvertionException extends HtmlProcessorException {

    /** Creates the exception without a detail message or a cause. */
    public HtmlToXmlConvertionException() {
        super();
    }

    /**
     * @param message detail message describing the conversion failure
     */
    public HtmlToXmlConvertionException(String message) {
        super(message);
    }

    /**
     * @param message detail message describing the conversion failure
     * @param cause   underlying parser or I/O failure
     */
    public HtmlToXmlConvertionException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param cause underlying parser or I/O failure
     */
    public HtmlToXmlConvertionException(Throwable cause) {
        super(cause);
    }

    /**
     * Full-control constructor; every argument is forwarded unchanged to the superclass.
     *
     * @param message            detail message
     * @param cause              underlying failure
     * @param enableSuppression  passed through to the superclass constructor
     * @param writableStackTrace passed through to the superclass constructor
     */
    public HtmlToXmlConvertionException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) {
        super(message, cause, enableSuppression, writableStackTrace);
    }
}

View File

@ -5,14 +5,14 @@ package ru.bvn13.imdbspider.imdb;
*/
public class ImdbObject {
private int id;
private String id;
private String url;
public int getId() {
public String getId() {
return id;
}
public void setId(int id) {
public void setId(String id) {
this.id = id;
}

View File

@ -9,6 +9,8 @@ import java.util.concurrent.ConcurrentHashMap;
public class Movie extends ImdbObject {
private String title;
private String originalTitle;
private Integer year;
private Map<String, String> akas = new ConcurrentHashMap<>(50);
@ -20,6 +22,22 @@ public class Movie extends ImdbObject {
this.title = title;
}
public String getOriginalTitle() {
return originalTitle;
}
public void setOriginalTitle(String originalTitle) {
this.originalTitle = originalTitle;
}
public Integer getYear() {
return year;
}
public void setYear(Integer year) {
this.year = year;
}
public Map<String, String> getAkas() {
return akas;
}

View File

@ -7,7 +7,10 @@ import java.util.EnumSet;
*/
public enum MovieDataType implements DataType {
TITLE("title")
ID("id"),
TITLE("title"),
YEAR("year"),
AKAS("akas")
;

View File

@ -1,10 +1,23 @@
package ru.bvn13.imdbspider.imdb;
import java.util.ArrayList;
import java.util.List;
/**
 * IMDB object that carries a collection of {@link Movie} entries.
 *
 * @author boyko_vn at 09.01.2019
 */
public class MovieList extends ImdbObject {

    // Stays null until getMovies() is first called or setMovies() assigns a list.
    List<Movie> movies;

    /**
     * Returns the movies held by this list. When no list has been assigned yet,
     * an empty {@link ArrayList} is created, stored and returned.
     *
     * @return the backing list itself (not a copy)
     */
    public List<Movie> getMovies() {
        if (movies != null) {
            return movies;
        }
        movies = new ArrayList<>();
        return movies;
    }

    /**
     * Replaces the backing list.
     *
     * @param movies list to store; kept by reference
     */
    public void setMovies(List<Movie> movies) {
        this.movies = movies;
    }
}

View File

@ -0,0 +1,26 @@
package ru.bvn13.imdbspider.imdb;
import java.util.EnumSet;
/**
 * Kinds of data that can be requested for a {@link MovieList}.
 *
 * @author boyko_vn at 10.01.2019
 */
public enum MovieListDataType implements DataType {

    ELEMENTS("element");

    /** Set containing every constant of this enum. */
    public static final EnumSet<MovieListDataType> ALL_DATA = EnumSet.allOf(MovieListDataType.class);

    // String form of the constant, handed out by get().
    private final String value;

    MovieListDataType(String v) {
        this.value = v;
    }

    /**
     * @return the string given to this constant at declaration
     */
    @Override
    public String get() {
        return this.value;
    }
}

View File

@ -12,6 +12,6 @@ public interface ApiFactory {
Task taskByDataType(DataType dataType) throws DataTypeNotSupportedException;
void fulfillImdbObject(ImdbObject imdbObject, Task task);
void fillUpImdbObject(ImdbObject imdbObject, Task task);
}

View File

@ -1,47 +1,145 @@
package ru.bvn13.imdbspider.spider.api.v1_0;
import org.jsoup.nodes.Element;
import ru.bvn13.imdbspider.exceptions.api.DataTypeNotSupportedException;
import ru.bvn13.imdbspider.imdb.DataType;
import ru.bvn13.imdbspider.imdb.ImdbObject;
import ru.bvn13.imdbspider.imdb.Movie;
import ru.bvn13.imdbspider.imdb.MovieDataType;
import ru.bvn13.imdbspider.imdb.*;
import ru.bvn13.imdbspider.spider.api.ApiFactory;
import ru.bvn13.imdbspider.spider.tasker.Task;
import java.util.EnumSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @author boyko_vn at 09.01.2019
*/
public class ApiFactory_1_0 implements ApiFactory {
private static final String URL_MAIN = "https://www.imdb.com";
private final Pattern PATTERN_MOVIE_ID_FROM_MOVIELIST = Pattern.compile("/title/tt(\\d+)/.*");
private EnumSet<MovieDataType> defaultMovieDataType = EnumSet.of(MovieDataType.ID, MovieDataType.TITLE, MovieDataType.YEAR);
@Override
public Task taskByDataType(DataType dataType) throws DataTypeNotSupportedException {
if (dataType instanceof MovieDataType) {
return taskByMovieDataType((MovieDataType) dataType);
} else if (dataType instanceof MovieListDataType) {
return taskByMovieListDataType((MovieListDataType) dataType);
} else {
throw new DataTypeNotSupportedException(String.format("DataType %s not supported by API v1_0!", dataType.getClass().getName()));
throw new DataTypeNotSupportedException(String.format("DataType %s is not supported by API v1_0!", dataType.getClass().getName()));
}
}
@Override
public void fulfillImdbObject(ImdbObject imdbObject, Task task) {
public void fillUpImdbObject(ImdbObject imdbObject, Task task) {
if (imdbObject instanceof Movie) {
if (task.getDataType() instanceof MovieDataType) {
fulfillMovie((Movie) imdbObject, task);
fillUpMovie((Movie) imdbObject, task);
}
} else if (imdbObject instanceof MovieList) {
if (task.getDataType() instanceof MovieListDataType) {
fillUpMovieList((MovieList) imdbObject, task);
}
}
}
private Task taskByMovieDataType(MovieDataType movieDataType) {
Task t = new Task();
t.setDataType(movieDataType);
switch (movieDataType) {
case TITLE: return new Task();
default: return null;
case ID:
t.setPostprocess((task, s) -> {
Matcher matcher = PATTERN_MOVIE_ID_FROM_MOVIELIST.matcher(task.getUrl());
if (matcher.find()) {
task.setResultType(String.class);
task.setResult(matcher.group(1));
}
});
break;
case TITLE:
t.setCssSelector("#title-overview-widget > div.vital > div.title_block > div > div.titleBar > div.title_wrapper > h1");
t.setPostprocess((task, s) -> {
task.setResultType(String.class);
task.setResult(task.getCssSelectorResult().first().wholeText().trim());
});
break;
case YEAR:
t.setCssSelector("#titleYear > a");
t.setPostprocess((task, s) -> {
task.setResultType(Integer.class);
if (task.getCssSelectorResult().size() > 0) {
try {
task.setResult(Integer.parseInt(task.getCssSelectorResult().first().text().trim()));
} catch (NumberFormatException e) {
task.setResult(-1);
}
} else {
task.setResult(-1);
}
});
break;
}
return t;
}
private void fulfillMovie(Movie movie, Task task) {
/**
 * Builds the task that extracts the requested kind of data from a title-search result page.
 *
 * <p>For {@link MovieListDataType#ELEMENTS} the task selects every result cell and, in its
 * postprocess step, creates one nested {@link MovieDataType#ID} task per found link. Each of
 * those tasks in turn receives one nested task per configured default movie data type
 * ({@code ID} is always included), all pointing at the movie's page URL.
 *
 * <p>Result cells that contain no anchor element are skipped.
 *
 * @param movieListDataType kind of list data to extract
 * @return a new task with data type, selector and postprocess step configured
 */
private Task taskByMovieListDataType(MovieListDataType movieListDataType) {
    Task t = new Task();
    t.setDataType(movieListDataType);
    switch (movieListDataType) {
        case ELEMENTS:
            t.setCssSelector("#main > div > div.findSection > table > tbody > tr > td.result_text");
            t.setResultType(List.class);
            t.setPostprocess((task, s) -> {
                // Work on a private copy instead of adding ID to the shared field: postprocess
                // callbacks are driven from parallel streams, and EnumSet is not thread-safe.
                EnumSet<MovieDataType> movieDataTypes = EnumSet.copyOf(defaultMovieDataType);
                movieDataTypes.add(MovieDataType.ID);
                for (Element element : task.getCssSelectorResult()) {
                    Element link = element.select("a").first();
                    if (link == null) {
                        // No anchor in this cell, so there is no movie URL to follow.
                        continue;
                    }
                    String movieUrl = String.format("%s%s", URL_MAIN, link.attr("href"));
                    Task movieTask = this.taskByMovieDataType(MovieDataType.ID)
                            .setParentTask(task)
                            .setUrl(movieUrl);
                    task.getNestedTasks().add(movieTask);
                    for (MovieDataType movieDataType : movieDataTypes) {
                        movieTask.getNestedTasks().add(this.taskByMovieDataType(movieDataType)
                                .setParentTask(movieTask)
                                .setUrl(movieUrl));
                    }
                }
            });
            break;
    }
    return t;
}
private void fillUpMovie(Movie movie, Task task) {
switch ((MovieDataType) task.getDataType()) {
case TITLE: movie.setTitle(task.getResult()); break;
case ID:
movie.setUrl(task.getUrl());
movie.setId((String) task.getResult());
break;
case TITLE:
movie.setTitle((String) task.getResult());
break;
case YEAR:
movie.setYear((Integer) task.getResult());
break;
}
}
/**
 * Copies the outcome of a movie-list task onto the given list object.
 * For {@code ELEMENTS} only the task's URL is transferred.
 *
 * @param movieList object to populate
 * @param task      processed task whose data type is a {@link MovieListDataType}
 */
private void fillUpMovieList(MovieList movieList, Task task) {
    MovieListDataType listDataType = (MovieListDataType) task.getDataType();
    switch (listDataType) {
        case ELEMENTS: {
            String sourceUrl = task.getUrl();
            movieList.setUrl(sourceUrl);
            break;
        }
    }
}
/**
 * Returns the movie data types for which nested tasks are created per search result.
 *
 * @return the internal set itself, not a copy; changes made to it affect this factory
 */
public EnumSet<MovieDataType> getDefaultMovieDataType() {
return defaultMovieDataType;
}
/**
 * Replaces the movie data types for which nested tasks are created per search result.
 *
 * @param defaultMovieDataType set to use from now on; stored by reference and not validated
 */
public void setDefaultMovieDataType(EnumSet<MovieDataType> defaultMovieDataType) {
this.defaultMovieDataType = defaultMovieDataType;
}
}

View File

@ -0,0 +1,14 @@
package ru.bvn13.imdbspider.spider.composer;
import ru.bvn13.imdbspider.exceptions.ImdbSpiderException;
import ru.bvn13.imdbspider.imdb.ImdbObject;
import ru.bvn13.imdbspider.spider.tasker.Task;
/**
 * Assembles an IMDB object of type {@code C} from a task.
 *
 * @param <C> type of IMDB object this composer produces
 * @author boyko_vn at 10.01.2019
 */
public interface ImdbObjectComposer<C extends ImdbObject> {
/**
 * Builds an object from the given task.
 *
 * @param task task to read the data from
 * @return the composed object
 * @throws ImdbSpiderException if the object cannot be composed
 */
C compose(Task task) throws ImdbSpiderException;
}

View File

@ -0,0 +1,32 @@
package ru.bvn13.imdbspider.spider.composer;
import ru.bvn13.imdbspider.exceptions.composer.ComposerNotFoundException;
import ru.bvn13.imdbspider.imdb.ImdbObject;
import ru.bvn13.imdbspider.imdb.MovieList;
import ru.bvn13.imdbspider.spider.api.ApiFactory;
/**
 * Hands out composers that assemble IMDB objects from processed tasks.
 * Composers are created on first request and reused afterwards.
 *
 * @author boyko_vn at 10.01.2019
 */
public class ImdbObjectComposerFactory {

    private final ApiFactory apiFactory;

    // Created lazily on the first MovieList request. The lazy creation is not synchronized.
    private MovieListComposer movieListComposer;

    /**
     * @param apiFactory API factory passed on to every composer this factory creates
     */
    public ImdbObjectComposerFactory(ApiFactory apiFactory) {
        this.apiFactory = apiFactory;
    }

    /**
     * Returns the composer able to build objects of the requested type.
     *
     * <p>The raw return type is kept so that existing callers, which cast the result to the
     * concrete composer class, continue to compile unchanged.
     *
     * @param clazz requested object type; {@link MovieList} or one of its supertypes selects
     *              the movie-list composer
     * @param <C>   requested object type
     * @return the cached composer instance for that type
     * @throws ComposerNotFoundException if no composer exists for {@code clazz}
     */
    public <C extends ImdbObject> ImdbObjectComposer getComposer(Class<C> clazz) throws ComposerNotFoundException {
        if (clazz.isAssignableFrom(MovieList.class)) {
            if (movieListComposer == null) {
                movieListComposer = new MovieListComposer(apiFactory);
            }
            // Return the cached composer on every call, not only on the one that created it.
            return movieListComposer;
        }
        // clazz.getName() names the requested type; clazz.getClass() would always be java.lang.Class.
        throw new ComposerNotFoundException(String.format("Composer not found: %s", clazz.getName()));
    }
}

View File

@ -0,0 +1,35 @@
package ru.bvn13.imdbspider.spider.composer;
import ru.bvn13.imdbspider.imdb.Movie;
import ru.bvn13.imdbspider.imdb.MovieList;
import ru.bvn13.imdbspider.spider.api.ApiFactory;
import ru.bvn13.imdbspider.spider.tasker.Task;
/**
 * Composer that turns a processed movie-list task tree into a {@link MovieList}.
 *
 * @author boyko_vn at 10.01.2019
 */
public class MovieListComposer implements ImdbObjectComposer<MovieList> {

    private ApiFactory apiFactory;

    /**
     * @param apiFactory factory used to copy task data onto the composed objects
     */
    public MovieListComposer(ApiFactory apiFactory) {
        this.apiFactory = apiFactory;
    }

    /**
     * Builds a movie list from the given task: the list itself is filled from {@code task},
     * and every nested task of {@code task} yields one {@link Movie} appended to the list.
     *
     * @param task root task of the list
     * @return the composed list
     */
    @Override
    public MovieList compose(Task task) {
        MovieList composed = new MovieList();
        apiFactory.fillUpImdbObject(composed, task);

        for (Task perMovieTask : task.getNestedTasks()) {
            Movie current = new Movie();
            composed.getMovies().add(current);
            populateMovie(current, perMovieTask);
        }
        return composed;
    }

    // Applies the movie's own task first, then each of its nested tasks in order.
    private void populateMovie(Movie target, Task perMovieTask) {
        apiFactory.fillUpImdbObject(target, perMovieTask);
        for (Task detailTask : perMovieTask.getNestedTasks()) {
            apiFactory.fillUpImdbObject(target, detailTask);
        }
    }
}

View File

@ -1,55 +1,13 @@
package ru.bvn13.imdbspider.spider.processor;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
import org.jsoup.select.Elements;
import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException;
import ru.bvn13.imdbspider.exceptions.processor.HtmlToXmlConvertionException;
import ru.bvn13.imdbspider.exceptions.processor.PatternEvaluationException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
/**
* @author boyko_vn at 09.01.2019
* @author boyko_vn at 10.01.2019
*/
public class HtmlProcessor {
public interface HtmlProcessor {
public String process(final String html, final String pattern) throws HtmlProcessorException {
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = null;
try {
db = dbf.newDocumentBuilder();
} catch (ParserConfigurationException e) {
throw new HtmlProcessorException(e);
}
Document xml = null;
try {
xml = db.parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)));
} catch (SAXException e) {
throw new HtmlToXmlConvertionException("Html parsing exception", e);
} catch (IOException e) {
throw new HtmlToXmlConvertionException("Html reading exception", e);
}
XPathFactory xpf = XPathFactory.newInstance();
XPath xpath = xpf.newXPath();
String result = null;
try {
result = (String) xpath.evaluate(pattern, xml, XPathConstants.STRING);
} catch (XPathExpressionException e) {
throw new PatternEvaluationException(String.format("Could not evaluate pattern: %s", pattern), e);
}
return result;
}
Elements process(final String html, final String pattern) throws HtmlProcessorException;
}

View File

@ -0,0 +1,20 @@
package ru.bvn13.imdbspider.spider.processor;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException;
/**
* @author boyko_vn at 10.01.2019
*/
public class JsoupHtmlProcessor implements HtmlProcessor {
@Override
public Elements process(String html, String pattern) throws HtmlProcessorException {
Document doc = Jsoup.parse(html, "UTF-8");
Elements result = doc.select(pattern);
return result;
}
}

View File

@ -1,7 +1,8 @@
package ru.bvn13.imdbspider.spider.tasker;
import ru.bvn13.imdbspider.exceptions.extractor.HtmlExtractorException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;
@ -18,7 +19,7 @@ public class Manager {
}
public List<Task> processTasks(List<Task> allTasks) throws ExecutionException, InterruptedException {
public void processTasks(List<Task> allTasks) throws ExecutionException, InterruptedException {
Map<String, List<Task>> groupedTasks = new ConcurrentHashMap<>(allTasks.size());
@ -35,24 +36,26 @@ public class Manager {
filteredTasks.add(task);
}
List<Task> result = Collections.synchronizedList(new ArrayList<>());
groupedTasks.entrySet().parallelStream().forEach(stringListEntry -> {
Future<List<Task>> r = executor.submit(new Worker(stringListEntry.getKey(), stringListEntry.getValue()));
while (!r.isDone()) {
Thread.yield();
}
Worker w = new Worker(stringListEntry.getKey(), stringListEntry.getValue());
try {
result.addAll(r.get());
} catch (InterruptedException e) {
e.printStackTrace();
} catch (ExecutionException e) {
w.run();
} catch (HtmlExtractorException e) {
e.printStackTrace();
}
});
List<Task> nextTasks = new ArrayList<>();
return result;
for (Task task : allTasks) {
if (task.hasNextTasks()) {
nextTasks.addAll(task.getNestedTasks());
}
}
if (!nextTasks.isEmpty()) {
processTasks(nextTasks);
}
}

View File

@ -1,8 +1,13 @@
package ru.bvn13.imdbspider.spider.tasker;
import org.jsoup.select.Elements;
import ru.bvn13.imdbspider.exceptions.ImdbSpiderException;
import ru.bvn13.imdbspider.imdb.DataType;
import java.util.ArrayList;
import java.util.List;
import java.util.function.BiConsumer;
/**
* @author boyko_vn at 09.01.2019
*/
@ -10,27 +15,35 @@ public class Task {
private String url;
private String xpathPattern;
private DataType dataType;
private String result;
private String cssSelector;
private Elements cssSelectorResult;
private Class resultType;
private Object result;
private ImdbSpiderException exception;
private BiConsumer<Task, String> postprocess;
private Task parentTask;
private List<Task> nestedTasks;
public Task() {
}
public Task(String xpathPattern) {
this.xpathPattern = xpathPattern;
public Task(String cssSelector) {
this.cssSelector = cssSelector;
}
public Task(String url, String xpathPattern) {
public Task(String url, String cssSelector) {
this.url = url;
this.xpathPattern = xpathPattern;
this.cssSelector = cssSelector;
}
public Task(String url, String xpathPattern, DataType dataType) {
public Task(String url, String cssSelector, DataType dataType) {
this.url = url;
this.xpathPattern = xpathPattern;
this.cssSelector = cssSelector;
this.dataType = dataType;
}
@ -38,39 +51,96 @@ public class Task {
return url;
}
public void setUrl(String url) {
public Task setUrl(String url) {
this.url = url;
return this;
}
public String getXpathPattern() {
return xpathPattern;
public String getCssSelector() {
return cssSelector;
}
public void setXpathPattern(String xpathPattern) {
this.xpathPattern = xpathPattern;
public Task setCssSelector(String cssSelector) {
this.cssSelector = cssSelector;
return this;
}
public DataType getDataType() {
return dataType;
}
public void setDataType(DataType dataType) {
public Task setDataType(DataType dataType) {
this.dataType = dataType;
return this;
}
public String getResult() {
return result;
public Elements getCssSelectorResult() {
return cssSelectorResult;
}
public void setResult(String result) {
this.result = result;
public Task setCssSelectorResult(Elements cssSelectorResult) {
this.cssSelectorResult = cssSelectorResult;
return this;
}
public ImdbSpiderException getException() {
return exception;
}
public void setException(ImdbSpiderException exception) {
public Task setException(ImdbSpiderException exception) {
this.exception = exception;
return this;
}
/**
 * @return the callback run after selector evaluation, or {@code null} if none was set
 */
public BiConsumer<Task, String> getPostprocess() {
return postprocess;
}
/**
 * Sets the callback run after selector evaluation; it receives this task and the page HTML.
 *
 * @return this task, to allow call chaining
 */
public Task setPostprocess(BiConsumer<Task, String> postprocess) {
this.postprocess = postprocess;
return this;
}
/**
 * @return the class recorded as the type of {@link #getResult()}, or {@code null} if unset
 */
public Class getResultType() {
return resultType;
}
/**
 * Records the class of the value stored in the result; not checked against the result itself.
 *
 * @return this task, to allow call chaining
 */
public Task setResultType(Class resultType) {
this.resultType = resultType;
return this;
}
/**
 * @return the stored result value, or {@code null} if none was set
 */
public Object getResult() {
return result;
}
/**
 * Stores the result value of this task.
 *
 * @return this task, to allow call chaining
 */
public Task setResult(Object result) {
this.result = result;
return this;
}
/**
 * Tells whether at least one nested task exists; unlike {@link #getNestedTasks()} this
 * never creates the nested-task list.
 */
public boolean hasNextTasks() {
return (nestedTasks != null && !nestedTasks.isEmpty());
}
/**
 * Returns the nested tasks, creating and storing an empty list on first access.
 *
 * @return the backing list itself, never {@code null}
 */
public List<Task> getNestedTasks() {
if (nestedTasks == null) {
nestedTasks = new ArrayList<>();
}
return nestedTasks;
}
/**
 * Replaces the nested-task list; the given list is stored by reference.
 *
 * @return this task, to allow call chaining
 */
public Task setNestedTasks(List<Task> nestedTasks) {
this.nestedTasks = nestedTasks;
return this;
}
/**
 * @return the task this one was derived from, or {@code null} if none was set
 */
public Task getParentTask() {
return parentTask;
}
/**
 * Records the task this one was derived from.
 *
 * @return this task, to allow call chaining
 */
public Task setParentTask(Task parentTask) {
this.parentTask = parentTask;
return this;
}
}

View File

@ -2,8 +2,10 @@ package ru.bvn13.imdbspider.spider.tasker;
import ru.bvn13.imdbspider.exceptions.ImdbSpiderException;
import ru.bvn13.imdbspider.exceptions.extractor.HtmlExtractorException;
import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException;
import ru.bvn13.imdbspider.spider.extractor.HtmlExtractor;
import ru.bvn13.imdbspider.spider.processor.HtmlProcessor;
import ru.bvn13.imdbspider.spider.processor.JsoupHtmlProcessor;
import java.util.List;
import java.util.concurrent.*;
@ -11,7 +13,7 @@ import java.util.concurrent.*;
/**
* @author boyko_vn at 09.01.2019
*/
public class Worker implements Callable<List<Task>> {
public class Worker {
private final String url;
private final List<Task> tasks;
@ -26,42 +28,33 @@ public class Worker implements Callable<List<Task>> {
this.tasks = tasks;
this.htmlExtractor = new HtmlExtractor();
this.htmlProcessor = new HtmlProcessor();
this.htmlProcessor = new JsoupHtmlProcessor();
this.executor = Executors.newCachedThreadPool();
}
@Override
public List<Task> call() throws Exception {
Future<String> result = executor.submit(() -> htmlExtractor.getHtml(url));
while (!result.isDone()) {
Thread.yield();
}
public Boolean run() throws HtmlExtractorException {
final String html;
try {
html = result.get();
} catch (InterruptedException e) {
throw new ImdbSpiderException("Interrupted", e);
} catch (ExecutionException e) {
throw new HtmlExtractorException("Exception has been occurred", e);
}
final String html = htmlExtractor.getHtml(url);
tasks.parallelStream().forEach(task -> {
Future<String> taskResult = executor.submit(() -> htmlProcessor.process(html, task.getXpathPattern()));
while (!taskResult.isDone()) {
Thread.yield();
}
try {
task.setResult(taskResult.get());
} catch (InterruptedException e) {
task.setException(new ImdbSpiderException("Interrupted", e));
} catch (ExecutionException e) {
task.setException(new ImdbSpiderException("Exception has been occurred", e));
if (task.getCssSelector() != null && !task.getCssSelector().isEmpty()) {
task.setCssSelectorResult(htmlProcessor.process(html, task.getCssSelector()));
}
if (task.getPostprocess() != null) {
task.getPostprocess().accept(task, html);
}
} catch (HtmlProcessorException e) {
task.setException(new ImdbSpiderException(e));
e.printStackTrace();
}
});
return tasks;
return true;
}
}

View File

@ -5,7 +5,9 @@ import static org.junit.Assert.assertTrue;
import org.junit.BeforeClass;
import org.junit.Test;
import ru.bvn13.imdbspider.ImdbSpider;
import ru.bvn13.imdbspider.exceptions.ImdbSpiderException;
import ru.bvn13.imdbspider.imdb.Movie;
import ru.bvn13.imdbspider.imdb.MovieList;
import java.util.List;
@ -21,6 +23,10 @@ public class AppTest
@Test
public void searchTerminatorTest() {
List<Movie> result = spider.searchMovieByTitle("Терминатор", 5);
try {
MovieList result = spider.searchMovieByTitle("test", 5);
} catch (ImdbSpiderException e) {
e.printStackTrace();
}
}
}