mirror of https://github.com/bvn13/imdb-spider.git
implemented searching movies by title and retrieving main movie data: id, title, year
parent
4205b7ad27
commit
50cb82135d
8
LICENSE
8
LICENSE
|
@ -24,12 +24,12 @@
|
|||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
|
@ -53,7 +53,7 @@
|
|||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
|
@ -156,7 +156,7 @@
|
|||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
|
|
|
@ -32,6 +32,14 @@
|
|||
<!--<version>1.1.6</version>-->
|
||||
<!--</dependency>-->
|
||||
|
||||
<dependency>
|
||||
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
|
||||
<groupId>org.jsoup</groupId>
|
||||
<artifactId>jsoup</artifactId>
|
||||
<version>1.11.3</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
|
|
|
@ -3,7 +3,11 @@ module imdb.spider.core {
|
|||
exports ru.bvn13.imdbspider.imdb;
|
||||
exports ru.bvn13.imdbspider.spider.tasker;
|
||||
exports ru.bvn13.imdbspider.exceptions;
|
||||
exports ru.bvn13.imdbspider.exceptions.api;
|
||||
exports ru.bvn13.imdbspider.exceptions.extractor;
|
||||
exports ru.bvn13.imdbspider.exceptions.processor;
|
||||
|
||||
requires java.xml;
|
||||
requires org.jsoup;
|
||||
|
||||
}
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
package ru.bvn13.imdbspider;
|
||||
|
||||
import ru.bvn13.imdbspider.exceptions.ImdbSpiderException;
|
||||
import ru.bvn13.imdbspider.exceptions.api.DataTypeNotSupportedException;
|
||||
import ru.bvn13.imdbspider.imdb.Movie;
|
||||
import ru.bvn13.imdbspider.imdb.MovieDataType;
|
||||
import ru.bvn13.imdbspider.imdb.MovieList;
|
||||
import ru.bvn13.imdbspider.imdb.MovieListDataType;
|
||||
import ru.bvn13.imdbspider.spider.api.v1_0.ApiFactory_1_0;
|
||||
import ru.bvn13.imdbspider.spider.composer.ImdbObjectComposerFactory;
|
||||
import ru.bvn13.imdbspider.spider.composer.MovieListComposer;
|
||||
import ru.bvn13.imdbspider.spider.tasker.Manager;
|
||||
import ru.bvn13.imdbspider.spider.tasker.Task;
|
||||
import ru.bvn13.imdbspider.spider.api.ApiFactory;
|
||||
|
@ -20,58 +24,59 @@ import java.util.concurrent.ExecutionException;
|
|||
*/
|
||||
public class ImdbSpider {
|
||||
|
||||
private static final String URL_MAIN = "https://www.imdb.com/";
|
||||
private static final String URL_SEARCH_TITLE = "https://www.imdb.com/find?ref_=nv_sr_fn&q={{title}}&s=tt";
|
||||
|
||||
private Manager manager;
|
||||
|
||||
private ApiFactory apiFactory;
|
||||
private ImdbObjectComposerFactory imdbObjectComposerFactory;
|
||||
|
||||
public static ImdbSpider withApi_1_0() {
|
||||
return new ImdbSpider(new ApiFactory_1_0());
|
||||
ApiFactory apiFactory = new ApiFactory_1_0();
|
||||
return new ImdbSpider(apiFactory, new ImdbObjectComposerFactory(apiFactory));
|
||||
}
|
||||
|
||||
|
||||
public ImdbSpider(ApiFactory apiFactory) {
|
||||
public ImdbSpider(ApiFactory apiFactory, ImdbObjectComposerFactory imdbObjectComposerFactory) {
|
||||
this.apiFactory = apiFactory;
|
||||
this.imdbObjectComposerFactory = imdbObjectComposerFactory;
|
||||
|
||||
manager = new Manager();
|
||||
|
||||
}
|
||||
|
||||
public List<Movie> searchMovieByTitle(String title) {
|
||||
public MovieList searchMovieByTitle(String title) throws ImdbSpiderException {
|
||||
return searchMovieByTitle(title, 10);
|
||||
}
|
||||
|
||||
public List<Movie> searchMovieByTitle(String title, int maxCount) {
|
||||
public MovieList searchMovieByTitle(String title, int maxCount) throws ImdbSpiderException {
|
||||
return searchMovieByTitle(title, maxCount, EnumSet.of(MovieDataType.TITLE));
|
||||
}
|
||||
|
||||
public List<Movie> searchMovieByTitle(String title, int maxCount, EnumSet<MovieDataType> dataTypes) {
|
||||
public MovieList searchMovieByTitle(String title, int maxCount, EnumSet<MovieDataType> dataTypes) throws ImdbSpiderException {
|
||||
|
||||
String url = URL_SEARCH_TITLE.replace("{{title}}", URLEncoder.encode(title, Charset.forName("utf-8")));
|
||||
|
||||
List<Task> tasks = new ArrayList<>();
|
||||
|
||||
for (MovieDataType mdt : MovieDataType.values()) {
|
||||
if (dataTypes.contains(mdt)) {
|
||||
try {
|
||||
tasks.add(apiFactory.taskByDataType(mdt));
|
||||
} catch (DataTypeNotSupportedException e) {
|
||||
//do nothing
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
try {
|
||||
Task t1 = apiFactory.taskByDataType(MovieListDataType.ELEMENTS);
|
||||
t1.setUrl(url);
|
||||
tasks.add(t1);
|
||||
} catch (DataTypeNotSupportedException e) {
|
||||
throw e;
|
||||
}
|
||||
|
||||
try {
|
||||
tasks = manager.processTasks(tasks);
|
||||
} catch (ExecutionException e) {
|
||||
e.printStackTrace();
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
manager.processTasks(tasks);
|
||||
} catch (ExecutionException | InterruptedException e) {
|
||||
throw new ImdbSpiderException("Error has been occurred!", e);
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
MovieListComposer movieListComposer = (MovieListComposer) imdbObjectComposerFactory.getComposer(MovieList.class);
|
||||
MovieList movieList = movieListComposer.compose(tasks.get(0));
|
||||
|
||||
return movieList;
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
package ru.bvn13.imdbspider.exceptions.composer;
|
||||
|
||||
import ru.bvn13.imdbspider.exceptions.ImdbSpiderException;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 10.01.2019
|
||||
*/
|
||||
public class ComposerNotFoundException extends ImdbSpiderException {
|
||||
|
||||
public ComposerNotFoundException() {
|
||||
}
|
||||
|
||||
public ComposerNotFoundException(String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
public ComposerNotFoundException(String message, Throwable cause) {
|
||||
super(message, cause);
|
||||
}
|
||||
|
||||
public ComposerNotFoundException(Throwable cause) {
|
||||
super(cause);
|
||||
}
|
||||
|
||||
public ComposerNotFoundException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) {
|
||||
super(message, cause, enableSuppression, writableStackTrace);
|
||||
}
|
||||
}
|
|
@ -1,26 +0,0 @@
|
|||
package ru.bvn13.imdbspider.exceptions.processor;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 09.01.2019
|
||||
*/
|
||||
public class HtmlToXmlConvertionException extends HtmlProcessorException {
|
||||
|
||||
public HtmlToXmlConvertionException() {
|
||||
}
|
||||
|
||||
public HtmlToXmlConvertionException(String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
public HtmlToXmlConvertionException(String message, Throwable cause) {
|
||||
super(message, cause);
|
||||
}
|
||||
|
||||
public HtmlToXmlConvertionException(Throwable cause) {
|
||||
super(cause);
|
||||
}
|
||||
|
||||
public HtmlToXmlConvertionException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) {
|
||||
super(message, cause, enableSuppression, writableStackTrace);
|
||||
}
|
||||
}
|
|
@ -5,14 +5,14 @@ package ru.bvn13.imdbspider.imdb;
|
|||
*/
|
||||
public class ImdbObject {
|
||||
|
||||
private int id;
|
||||
private String id;
|
||||
private String url;
|
||||
|
||||
public int getId() {
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(int id) {
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
|
|
|
@ -9,6 +9,8 @@ import java.util.concurrent.ConcurrentHashMap;
|
|||
public class Movie extends ImdbObject {
|
||||
|
||||
private String title;
|
||||
private String originalTitle;
|
||||
private Integer year;
|
||||
private Map<String, String> akas = new ConcurrentHashMap<>(50);
|
||||
|
||||
|
||||
|
@ -20,6 +22,22 @@ public class Movie extends ImdbObject {
|
|||
this.title = title;
|
||||
}
|
||||
|
||||
public String getOriginalTitle() {
|
||||
return originalTitle;
|
||||
}
|
||||
|
||||
public void setOriginalTitle(String originalTitle) {
|
||||
this.originalTitle = originalTitle;
|
||||
}
|
||||
|
||||
public Integer getYear() {
|
||||
return year;
|
||||
}
|
||||
|
||||
public void setYear(Integer year) {
|
||||
this.year = year;
|
||||
}
|
||||
|
||||
public Map<String, String> getAkas() {
|
||||
return akas;
|
||||
}
|
||||
|
|
|
@ -7,7 +7,10 @@ import java.util.EnumSet;
|
|||
*/
|
||||
public enum MovieDataType implements DataType {
|
||||
|
||||
TITLE("title")
|
||||
ID("id"),
|
||||
TITLE("title"),
|
||||
YEAR("year"),
|
||||
AKAS("akas")
|
||||
|
||||
;
|
||||
|
||||
|
|
|
@ -1,10 +1,23 @@
|
|||
package ru.bvn13.imdbspider.imdb;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 09.01.2019
|
||||
*/
|
||||
public class MovieList extends ImdbObject {
|
||||
|
||||
List<Movie> movies;
|
||||
|
||||
public List<Movie> getMovies() {
|
||||
if (movies == null) {
|
||||
movies = new ArrayList<>();
|
||||
}
|
||||
return movies;
|
||||
}
|
||||
|
||||
public void setMovies(List<Movie> movies) {
|
||||
this.movies = movies;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
package ru.bvn13.imdbspider.imdb;
|
||||
|
||||
import java.util.EnumSet;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 10.01.2019
|
||||
*/
|
||||
public enum MovieListDataType implements DataType {
|
||||
|
||||
ELEMENTS("element")
|
||||
;
|
||||
|
||||
private String value;
|
||||
|
||||
MovieListDataType(String v) {
|
||||
value = v;
|
||||
}
|
||||
|
||||
public static final EnumSet<MovieListDataType> ALL_DATA = EnumSet.allOf(MovieListDataType.class);
|
||||
|
||||
@Override
|
||||
public String get() {
|
||||
return value;
|
||||
}
|
||||
|
||||
}
|
|
@ -12,6 +12,6 @@ public interface ApiFactory {
|
|||
|
||||
Task taskByDataType(DataType dataType) throws DataTypeNotSupportedException;
|
||||
|
||||
void fulfillImdbObject(ImdbObject imdbObject, Task task);
|
||||
void fillUpImdbObject(ImdbObject imdbObject, Task task);
|
||||
|
||||
}
|
||||
|
|
|
@ -1,47 +1,145 @@
|
|||
package ru.bvn13.imdbspider.spider.api.v1_0;
|
||||
|
||||
import org.jsoup.nodes.Element;
|
||||
import ru.bvn13.imdbspider.exceptions.api.DataTypeNotSupportedException;
|
||||
import ru.bvn13.imdbspider.imdb.DataType;
|
||||
import ru.bvn13.imdbspider.imdb.ImdbObject;
|
||||
import ru.bvn13.imdbspider.imdb.Movie;
|
||||
import ru.bvn13.imdbspider.imdb.MovieDataType;
|
||||
import ru.bvn13.imdbspider.imdb.*;
|
||||
import ru.bvn13.imdbspider.spider.api.ApiFactory;
|
||||
import ru.bvn13.imdbspider.spider.tasker.Task;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 09.01.2019
|
||||
*/
|
||||
public class ApiFactory_1_0 implements ApiFactory {
|
||||
|
||||
private static final String URL_MAIN = "https://www.imdb.com";
|
||||
|
||||
private final Pattern PATTERN_MOVIE_ID_FROM_MOVIELIST = Pattern.compile("/title/tt(\\d+)/.*");
|
||||
|
||||
private EnumSet<MovieDataType> defaultMovieDataType = EnumSet.of(MovieDataType.ID, MovieDataType.TITLE, MovieDataType.YEAR);
|
||||
|
||||
@Override
|
||||
public Task taskByDataType(DataType dataType) throws DataTypeNotSupportedException {
|
||||
if (dataType instanceof MovieDataType) {
|
||||
return taskByMovieDataType((MovieDataType) dataType);
|
||||
} else if (dataType instanceof MovieListDataType) {
|
||||
return taskByMovieListDataType((MovieListDataType) dataType);
|
||||
} else {
|
||||
throw new DataTypeNotSupportedException(String.format("DataType %s not supported by API v1_0!", dataType.getClass().getName()));
|
||||
throw new DataTypeNotSupportedException(String.format("DataType %s is not supported by API v1_0!", dataType.getClass().getName()));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void fulfillImdbObject(ImdbObject imdbObject, Task task) {
|
||||
public void fillUpImdbObject(ImdbObject imdbObject, Task task) {
|
||||
if (imdbObject instanceof Movie) {
|
||||
if (task.getDataType() instanceof MovieDataType) {
|
||||
fulfillMovie((Movie) imdbObject, task);
|
||||
fillUpMovie((Movie) imdbObject, task);
|
||||
}
|
||||
} else if (imdbObject instanceof MovieList) {
|
||||
if (task.getDataType() instanceof MovieListDataType) {
|
||||
fillUpMovieList((MovieList) imdbObject, task);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Task taskByMovieDataType(MovieDataType movieDataType) {
|
||||
Task t = new Task();
|
||||
t.setDataType(movieDataType);
|
||||
switch (movieDataType) {
|
||||
case TITLE: return new Task();
|
||||
default: return null;
|
||||
case ID:
|
||||
t.setPostprocess((task, s) -> {
|
||||
Matcher matcher = PATTERN_MOVIE_ID_FROM_MOVIELIST.matcher(task.getUrl());
|
||||
if (matcher.find()) {
|
||||
task.setResultType(String.class);
|
||||
task.setResult(matcher.group(1));
|
||||
}
|
||||
});
|
||||
break;
|
||||
case TITLE:
|
||||
t.setCssSelector("#title-overview-widget > div.vital > div.title_block > div > div.titleBar > div.title_wrapper > h1");
|
||||
t.setPostprocess((task, s) -> {
|
||||
task.setResultType(String.class);
|
||||
task.setResult(task.getCssSelectorResult().first().wholeText().trim());
|
||||
});
|
||||
break;
|
||||
case YEAR:
|
||||
t.setCssSelector("#titleYear > a");
|
||||
t.setPostprocess((task, s) -> {
|
||||
task.setResultType(Integer.class);
|
||||
if (task.getCssSelectorResult().size() > 0) {
|
||||
try {
|
||||
task.setResult(Integer.parseInt(task.getCssSelectorResult().first().text().trim()));
|
||||
} catch (NumberFormatException e) {
|
||||
task.setResult(-1);
|
||||
}
|
||||
} else {
|
||||
task.setResult(-1);
|
||||
}
|
||||
});
|
||||
break;
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
private void fulfillMovie(Movie movie, Task task) {
|
||||
private Task taskByMovieListDataType(MovieListDataType movieListDataType) {
|
||||
Task t = new Task();
|
||||
t.setDataType(movieListDataType);
|
||||
switch (movieListDataType) {
|
||||
case ELEMENTS:
|
||||
t.setCssSelector("#main > div > div.findSection > table > tbody > tr > td.result_text");
|
||||
t.setResultType(List.class);
|
||||
t.setPostprocess((task, s) -> {
|
||||
for (Element element : task.getCssSelectorResult()) {
|
||||
Element link = element.select("a").first();
|
||||
if (!defaultMovieDataType.contains(MovieDataType.ID)) {
|
||||
defaultMovieDataType.add(MovieDataType.ID);
|
||||
}
|
||||
Task movieTask = this.taskByMovieDataType(MovieDataType.ID)
|
||||
.setParentTask(task)
|
||||
.setUrl(String.format("%s%s", URL_MAIN, link.attr("href")));
|
||||
task.getNestedTasks().add(movieTask);
|
||||
defaultMovieDataType.forEach(movieDataType -> movieTask.getNestedTasks().add(this.taskByMovieDataType(movieDataType)
|
||||
.setParentTask(movieTask)
|
||||
.setUrl(String.format("%s%s", URL_MAIN, link.attr("href")))));
|
||||
}
|
||||
});
|
||||
break;
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
private void fillUpMovie(Movie movie, Task task) {
|
||||
switch ((MovieDataType) task.getDataType()) {
|
||||
case TITLE: movie.setTitle(task.getResult()); break;
|
||||
case ID:
|
||||
movie.setUrl(task.getUrl());
|
||||
movie.setId((String) task.getResult());
|
||||
break;
|
||||
case TITLE:
|
||||
movie.setTitle((String) task.getResult());
|
||||
break;
|
||||
case YEAR:
|
||||
movie.setYear((Integer) task.getResult());
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
private void fillUpMovieList(MovieList movieList, Task task) {
|
||||
switch ((MovieListDataType) task.getDataType()) {
|
||||
case ELEMENTS:
|
||||
movieList.setUrl(task.getUrl());
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
public EnumSet<MovieDataType> getDefaultMovieDataType() {
|
||||
return defaultMovieDataType;
|
||||
}
|
||||
|
||||
public void setDefaultMovieDataType(EnumSet<MovieDataType> defaultMovieDataType) {
|
||||
this.defaultMovieDataType = defaultMovieDataType;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
package ru.bvn13.imdbspider.spider.composer;
|
||||
|
||||
import ru.bvn13.imdbspider.exceptions.ImdbSpiderException;
|
||||
import ru.bvn13.imdbspider.imdb.ImdbObject;
|
||||
import ru.bvn13.imdbspider.spider.tasker.Task;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 10.01.2019
|
||||
*/
|
||||
public interface ImdbObjectComposer<C extends ImdbObject> {
|
||||
|
||||
C compose(Task task) throws ImdbSpiderException;
|
||||
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
package ru.bvn13.imdbspider.spider.composer;
|
||||
|
||||
import ru.bvn13.imdbspider.exceptions.composer.ComposerNotFoundException;
|
||||
import ru.bvn13.imdbspider.imdb.ImdbObject;
|
||||
import ru.bvn13.imdbspider.imdb.MovieList;
|
||||
import ru.bvn13.imdbspider.spider.api.ApiFactory;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 10.01.2019
|
||||
*/
|
||||
public class ImdbObjectComposerFactory {
|
||||
|
||||
private ApiFactory apiFactory;
|
||||
|
||||
public ImdbObjectComposerFactory(ApiFactory apiFactory) {
|
||||
this.apiFactory = apiFactory;
|
||||
}
|
||||
|
||||
private MovieListComposer movieListComposer;
|
||||
|
||||
public <C extends ImdbObject> ImdbObjectComposer getComposer(Class<C> clazz) throws ComposerNotFoundException {
|
||||
if (clazz.isAssignableFrom(MovieList.class)) {
|
||||
if (movieListComposer == null) {
|
||||
movieListComposer = new MovieListComposer(apiFactory);
|
||||
return movieListComposer;
|
||||
}
|
||||
}
|
||||
|
||||
throw new ComposerNotFoundException(String.format("Composer not found: %s", clazz.getClass().getName()));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
package ru.bvn13.imdbspider.spider.composer;
|
||||
|
||||
import ru.bvn13.imdbspider.imdb.Movie;
|
||||
import ru.bvn13.imdbspider.imdb.MovieList;
|
||||
import ru.bvn13.imdbspider.spider.api.ApiFactory;
|
||||
import ru.bvn13.imdbspider.spider.tasker.Task;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 10.01.2019
|
||||
*/
|
||||
public class MovieListComposer implements ImdbObjectComposer<MovieList> {
|
||||
|
||||
private ApiFactory apiFactory;
|
||||
|
||||
public MovieListComposer(ApiFactory apiFactory) {
|
||||
this.apiFactory = apiFactory;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MovieList compose(Task task) {
|
||||
MovieList movieList = new MovieList();
|
||||
apiFactory.fillUpImdbObject(movieList, task);
|
||||
|
||||
for (Task movieTask : task.getNestedTasks()) {
|
||||
Movie movie = new Movie();
|
||||
movieList.getMovies().add(movie);
|
||||
apiFactory.fillUpImdbObject(movie, movieTask);
|
||||
for (Task nestedTask : movieTask.getNestedTasks()) {
|
||||
apiFactory.fillUpImdbObject(movie, nestedTask);
|
||||
}
|
||||
}
|
||||
return movieList;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,55 +1,13 @@
|
|||
package ru.bvn13.imdbspider.spider.processor;
|
||||
|
||||
|
||||
import org.w3c.dom.Document;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.jsoup.select.Elements;
|
||||
import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException;
|
||||
import ru.bvn13.imdbspider.exceptions.processor.HtmlToXmlConvertionException;
|
||||
import ru.bvn13.imdbspider.exceptions.processor.PatternEvaluationException;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.xpath.XPath;
|
||||
import javax.xml.xpath.XPathConstants;
|
||||
import javax.xml.xpath.XPathExpressionException;
|
||||
import javax.xml.xpath.XPathFactory;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 09.01.2019
|
||||
* @author boyko_vn at 10.01.2019
|
||||
*/
|
||||
public class HtmlProcessor {
|
||||
public interface HtmlProcessor {
|
||||
|
||||
public String process(final String html, final String pattern) throws HtmlProcessorException {
|
||||
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder db = null;
|
||||
try {
|
||||
db = dbf.newDocumentBuilder();
|
||||
} catch (ParserConfigurationException e) {
|
||||
throw new HtmlProcessorException(e);
|
||||
}
|
||||
Document xml = null;
|
||||
try {
|
||||
xml = db.parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)));
|
||||
} catch (SAXException e) {
|
||||
throw new HtmlToXmlConvertionException("Html parsing exception", e);
|
||||
} catch (IOException e) {
|
||||
throw new HtmlToXmlConvertionException("Html reading exception", e);
|
||||
}
|
||||
|
||||
XPathFactory xpf = XPathFactory.newInstance();
|
||||
XPath xpath = xpf.newXPath();
|
||||
String result = null;
|
||||
try {
|
||||
result = (String) xpath.evaluate(pattern, xml, XPathConstants.STRING);
|
||||
} catch (XPathExpressionException e) {
|
||||
throw new PatternEvaluationException(String.format("Could not evaluate pattern: %s", pattern), e);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
Elements process(final String html, final String pattern) throws HtmlProcessorException;
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
package ru.bvn13.imdbspider.spider.processor;
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.select.Elements;
|
||||
import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 10.01.2019
|
||||
*/
|
||||
public class JsoupHtmlProcessor implements HtmlProcessor {
|
||||
|
||||
@Override
|
||||
public Elements process(String html, String pattern) throws HtmlProcessorException {
|
||||
Document doc = Jsoup.parse(html, "UTF-8");
|
||||
Elements result = doc.select(pattern);
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,7 +1,8 @@
|
|||
package ru.bvn13.imdbspider.spider.tasker;
|
||||
|
||||
import ru.bvn13.imdbspider.exceptions.extractor.HtmlExtractorException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.*;
|
||||
|
@ -18,7 +19,7 @@ public class Manager {
|
|||
}
|
||||
|
||||
|
||||
public List<Task> processTasks(List<Task> allTasks) throws ExecutionException, InterruptedException {
|
||||
public void processTasks(List<Task> allTasks) throws ExecutionException, InterruptedException {
|
||||
|
||||
Map<String, List<Task>> groupedTasks = new ConcurrentHashMap<>(allTasks.size());
|
||||
|
||||
|
@ -35,24 +36,26 @@ public class Manager {
|
|||
filteredTasks.add(task);
|
||||
}
|
||||
|
||||
List<Task> result = Collections.synchronizedList(new ArrayList<>());
|
||||
|
||||
groupedTasks.entrySet().parallelStream().forEach(stringListEntry -> {
|
||||
Future<List<Task>> r = executor.submit(new Worker(stringListEntry.getKey(), stringListEntry.getValue()));
|
||||
while (!r.isDone()) {
|
||||
Thread.yield();
|
||||
}
|
||||
Worker w = new Worker(stringListEntry.getKey(), stringListEntry.getValue());
|
||||
try {
|
||||
result.addAll(r.get());
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
} catch (ExecutionException e) {
|
||||
w.run();
|
||||
} catch (HtmlExtractorException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
|
||||
List<Task> nextTasks = new ArrayList<>();
|
||||
|
||||
return result;
|
||||
for (Task task : allTasks) {
|
||||
if (task.hasNextTasks()) {
|
||||
nextTasks.addAll(task.getNestedTasks());
|
||||
}
|
||||
}
|
||||
|
||||
if (!nextTasks.isEmpty()) {
|
||||
processTasks(nextTasks);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -1,8 +1,13 @@
|
|||
package ru.bvn13.imdbspider.spider.tasker;
|
||||
|
||||
import org.jsoup.select.Elements;
|
||||
import ru.bvn13.imdbspider.exceptions.ImdbSpiderException;
|
||||
import ru.bvn13.imdbspider.imdb.DataType;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.BiConsumer;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 09.01.2019
|
||||
*/
|
||||
|
@ -10,27 +15,35 @@ public class Task {
|
|||
|
||||
private String url;
|
||||
|
||||
private String xpathPattern;
|
||||
private DataType dataType;
|
||||
private String result;
|
||||
private String cssSelector;
|
||||
private Elements cssSelectorResult;
|
||||
|
||||
private Class resultType;
|
||||
private Object result;
|
||||
|
||||
private ImdbSpiderException exception;
|
||||
|
||||
private BiConsumer<Task, String> postprocess;
|
||||
|
||||
private Task parentTask;
|
||||
private List<Task> nestedTasks;
|
||||
|
||||
public Task() {
|
||||
}
|
||||
|
||||
public Task(String xpathPattern) {
|
||||
this.xpathPattern = xpathPattern;
|
||||
public Task(String cssSelector) {
|
||||
this.cssSelector = cssSelector;
|
||||
}
|
||||
|
||||
public Task(String url, String xpathPattern) {
|
||||
public Task(String url, String cssSelector) {
|
||||
this.url = url;
|
||||
this.xpathPattern = xpathPattern;
|
||||
this.cssSelector = cssSelector;
|
||||
}
|
||||
|
||||
public Task(String url, String xpathPattern, DataType dataType) {
|
||||
public Task(String url, String cssSelector, DataType dataType) {
|
||||
this.url = url;
|
||||
this.xpathPattern = xpathPattern;
|
||||
this.cssSelector = cssSelector;
|
||||
this.dataType = dataType;
|
||||
}
|
||||
|
||||
|
@ -38,39 +51,96 @@ public class Task {
|
|||
return url;
|
||||
}
|
||||
|
||||
public void setUrl(String url) {
|
||||
public Task setUrl(String url) {
|
||||
this.url = url;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getXpathPattern() {
|
||||
return xpathPattern;
|
||||
public String getCssSelector() {
|
||||
return cssSelector;
|
||||
}
|
||||
|
||||
public void setXpathPattern(String xpathPattern) {
|
||||
this.xpathPattern = xpathPattern;
|
||||
public Task setCssSelector(String cssSelector) {
|
||||
this.cssSelector = cssSelector;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DataType getDataType() {
|
||||
return dataType;
|
||||
}
|
||||
|
||||
public void setDataType(DataType dataType) {
|
||||
public Task setDataType(DataType dataType) {
|
||||
this.dataType = dataType;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getResult() {
|
||||
return result;
|
||||
public Elements getCssSelectorResult() {
|
||||
return cssSelectorResult;
|
||||
}
|
||||
|
||||
public void setResult(String result) {
|
||||
this.result = result;
|
||||
public Task setCssSelectorResult(Elements cssSelectorResult) {
|
||||
this.cssSelectorResult = cssSelectorResult;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ImdbSpiderException getException() {
|
||||
return exception;
|
||||
}
|
||||
|
||||
public void setException(ImdbSpiderException exception) {
|
||||
public Task setException(ImdbSpiderException exception) {
|
||||
this.exception = exception;
|
||||
return this;
|
||||
}
|
||||
|
||||
public BiConsumer<Task, String> getPostprocess() {
|
||||
return postprocess;
|
||||
}
|
||||
|
||||
public Task setPostprocess(BiConsumer<Task, String> postprocess) {
|
||||
this.postprocess = postprocess;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Class getResultType() {
|
||||
return resultType;
|
||||
}
|
||||
|
||||
public Task setResultType(Class resultType) {
|
||||
this.resultType = resultType;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Object getResult() {
|
||||
return result;
|
||||
}
|
||||
|
||||
public Task setResult(Object result) {
|
||||
this.result = result;
|
||||
return this;
|
||||
}
|
||||
|
||||
public boolean hasNextTasks() {
|
||||
return (nestedTasks != null && !nestedTasks.isEmpty());
|
||||
}
|
||||
|
||||
public List<Task> getNestedTasks() {
|
||||
if (nestedTasks == null) {
|
||||
nestedTasks = new ArrayList<>();
|
||||
}
|
||||
return nestedTasks;
|
||||
}
|
||||
|
||||
public Task setNestedTasks(List<Task> nestedTasks) {
|
||||
this.nestedTasks = nestedTasks;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Task getParentTask() {
|
||||
return parentTask;
|
||||
}
|
||||
|
||||
public Task setParentTask(Task parentTask) {
|
||||
this.parentTask = parentTask;
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,8 +2,10 @@ package ru.bvn13.imdbspider.spider.tasker;
|
|||
|
||||
import ru.bvn13.imdbspider.exceptions.ImdbSpiderException;
|
||||
import ru.bvn13.imdbspider.exceptions.extractor.HtmlExtractorException;
|
||||
import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException;
|
||||
import ru.bvn13.imdbspider.spider.extractor.HtmlExtractor;
|
||||
import ru.bvn13.imdbspider.spider.processor.HtmlProcessor;
|
||||
import ru.bvn13.imdbspider.spider.processor.JsoupHtmlProcessor;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.*;
|
||||
|
@ -11,7 +13,7 @@ import java.util.concurrent.*;
|
|||
/**
|
||||
* @author boyko_vn at 09.01.2019
|
||||
*/
|
||||
public class Worker implements Callable<List<Task>> {
|
||||
public class Worker {
|
||||
|
||||
private final String url;
|
||||
private final List<Task> tasks;
|
||||
|
@ -26,42 +28,33 @@ public class Worker implements Callable<List<Task>> {
|
|||
this.tasks = tasks;
|
||||
|
||||
this.htmlExtractor = new HtmlExtractor();
|
||||
this.htmlProcessor = new HtmlProcessor();
|
||||
this.htmlProcessor = new JsoupHtmlProcessor();
|
||||
|
||||
this.executor = Executors.newCachedThreadPool();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Task> call() throws Exception {
|
||||
Future<String> result = executor.submit(() -> htmlExtractor.getHtml(url));
|
||||
while (!result.isDone()) {
|
||||
Thread.yield();
|
||||
}
|
||||
public Boolean run() throws HtmlExtractorException {
|
||||
|
||||
final String html;
|
||||
try {
|
||||
html = result.get();
|
||||
} catch (InterruptedException e) {
|
||||
throw new ImdbSpiderException("Interrupted", e);
|
||||
} catch (ExecutionException e) {
|
||||
throw new HtmlExtractorException("Exception has been occurred", e);
|
||||
}
|
||||
final String html = htmlExtractor.getHtml(url);
|
||||
|
||||
tasks.parallelStream().forEach(task -> {
|
||||
Future<String> taskResult = executor.submit(() -> htmlProcessor.process(html, task.getXpathPattern()));
|
||||
while (!taskResult.isDone()) {
|
||||
Thread.yield();
|
||||
}
|
||||
|
||||
try {
|
||||
task.setResult(taskResult.get());
|
||||
} catch (InterruptedException e) {
|
||||
task.setException(new ImdbSpiderException("Interrupted", e));
|
||||
} catch (ExecutionException e) {
|
||||
task.setException(new ImdbSpiderException("Exception has been occurred", e));
|
||||
if (task.getCssSelector() != null && !task.getCssSelector().isEmpty()) {
|
||||
task.setCssSelectorResult(htmlProcessor.process(html, task.getCssSelector()));
|
||||
}
|
||||
|
||||
if (task.getPostprocess() != null) {
|
||||
task.getPostprocess().accept(task, html);
|
||||
}
|
||||
} catch (HtmlProcessorException e) {
|
||||
task.setException(new ImdbSpiderException(e));
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
return tasks;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,9 @@ import static org.junit.Assert.assertTrue;
|
|||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
import ru.bvn13.imdbspider.ImdbSpider;
|
||||
import ru.bvn13.imdbspider.exceptions.ImdbSpiderException;
|
||||
import ru.bvn13.imdbspider.imdb.Movie;
|
||||
import ru.bvn13.imdbspider.imdb.MovieList;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
|
@ -21,6 +23,10 @@ public class AppTest
|
|||
|
||||
@Test
|
||||
public void searchTerminatorTest() {
|
||||
List<Movie> result = spider.searchMovieByTitle("Терминатор", 5);
|
||||
try {
|
||||
MovieList result = spider.searchMovieByTitle("test", 5);
|
||||
} catch (ImdbSpiderException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue