2019-01-09 17:57:37 +03:00
|
|
|
package ru.bvn13.imdbspider.spider.api.v1_0;
|
|
|
|
|
2019-01-10 18:14:57 +03:00
|
|
|
import org.jsoup.nodes.Element;
|
2019-01-09 17:57:37 +03:00
|
|
|
import ru.bvn13.imdbspider.exceptions.api.DataTypeNotSupportedException;
|
2019-01-10 18:14:57 +03:00
|
|
|
import ru.bvn13.imdbspider.imdb.*;
|
2019-01-09 17:57:37 +03:00
|
|
|
import ru.bvn13.imdbspider.spider.api.ApiFactory;
|
|
|
|
import ru.bvn13.imdbspider.spider.tasker.Task;
|
|
|
|
|
2019-01-10 18:14:57 +03:00
|
|
|
import java.util.EnumSet;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.regex.Matcher;
|
|
|
|
import java.util.regex.Pattern;
|
|
|
|
|
2019-01-09 17:57:37 +03:00
|
|
|
/**
|
|
|
|
* @author boyko_vn at 09.01.2019
|
|
|
|
*/
|
|
|
|
public class ApiFactory_1_0 implements ApiFactory {
|
|
|
|
|
2019-01-10 18:14:57 +03:00
|
|
|
private static final String URL_MAIN = "https://www.imdb.com";
|
|
|
|
|
|
|
|
private final Pattern PATTERN_MOVIE_ID_FROM_MOVIELIST = Pattern.compile("/title/tt(\\d+)/.*");
|
|
|
|
|
|
|
|
private EnumSet<MovieDataType> defaultMovieDataType = EnumSet.of(MovieDataType.ID, MovieDataType.TITLE, MovieDataType.YEAR);
|
|
|
|
|
2019-01-09 17:57:37 +03:00
|
|
|
@Override
|
|
|
|
public Task taskByDataType(DataType dataType) throws DataTypeNotSupportedException {
|
|
|
|
if (dataType instanceof MovieDataType) {
|
|
|
|
return taskByMovieDataType((MovieDataType) dataType);
|
2019-01-10 18:14:57 +03:00
|
|
|
} else if (dataType instanceof MovieListDataType) {
|
|
|
|
return taskByMovieListDataType((MovieListDataType) dataType);
|
2019-01-09 17:57:37 +03:00
|
|
|
} else {
|
2019-01-10 18:14:57 +03:00
|
|
|
throw new DataTypeNotSupportedException(String.format("DataType %s is not supported by API v1_0!", dataType.getClass().getName()));
|
2019-01-09 17:57:37 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
2019-01-10 18:14:57 +03:00
|
|
|
public void fillUpImdbObject(ImdbObject imdbObject, Task task) {
|
2019-01-09 17:57:37 +03:00
|
|
|
if (imdbObject instanceof Movie) {
|
|
|
|
if (task.getDataType() instanceof MovieDataType) {
|
2019-01-10 18:14:57 +03:00
|
|
|
fillUpMovie((Movie) imdbObject, task);
|
|
|
|
}
|
|
|
|
} else if (imdbObject instanceof MovieList) {
|
|
|
|
if (task.getDataType() instanceof MovieListDataType) {
|
|
|
|
fillUpMovieList((MovieList) imdbObject, task);
|
2019-01-09 17:57:37 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private Task taskByMovieDataType(MovieDataType movieDataType) {
|
2019-01-10 18:14:57 +03:00
|
|
|
Task t = new Task();
|
|
|
|
t.setDataType(movieDataType);
|
2019-01-09 17:57:37 +03:00
|
|
|
switch (movieDataType) {
|
2019-01-10 18:14:57 +03:00
|
|
|
case ID:
|
|
|
|
t.setPostprocess((task, s) -> {
|
|
|
|
Matcher matcher = PATTERN_MOVIE_ID_FROM_MOVIELIST.matcher(task.getUrl());
|
|
|
|
if (matcher.find()) {
|
|
|
|
task.setResultType(String.class);
|
|
|
|
task.setResult(matcher.group(1));
|
|
|
|
}
|
|
|
|
});
|
|
|
|
break;
|
|
|
|
case TITLE:
|
|
|
|
t.setCssSelector("#title-overview-widget > div.vital > div.title_block > div > div.titleBar > div.title_wrapper > h1");
|
|
|
|
t.setPostprocess((task, s) -> {
|
|
|
|
task.setResultType(String.class);
|
|
|
|
task.setResult(task.getCssSelectorResult().first().wholeText().trim());
|
|
|
|
});
|
|
|
|
break;
|
|
|
|
case YEAR:
|
|
|
|
t.setCssSelector("#titleYear > a");
|
|
|
|
t.setPostprocess((task, s) -> {
|
|
|
|
task.setResultType(Integer.class);
|
|
|
|
if (task.getCssSelectorResult().size() > 0) {
|
|
|
|
try {
|
|
|
|
task.setResult(Integer.parseInt(task.getCssSelectorResult().first().text().trim()));
|
|
|
|
} catch (NumberFormatException e) {
|
|
|
|
task.setResult(-1);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
task.setResult(-1);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
|
|
|
|
private Task taskByMovieListDataType(MovieListDataType movieListDataType) {
|
|
|
|
Task t = new Task();
|
|
|
|
t.setDataType(movieListDataType);
|
|
|
|
switch (movieListDataType) {
|
|
|
|
case ELEMENTS:
|
|
|
|
t.setCssSelector("#main > div > div.findSection > table > tbody > tr > td.result_text");
|
|
|
|
t.setResultType(List.class);
|
|
|
|
t.setPostprocess((task, s) -> {
|
2019-01-11 10:08:50 +03:00
|
|
|
int count = 0;
|
2019-01-10 18:14:57 +03:00
|
|
|
for (Element element : task.getCssSelectorResult()) {
|
2019-01-11 10:08:50 +03:00
|
|
|
count++;
|
|
|
|
if (task.getRestrictionByCount() != null) {
|
|
|
|
if (count > task.getRestrictionByCount()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2019-01-10 18:14:57 +03:00
|
|
|
Element link = element.select("a").first();
|
|
|
|
if (!defaultMovieDataType.contains(MovieDataType.ID)) {
|
|
|
|
defaultMovieDataType.add(MovieDataType.ID);
|
|
|
|
}
|
|
|
|
Task movieTask = this.taskByMovieDataType(MovieDataType.ID)
|
|
|
|
.setParentTask(task)
|
|
|
|
.setUrl(String.format("%s%s", URL_MAIN, link.attr("href")));
|
|
|
|
task.getNestedTasks().add(movieTask);
|
|
|
|
defaultMovieDataType.forEach(movieDataType -> movieTask.getNestedTasks().add(this.taskByMovieDataType(movieDataType)
|
|
|
|
.setParentTask(movieTask)
|
|
|
|
.setUrl(String.format("%s%s", URL_MAIN, link.attr("href")))));
|
|
|
|
}
|
|
|
|
});
|
|
|
|
break;
|
2019-01-09 17:57:37 +03:00
|
|
|
}
|
2019-01-10 18:14:57 +03:00
|
|
|
return t;
|
2019-01-09 17:57:37 +03:00
|
|
|
}
|
|
|
|
|
2019-01-10 18:14:57 +03:00
|
|
|
private void fillUpMovie(Movie movie, Task task) {
|
2019-01-09 17:57:37 +03:00
|
|
|
switch ((MovieDataType) task.getDataType()) {
|
2019-01-10 18:14:57 +03:00
|
|
|
case ID:
|
|
|
|
movie.setUrl(task.getUrl());
|
|
|
|
movie.setId((String) task.getResult());
|
|
|
|
break;
|
|
|
|
case TITLE:
|
|
|
|
movie.setTitle((String) task.getResult());
|
|
|
|
break;
|
|
|
|
case YEAR:
|
|
|
|
movie.setYear((Integer) task.getResult());
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private void fillUpMovieList(MovieList movieList, Task task) {
|
|
|
|
switch ((MovieListDataType) task.getDataType()) {
|
|
|
|
case ELEMENTS:
|
|
|
|
movieList.setUrl(task.getUrl());
|
|
|
|
break;
|
2019-01-09 17:57:37 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-01-10 18:14:57 +03:00
|
|
|
public EnumSet<MovieDataType> getDefaultMovieDataType() {
|
|
|
|
return defaultMovieDataType;
|
|
|
|
}
|
|
|
|
|
|
|
|
public void setDefaultMovieDataType(EnumSet<MovieDataType> defaultMovieDataType) {
|
|
|
|
this.defaultMovieDataType = defaultMovieDataType;
|
|
|
|
}
|
2019-01-09 17:57:37 +03:00
|
|
|
}
|