package ru.bvn13.imdbspider.spider.api.v1_0; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; import ru.bvn13.imdbspider.exceptions.api.DataTypeNotSupportedException; import ru.bvn13.imdbspider.imdb.*; import ru.bvn13.imdbspider.imdb.accessories.Link; import ru.bvn13.imdbspider.spider.api.ApiFactory; import ru.bvn13.imdbspider.spider.processor.HtmlProcessor; import ru.bvn13.imdbspider.spider.tasker.Task; import java.net.URLEncoder; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.EnumSet; import java.util.List; import java.util.function.BiConsumer; import java.util.regex.Pattern; /** * @author boyko_vn at 09.01.2019 * * IMDB :: SPIDER :: API :: version 1.0 (started 09.01.2019) */ public class ApiFactory_1_0 implements ApiFactory { static final String URL_MAIN = "https://www.imdb.com"; private static final String URL_SEARCH_TITLE = "https://www.imdb.com/find?ref_=nv_sr_fn&q={{title}}&s=tt"; static final Pattern PATTERN_MOVIE_ID_FROM_MOVIELIST = Pattern.compile("/title/tt(\\d+)/.*"); private EnumSet defaultMovieDataTypeSet = EnumSet.of(MovieDataType.ID, MovieDataType.TITLE, MovieDataType.YEAR); private EnumSet movieDataTypeSet; private HtmlProcessor htmlProcessor; static class POSTPROCESS { static final BiConsumer GET_TEXT_OF_FIRST_ELEMENT = (task, s) -> { task.setResultType(String.class); if (task.getCssSelectorResult().size() > 0) { task.setResult(task.getCssSelectorResult().first().text().trim()); } else { task.setResult(""); } }; static final BiConsumer GET_OWN_TEXT_OF_FIRST_ELEMENT = (task, s) -> { task.setResultType(String.class); if (task.getCssSelectorResult().size() > 0) { task.setResult(task.getCssSelectorResult().first().ownText().trim()); } else { task.setResult(""); } }; static final BiConsumer GET_WHOLE_TEXT_OF_FIRST_ELEMENT = (task, s) -> { task.setResultType(String.class); if (task.getCssSelectorResult().size() > 0) { task.setResult(task.getCssSelectorResult().first().wholeText().trim()); } else { task.setResult(""); } }; static final BiConsumer GET_OWN_TEXT_OF_PARENT_MODE = (task, s) -> { task.setResultType(String.class); if (task.getCssSelectorResult().size() > 0) { task.setResult(task.getCssSelectorResult().first().parent().ownText().trim()); } else { task.setResult(""); } }; static final BiConsumer COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE = (task, s) -> { task.setResultType(List.class); List titles = new ArrayList<>(); if (task.getCssSelectorResult().size() > 0) { for (Element title : task.getCssSelectorResult().first().parent().select("a")) { titles.add(title.text().trim()); } } task.setResult(titles); }; static final BiConsumer COLLECT_ALL_NESTED_LINKS_OF_PARENT_NODE = (task, s) -> { task.setResultType(List.class); List titles = new ArrayList<>(); if (task.getCssSelectorResult().size() > 0) { for (Element link : task.getCssSelectorResult().first().parent().select("a")) { final String url = link.attr("href").trim(); titles.add(new Link() .setTitle(link.text().trim()) .setUrl((url.startsWith("/") ? String.format("%s%s", URL_MAIN, url) : url)) ); } } task.setResult(titles); }; static final BiConsumer GET_TITLE_OF_FIRST_LINK_IN_PARENT_MODE = (task, s) -> { task.setResultType(String.class); task.setResult(""); if (task.getCssSelectorResult().size() > 0) { Elements links = task.getCssSelectorResult().first().parent().select("a"); if (links.size() > 0) { task.setResult(links.first().text().trim()); } } }; } private MovieListProcessor_1_0 movieListProcessor; private MovieProcessor_1_0 movieProcessor; private TaglineListProcessor_1_0 taglineListProcessor; private TaglineProcessor_1_0 taglineProcessor; public ApiFactory_1_0(HtmlProcessor htmlProcessor) { this.htmlProcessor = htmlProcessor; this.movieListProcessor = new MovieListProcessor_1_0(this); this.movieProcessor = new MovieProcessor_1_0(this); this.taglineListProcessor = new TaglineListProcessor_1_0(this); this.taglineProcessor = new TaglineProcessor_1_0(this); } @Override public List createTasksForSearchMovieByTitle(String title, int maxCount, EnumSet dataTypes) throws ImdbSpiderException { setMovieDataTypeSet(dataTypes); String url = URL_SEARCH_TITLE.replace("{{title}}", URLEncoder.encode(title, Charset.forName("utf-8"))); List tasks = new ArrayList<>(); try { Task t1 = taskByDataType(MovieListDataType.ELEMENTS); t1.setUrl(url); if (maxCount > 0) { t1.setRestrictionByCount(maxCount); } tasks.add(t1); } catch (DataTypeNotSupportedException e) { throw e; } return tasks; } @Override public Task taskByDataType(DataType dataType) throws DataTypeNotSupportedException { if (dataType instanceof MovieDataType) { return movieProcessor.taskByMovieDataType((MovieDataType) dataType); } else if (dataType instanceof MovieListDataType) { return movieListProcessor.taskByMovieListDataType((MovieListDataType) dataType); } else if (dataType instanceof TaglineListDataType) { return taglineListProcessor.taskByTaglineListDataType((TaglineListDataType) dataType); } else if (dataType instanceof TaglineDataType) { return taglineProcessor.taskByTaglineDataType((TaglineDataType) dataType); } else { throw new DataTypeNotSupportedException(String.format("DataType %s is not supported by API v1_0!", dataType.getClass().getName())); } } @Override public void fillUpImdbObject(ImdbObject imdbObject, Task task) { if (imdbObject instanceof Movie) { if (task.getDataType() instanceof MovieDataType) { movieProcessor.fillUpMovie((Movie) imdbObject, task); } } else if (imdbObject instanceof MovieList) { if (task.getDataType() instanceof MovieListDataType) { movieListProcessor.fillUpMovieList((MovieList) imdbObject, task); } } else if (imdbObject instanceof TaglineList) { if (task.getDataType() instanceof TaglineListDataType) { taglineListProcessor.fillUpTaglineList((TaglineList) imdbObject, task); } } else if (imdbObject instanceof Tagline) { if (task.getDataType() instanceof TaglineDataType) { taglineProcessor.fillUpTagline((Tagline) imdbObject, task); } } } @Override public EnumSet getDefaultMovieDataTypeSet() { return defaultMovieDataTypeSet; } @Override public void setMovieDataTypeSet(EnumSet movieDataTypeSet) { this.movieDataTypeSet = movieDataTypeSet; } @Override public EnumSet getMovieDataTypeSet() { return movieDataTypeSet; } HtmlProcessor getHtmlProcessor() { return htmlProcessor; } MovieProcessor_1_0 getMovieProcessor() { return movieProcessor; } TaglineProcessor_1_0 getTaglineProcessor() { return taglineProcessor; } TaglineListProcessor_1_0 getTaglineListProcessor() { return taglineListProcessor; } }