diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/AbstractApiProcessor_1_0.java b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/AbstractApiProcessor_1_0.java new file mode 100644 index 0000000..3189626 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/AbstractApiProcessor_1_0.java @@ -0,0 +1,17 @@ +package ru.bvn13.imdbspider.spider.api.v1_0; + +/** + * Common base of the v1.0 API processors: keeps the {@link ApiFactory_1_0} passed to the constructor and returns it from {@link #getApiFactory()}, which is how subclasses reach the other processors and the shared constants. + * + * @author boyko_vn at 15.01.2019 + */ +abstract public class AbstractApiProcessor_1_0 { + + private ApiFactory_1_0 apiFactory; + + public AbstractApiProcessor_1_0(ApiFactory_1_0 apiFactory) { + this.apiFactory = apiFactory; + } + + public ApiFactory_1_0 getApiFactory() { + return apiFactory; + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java index 39e6060..fe75d4a 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java @@ -4,10 +4,8 @@ import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; import ru.bvn13.imdbspider.exceptions.api.DataTypeNotSupportedException; -import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException; import ru.bvn13.imdbspider.imdb.*; import ru.bvn13.imdbspider.imdb.accessories.Link; -import ru.bvn13.imdbspider.imdb.accessories.SoundMix; import ru.bvn13.imdbspider.spider.api.ApiFactory; import ru.bvn13.imdbspider.spider.processor.HtmlProcessor; import ru.bvn13.imdbspider.spider.tasker.Task; @@ -17,9 +15,7 @@ import java.nio.charset.Charset; import java.util.ArrayList; import java.util.EnumSet; import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; import java.util.function.BiConsumer; -import java.util.regex.Matcher; import java.util.regex.Pattern; /** @@ -29,18 +25,18 @@ import java.util.regex.Pattern; */ public 
class ApiFactory_1_0 implements ApiFactory { - public static final String URL_MAIN = "https://www.imdb.com"; + static final String URL_MAIN = "https://www.imdb.com"; private static final String URL_SEARCH_TITLE = "https://www.imdb.com/find?ref_=nv_sr_fn&q={{title}}&s=tt"; - private final Pattern PATTERN_MOVIE_ID_FROM_MOVIELIST = Pattern.compile("/title/tt(\\d+)/.*"); + static final Pattern PATTERN_MOVIE_ID_FROM_MOVIELIST = Pattern.compile("/title/tt(\\d+)/.*"); private EnumSet defaultMovieDataTypeSet = EnumSet.of(MovieDataType.ID, MovieDataType.TITLE, MovieDataType.YEAR); private EnumSet movieDataTypeSet; private HtmlProcessor htmlProcessor; - private static class POSTPROCESS { + static class POSTPROCESS { static final BiConsumer GET_TEXT_OF_FIRST_ELEMENT = (task, s) -> { task.setResultType(String.class); @@ -118,8 +114,18 @@ public class ApiFactory_1_0 implements ApiFactory { } + private MovieListProcessor_1_0 movieListProcessor; + private MovieProcessor_1_0 movieProcessor; + private TaglineListProcessor_1_0 taglineListProcessor; + private TaglineProcessor_1_0 taglineProcessor; + public ApiFactory_1_0(HtmlProcessor htmlProcessor) { this.htmlProcessor = htmlProcessor; + + this.movieListProcessor = new MovieListProcessor_1_0(this); + this.movieProcessor = new MovieProcessor_1_0(this); + this.taglineListProcessor = new TaglineListProcessor_1_0(this); + this.taglineProcessor = new TaglineProcessor_1_0(this); } @Override @@ -149,13 +155,13 @@ public class ApiFactory_1_0 implements ApiFactory { @Override public Task taskByDataType(DataType dataType) throws DataTypeNotSupportedException { if (dataType instanceof MovieDataType) { - return taskByMovieDataType((MovieDataType) dataType); + return movieProcessor.taskByMovieDataType((MovieDataType) dataType); } else if (dataType instanceof MovieListDataType) { - return taskByMovieListDataType((MovieListDataType) dataType); + return movieListProcessor.taskByMovieListDataType((MovieListDataType) dataType); } else if (dataType 
instanceof TaglineListDataType) { - return taskByTaglineListDataType((TaglineListDataType) dataType); + return taglineListProcessor.taskByTaglineListDataType((TaglineListDataType) dataType); } else if (dataType instanceof TaglineDataType) { - return taskByTaglineDataType((TaglineDataType) dataType); + return taglineProcessor.taskByTaglineDataType((TaglineDataType) dataType); } else { throw new DataTypeNotSupportedException(String.format("DataType %s is not supported by API v1_0!", dataType.getClass().getName())); } @@ -165,387 +171,24 @@ public class ApiFactory_1_0 implements ApiFactory { public void fillUpImdbObject(ImdbObject imdbObject, Task task) { if (imdbObject instanceof Movie) { if (task.getDataType() instanceof MovieDataType) { - fillUpMovie((Movie) imdbObject, task); + movieProcessor.fillUpMovie((Movie) imdbObject, task); } } else if (imdbObject instanceof MovieList) { if (task.getDataType() instanceof MovieListDataType) { - fillUpMovieList((MovieList) imdbObject, task); + movieListProcessor.fillUpMovieList((MovieList) imdbObject, task); } } else if (imdbObject instanceof TaglineList) { if (task.getDataType() instanceof TaglineListDataType) { - fillUpTaglineList((TaglineList) imdbObject, task); + taglineListProcessor.fillUpTaglineList((TaglineList) imdbObject, task); } } else if (imdbObject instanceof Tagline) { if (task.getDataType() instanceof TaglineDataType) { - fillUpTagline((Tagline) imdbObject, task); + taglineProcessor.fillUpTagline((Tagline) imdbObject, task); } } } - private Task taskByMovieDataType(MovieDataType movieDataType) { - Task t = new Task(); - t.setDataType(movieDataType); - switch (movieDataType) { - case ID: - t.setPostprocess((task, s) -> { - Matcher matcher = PATTERN_MOVIE_ID_FROM_MOVIELIST.matcher(task.getUrl()); - if (matcher.find()) { - task.setResultType(String.class); - task.setResult(matcher.group(1)); - } - }); - break; - case TITLE: - t.setCssSelector("#title-overview-widget > div.vital > div.title_block > div > 
div.titleBar > div.title_wrapper > h1"); - t.setPostprocess(POSTPROCESS.GET_WHOLE_TEXT_OF_FIRST_ELEMENT); - break; - case ORIGINAL_TITLE: - t.setCssSelector("#title-overview-widget > div.vital > div.title_block > div > div.titleBar > div.title_wrapper > div.originalTitle"); - t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_FIRST_ELEMENT); - break; - case YEAR: - t.setCssSelector("#titleYear > a"); - t.setPostprocess((task, s) -> { - task.setResultType(Integer.class); - if (task.getCssSelectorResult().size() > 0) { - try { - task.setResult(Integer.parseInt(task.getCssSelectorResult().first().text().trim())); - } catch (NumberFormatException e) { - task.setResult(-1); - } - } else { - task.setResult(-1); - } - }); - break; - case POSTER: - t.setCssSelector("#title-overview-widget > div.vital > div.slate_wrapper > div.poster > a > img"); - t.setPostprocess((task, s) -> { - task.setResultType(String.class); - if (task.getCssSelectorResult().size() > 0) { - task.setResult(task.getCssSelectorResult().first().attr("src")); - } else { - task.setResult(""); - } - }); - break; - case STORYLINE: - t.setCssSelector("#titleStoryLine > div:nth-child(3) > p > span"); - t.setPostprocess(POSTPROCESS.GET_TEXT_OF_FIRST_ELEMENT); - break; - case RANDOM_TAGLINE: - t.setCssSelector("#titleStoryLine > div > h4:contains(Taglines)"); //#titleStoryLine > div:nth-child(8) > h4 - t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); - break; - case GENRES: - t.setCssSelector("#titleStoryLine > div > h4:contains(Genres)"); - t.setPostprocess(POSTPROCESS.COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE); - break; - case CERTIFICATE: - t.setCssSelector("#titleStoryLine > div > h4:contains(Certificate)"); - t.setPostprocess((task, s) -> { - task.setResultType(String.class); - if (task.getCssSelectorResult().size() > 0) { - task.setResult(task.getCssSelectorResult().first().parent().select("span:nth-child(2)").first().text().trim()); - } - }); - break; - case OFFICIAL_SITES: - 
t.setCssSelector("#titleDetails > div > h4:contains(Official Sites)"); - t.setPostprocess(POSTPROCESS.COLLECT_ALL_NESTED_LINKS_OF_PARENT_NODE); - break; - case COUNTRIES: - t.setCssSelector("#titleDetails > div > h4:contains(Country)"); - t.setPostprocess(POSTPROCESS.COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE); - break; - case LANGUAGES: - t.setCssSelector("#titleDetails > div > h4:contains(Language)"); - t.setPostprocess(POSTPROCESS.COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE); - break; - case RELEASE_DATE: - t.setCssSelector("#titleDetails > div > h4:contains(Release Date)"); - t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); - break; - case BUDGET: - t.setCssSelector("#titleDetails > div > h4:contains(Budget)"); - t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); - break; - case CUMULATIVE_WORLDWIDE_GROSS: - t.setCssSelector("#titleDetails > div > h4:contains(Cumulative Worldwide Gross)"); - t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); - break; - case RUNTIME: - t.setCssSelector("#titleDetails > div > h4:contains(Runtime)"); - t.setPostprocess((task, s) -> { - task.setResultType(String.class); - task.setResult(""); - if (task.getCssSelectorResult().size() > 0) { - task.setResult(task.getCssSelectorResult().first().parent().text().replace("Runtime:", "").trim()); - } - }); - break; - case SOUND_MIXES: - t.setCssSelector("#titleDetails > div > h4:contains(Sound Mix)"); - t.setPostprocess((task, s) -> { - task.setResultType(List.class); - List titles = new ArrayList<>(); - if (task.getCssSelectorResult().size() > 0) { - String html = task.getCssSelectorResult().first().parent().html(); - html = html.replace("\r", ""); - html = html.replace("\n", ""); - html = html.replace("|", "|"); - // remove header:

Sound Mix:

- html = html.replaceAll("()", ""); - - String[] lines = html.split("\\|"); - - for (int i=0; i%s", lines[i]), "div"); - if (els.size() > 0) { - Element div = els.first(); - Element link = div.selectFirst("a"); - titles.add(new SoundMix() - .setName(link.text().trim()) - .setDescription(div.ownText()) - ); - } - } catch (HtmlProcessorException e) { - e.printStackTrace(); - } - } - } - task.setResult(titles); - }); - break; - case COLOR: - t.setCssSelector("#titleDetails > div > h4:contains(Color)"); - t.setPostprocess(POSTPROCESS.GET_TITLE_OF_FIRST_LINK_IN_PARENT_MODE); - break; - case ASPECT_RATIO: - t.setCssSelector("#titleDetails > div > h4:contains(Aspect Ratio)"); - t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); - break; - case TAGLINES: - t.setCssSelector("#titleStoryLine > div > h4:contains(Taglines)"); - t.setPostprocess((task, s) -> { - if (task.getCssSelectorResult().size() > 0) { - Elements links = task.getCssSelectorResult().first().parent().select("span > a:contains(See more)"); - if (links.size() > 0) { - Task newTask = this.taskByTaglineListDataType(TaglineListDataType.ELEMENTS) - .setParentTask(task) - .setUrl(String.format("%s%s", URL_MAIN, links.first().attr("href"))); - task.getNestedTasks().add(newTask); - } - } - }); - break; - } - return t; - } - - private Task taskByTaglineListDataType(TaglineListDataType taglineListDataType) { - Task t = new Task(); - t.setDataType(taglineListDataType); - switch (taglineListDataType) { - case ELEMENTS: - t.setCssSelector("#taglines_content > div.soda"); - AtomicInteger i = new AtomicInteger(0); - t.setPostprocess((task, s) -> { - for (Element element : task.getCssSelectorResult()) { - Task newTaskId = taskByTaglineDataType(TaglineDataType.ID) - .setParentTask(task) - .setUrl(task.getUrl()) - .setResult(String.format("%d", i.getAndAdd(1))); - task.getNestedTasks().add(newTaskId); - - Task newTaskText = taskByTaglineDataType(TaglineDataType.TEXT) - .setParentTask(task) - .setUrl(task.getUrl()) - 
.setResult(element.text()); - newTaskId.getNestedTasks().add(newTaskText); - } - }); - break; - } - return t; - } - - private Task taskByTaglineDataType(TaglineDataType taglineDataType) { - Task t = new Task(); - t.setDataType(taglineDataType); - switch (taglineDataType) { - case ID: - // - break; - case TEXT: - t.setPostprocess((task, s) -> { - task.setResult(((String)task.getResult()).trim()); - }); - break; - } - return t; - } - - private Task taskByMovieListDataType(MovieListDataType movieListDataType) { - Task t = new Task(); - t.setDataType(movieListDataType); - switch (movieListDataType) { - case ELEMENTS: - t.setCssSelector("#main > div > div.findSection > table > tbody > tr > td.result_text"); - t.setResultType(List.class); - t.setPostprocess((task, s) -> { - int count = 0; - for (Element element : task.getCssSelectorResult()) { - count++; - if (task.getRestrictionByCount() != null) { - if (count > task.getRestrictionByCount()) { - break; - } - } - Element link = element.select("a").first(); - if (movieDataTypeSet == null) { - movieDataTypeSet = defaultMovieDataTypeSet; - } - if (!movieDataTypeSet.contains(MovieDataType.ID)) { - movieDataTypeSet.add(MovieDataType.ID); - } - Task movieTask = this.taskByMovieDataType(MovieDataType.ID) - .setParentTask(task) - .setUrl(String.format("%s%s", URL_MAIN, link.attr("href"))); - task.getNestedTasks().add(movieTask); - movieDataTypeSet.forEach(movieDataType -> movieTask.getNestedTasks().add(this.taskByMovieDataType(movieDataType) - .setParentTask(movieTask) - .setUrl(String.format("%s%s", URL_MAIN, link.attr("href"))))); - } - }); - break; - } - return t; - } - - private void fillUpMovie(Movie movie, Task task) { - boolean isDone = false; - switch ((MovieDataType) task.getDataType()) { - case ID: - movie.setUrl(task.getUrl()); - movie.setId((String) task.getResult()); - isDone = true; - break; - case TITLE: - movie.setTitle((String) task.getResult()); - isDone = true; - break; - case ORIGINAL_TITLE: - 
movie.setOriginalTitle((String) task.getResult()); - isDone = true; - break; - case YEAR: - movie.setYear((Integer) task.getResult()); - isDone = true; - break; - case POSTER: - movie.setPosterLink((String) task.getResult()); - isDone = true; - break; - case STORYLINE: - movie.setStoryline((String) task.getResult()); - isDone = true; - break; - case RANDOM_TAGLINE: - movie.setRandomTagline((String) task.getResult()); - isDone = true; - break; - case GENRES: - movie.setGenres((List) task.getResult()); - isDone = true; - break; - case CERTIFICATE: - movie.setCertificate((String) task.getResult()); - isDone = true; - break; - case OFFICIAL_SITES: - movie.setOfficialSites((List) task.getResult()); - isDone = true; - break; - case COUNTRIES: - movie.setCountries((List) task.getResult()); - isDone = true; - break; - case LANGUAGES: - movie.setLanguages((List) task.getResult()); - isDone = true; - break; - case RELEASE_DATE: - movie.setReleaseDate((String) task.getResult()); - isDone = true; - break; - case BUDGET: - movie.setBudget((String) task.getResult()); - isDone = true; - break; - case CUMULATIVE_WORLDWIDE_GROSS: - movie.setCumulativeWorldwideGross((String) task.getResult()); - isDone = true; - break; - case RUNTIME: - movie.setRuntime((String) task.getResult()); - isDone = true; - break; - case SOUND_MIXES: - movie.setSoundMixes((List) task.getResult()); - isDone = true; - break; - case COLOR: - movie.setColor((String) task.getResult()); - isDone = true; - break; - case ASPECT_RATIO: - movie.setAspectRatio((String) task.getResult()); - isDone = true; - break; - case TAGLINES: - isDone = true; - } - - if (isDone) { - movie.getRetrievedDataTypes().add((MovieDataType) task.getDataType()); - } - } - - private void fillUpTaglineList(TaglineList taglineList, Task task) { - switch ((TaglineListDataType) task.getDataType()) { - case ELEMENTS: - taglineList.setUrl(task.getUrl()); - taglineList.getRetrievedDataTypes().add((TaglineListDataType) task.getDataType()); - break; 
- } - } - - private void fillUpTagline(Tagline tagline, Task task) { - switch ((TaglineDataType) task.getDataType()) { - case ID: - tagline.setUrl(task.getUrl()); - tagline.setId((String) task.getResult()); - tagline.getRetrievedDataTypes().add((TaglineDataType) task.getDataType()); - break; - case TEXT: - tagline.setUrl(task.getUrl()); - tagline.setText((String) task.getResult()); - tagline.getRetrievedDataTypes().add((TaglineDataType) task.getDataType()); - break; - } - } - - private void fillUpMovieList(MovieList movieList, Task task) { - switch ((MovieListDataType) task.getDataType()) { - case ELEMENTS: - movieList.setUrl(task.getUrl()); - movieList.getRetrievedDataTypes().add((MovieListDataType) task.getDataType()); - break; - } - } @Override public EnumSet getDefaultMovieDataTypeSet() { @@ -561,4 +204,17 @@ public class ApiFactory_1_0 implements ApiFactory { public EnumSet getMovieDataTypeSet() { return movieDataTypeSet; } + + + HtmlProcessor getHtmlProcessor() { + return htmlProcessor; + } + + MovieProcessor_1_0 getMovieProcessor() { + return movieProcessor; + } + + TaglineProcessor_1_0 getTaglineProcessor() { + return taglineProcessor; + } } diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/MovieListProcessor_1_0.java b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/MovieListProcessor_1_0.java new file mode 100644 index 0000000..bd2a793 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/MovieListProcessor_1_0.java @@ -0,0 +1,69 @@ +package ru.bvn13.imdbspider.spider.api.v1_0; + +import org.jsoup.nodes.Element; +import ru.bvn13.imdbspider.imdb.MovieDataType; +import ru.bvn13.imdbspider.imdb.MovieList; +import ru.bvn13.imdbspider.imdb.MovieListDataType; +import ru.bvn13.imdbspider.spider.tasker.Task; + +import java.util.List; + +/** + * Processor for {@link MovieListDataType}: builds the movie-list task and fills {@link MovieList} objects from finished tasks. + * + * @author boyko_vn at 15.01.2019 + */ +public class MovieListProcessor_1_0 extends AbstractApiProcessor_1_0 { + + public MovieListProcessor_1_0(ApiFactory_1_0 apiFactory) { + 
super(apiFactory); + } + /** + * Creates the task for the given movie-list data type. For ELEMENTS the post-process visits the selected result cells (stopping once getRestrictionByCount() is exceeded when it is non-null); per cell it adds an ID task to the list task and nests under that ID task one task per entry of the movie data type set held by the factory, all pointed at URL_MAIN plus the href of the first link in the cell. The set falls back to the default set when null, and ID is added to it if missing. + */ + Task taskByMovieListDataType(MovieListDataType movieListDataType) { + Task t = new Task(); + t.setDataType(movieListDataType); + switch (movieListDataType) { + case ELEMENTS: + t.setCssSelector("#main > div > div.findSection > table > tbody > tr > td.result_text"); + t.setResultType(List.class); + t.setPostprocess((task, s) -> { + int count = 0; + for (Element element : task.getCssSelectorResult()) { + count++; + if (task.getRestrictionByCount() != null) { + if (count > task.getRestrictionByCount()) { + break; + } + } + Element link = element.select("a").first(); + if (getApiFactory().getMovieDataTypeSet() == null) { + getApiFactory().setMovieDataTypeSet(getApiFactory().getDefaultMovieDataTypeSet()); + } + if (!getApiFactory().getMovieDataTypeSet().contains(MovieDataType.ID)) { + getApiFactory().getMovieDataTypeSet().add(MovieDataType.ID); + } + Task movieTask = getApiFactory().getMovieProcessor().taskByMovieDataType(MovieDataType.ID) + .setParentTask(task) + .setUrl(String.format("%s%s", ApiFactory_1_0.URL_MAIN, link.attr("href"))); + task.getNestedTasks().add(movieTask); + getApiFactory().getMovieDataTypeSet().forEach(movieDataType -> + movieTask.getNestedTasks().add(getApiFactory().getMovieProcessor().taskByMovieDataType(movieDataType) + .setParentTask(movieTask) + .setUrl(String.format("%s%s", ApiFactory_1_0.URL_MAIN, link.attr("href"))))); + } + }); + break; + } + return t; + } + /** + * Copies the task URL into the movie list and records the data type as retrieved; only ELEMENTS is handled. + */ + void fillUpMovieList(MovieList movieList, Task task) { + switch ((MovieListDataType) task.getDataType()) { + case ELEMENTS: + movieList.setUrl(task.getUrl()); + movieList.getRetrievedDataTypes().add((MovieListDataType) task.getDataType()); + break; + } + } + + + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/MovieProcessor_1_0.java b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/MovieProcessor_1_0.java new file mode 100644 index 0000000..ae39a11 --- /dev/null +++ 
b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/MovieProcessor_1_0.java @@ -0,0 +1,280 @@ +package ru.bvn13.imdbspider.spider.api.v1_0; + +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException; +import ru.bvn13.imdbspider.imdb.Movie; +import ru.bvn13.imdbspider.imdb.MovieDataType; +import ru.bvn13.imdbspider.imdb.TaglineListDataType; +import ru.bvn13.imdbspider.imdb.accessories.Link; +import ru.bvn13.imdbspider.imdb.accessories.SoundMix; +import ru.bvn13.imdbspider.spider.tasker.Task; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; + +/** + * Processor for {@link MovieDataType}: {@link #taskByMovieDataType} configures the selector and post-process for each movie field, and {@link #fillUpMovie} copies finished task results into a {@link Movie}. + * + * NOTE(review): the TAGLINES branch calls getApiFactory().taskByTaglineListDataType(...), but this change set deletes that private method from ApiFactory_1_0 and only adds getHtmlProcessor(), getMovieProcessor() and getTaglineProcessor() there; confirm the call still resolves (e.g. through an accessor for the tagline-list processor). + * + * @author boyko_vn at 15.01.2019 + */ +public class MovieProcessor_1_0 extends AbstractApiProcessor_1_0 { + + public MovieProcessor_1_0(ApiFactory_1_0 apiFactory) { + super(apiFactory); + } + /** + * Creates the task for one movie field: sets the CSS selector (none for ID, which is parsed from the task URL with PATTERN_MOVIE_ID_FROM_MOVIELIST) and the post-process that turns the selection into the task result. + */ + public Task taskByMovieDataType(MovieDataType movieDataType) { + Task t = new Task(); + t.setDataType(movieDataType); + switch (movieDataType) { + case ID: + t.setPostprocess((task, s) -> { + Matcher matcher = ApiFactory_1_0.PATTERN_MOVIE_ID_FROM_MOVIELIST.matcher(task.getUrl()); + if (matcher.find()) { + task.setResultType(String.class); + task.setResult(matcher.group(1)); + } + }); + break; + case TITLE: + t.setCssSelector("#title-overview-widget > div.vital > div.title_block > div > div.titleBar > div.title_wrapper > h1"); + t.setPostprocess(ApiFactory_1_0.POSTPROCESS.GET_WHOLE_TEXT_OF_FIRST_ELEMENT); + break; + case ORIGINAL_TITLE: + t.setCssSelector("#title-overview-widget > div.vital > div.title_block > div > div.titleBar > div.title_wrapper > div.originalTitle"); + t.setPostprocess(ApiFactory_1_0.POSTPROCESS.GET_OWN_TEXT_OF_FIRST_ELEMENT); + break; + case YEAR: + t.setCssSelector("#titleYear > a"); + t.setPostprocess((task, s) -> { + task.setResultType(Integer.class); + if (task.getCssSelectorResult().size() > 0) { + try { + 
task.setResult(Integer.parseInt(task.getCssSelectorResult().first().text().trim())); + } catch (NumberFormatException e) { + task.setResult(-1); + } + } else { + task.setResult(-1); + } + }); + break; + case POSTER: + t.setCssSelector("#title-overview-widget > div.vital > div.slate_wrapper > div.poster > a > img"); + t.setPostprocess((task, s) -> { + task.setResultType(String.class); + if (task.getCssSelectorResult().size() > 0) { + task.setResult(task.getCssSelectorResult().first().attr("src")); + } else { + task.setResult(""); + } + }); + break; + case STORYLINE: + t.setCssSelector("#titleStoryLine > div:nth-child(3) > p > span"); + t.setPostprocess(ApiFactory_1_0.POSTPROCESS.GET_TEXT_OF_FIRST_ELEMENT); + break; + case RANDOM_TAGLINE: + t.setCssSelector("#titleStoryLine > div > h4:contains(Taglines)"); //#titleStoryLine > div:nth-child(8) > h4 + t.setPostprocess(ApiFactory_1_0.POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); + break; + case GENRES: + t.setCssSelector("#titleStoryLine > div > h4:contains(Genres)"); + t.setPostprocess(ApiFactory_1_0.POSTPROCESS.COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE); + break; + case CERTIFICATE: + t.setCssSelector("#titleStoryLine > div > h4:contains(Certificate)"); + t.setPostprocess((task, s) -> { + task.setResultType(String.class); + if (task.getCssSelectorResult().size() > 0) { + task.setResult(task.getCssSelectorResult().first().parent().select("span:nth-child(2)").first().text().trim()); + } + }); + break; + case OFFICIAL_SITES: + t.setCssSelector("#titleDetails > div > h4:contains(Official Sites)"); + t.setPostprocess(ApiFactory_1_0.POSTPROCESS.COLLECT_ALL_NESTED_LINKS_OF_PARENT_NODE); + break; + case COUNTRIES: + t.setCssSelector("#titleDetails > div > h4:contains(Country)"); + t.setPostprocess(ApiFactory_1_0.POSTPROCESS.COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE); + break; + case LANGUAGES: + t.setCssSelector("#titleDetails > div > h4:contains(Language)"); + 
t.setPostprocess(ApiFactory_1_0.POSTPROCESS.COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE); + break; + case RELEASE_DATE: + t.setCssSelector("#titleDetails > div > h4:contains(Release Date)"); + t.setPostprocess(ApiFactory_1_0.POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); + break; + case BUDGET: + t.setCssSelector("#titleDetails > div > h4:contains(Budget)"); + t.setPostprocess(ApiFactory_1_0.POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); + break; + case CUMULATIVE_WORLDWIDE_GROSS: + t.setCssSelector("#titleDetails > div > h4:contains(Cumulative Worldwide Gross)"); + t.setPostprocess(ApiFactory_1_0.POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); + break; + case RUNTIME: + t.setCssSelector("#titleDetails > div > h4:contains(Runtime)"); + t.setPostprocess((task, s) -> { + task.setResultType(String.class); + task.setResult(""); + if (task.getCssSelectorResult().size() > 0) { + task.setResult(task.getCssSelectorResult().first().parent().text().replace("Runtime:", "").trim()); + } + }); + break; + case SOUND_MIXES: + t.setCssSelector("#titleDetails > div > h4:contains(Sound Mix)"); + t.setPostprocess((task, s) -> { + task.setResultType(List.class); + List titles = new ArrayList<>(); + if (task.getCssSelectorResult().size() > 0) { + String html = task.getCssSelectorResult().first().parent().html(); + html = html.replace("\r", ""); + html = html.replace("\n", ""); + html = html.replace("|", "|"); + + // remove header:

Sound Mix:

+ html = html.replaceAll("()", ""); + + String[] lines = html.split("\\|"); + + for (int i=0; i%s", lines[i]), "div"); + if (els.size() > 0) { + Element div = els.first(); + Element link = div.selectFirst("a"); + titles.add(new SoundMix() + .setName(link.text().trim()) + .setDescription(div.ownText()) + ); + } + } catch (HtmlProcessorException e) { + e.printStackTrace(); + } + } + } + task.setResult(titles); + }); + break; + case COLOR: + t.setCssSelector("#titleDetails > div > h4:contains(Color)"); + t.setPostprocess(ApiFactory_1_0.POSTPROCESS.GET_TITLE_OF_FIRST_LINK_IN_PARENT_MODE); + break; + case ASPECT_RATIO: + t.setCssSelector("#titleDetails > div > h4:contains(Aspect Ratio)"); + t.setPostprocess(ApiFactory_1_0.POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); + break; + case TAGLINES: + t.setCssSelector("#titleStoryLine > div > h4:contains(Taglines)"); + t.setPostprocess((task, s) -> { + if (task.getCssSelectorResult().size() > 0) { + Elements links = task.getCssSelectorResult().first().parent().select("span > a:contains(See more)"); + if (links.size() > 0) { + Task newTask = getApiFactory().taskByTaglineListDataType(TaglineListDataType.ELEMENTS) + .setParentTask(task) + .setUrl(String.format("%s%s", ApiFactory_1_0.URL_MAIN, links.first().attr("href"))); + task.getNestedTasks().add(newTask); + } + } + }); + break; + } + return t; + } + + + void fillUpMovie(Movie movie, Task task) { + boolean isDone = false; + switch ((MovieDataType) task.getDataType()) { + case ID: + movie.setUrl(task.getUrl()); + movie.setId((String) task.getResult()); + isDone = true; + break; + case TITLE: + movie.setTitle((String) task.getResult()); + isDone = true; + break; + case ORIGINAL_TITLE: + movie.setOriginalTitle((String) task.getResult()); + isDone = true; + break; + case YEAR: + movie.setYear((Integer) task.getResult()); + isDone = true; + break; + case POSTER: + movie.setPosterLink((String) task.getResult()); + isDone = true; + break; + case STORYLINE: + movie.setStoryline((String) 
task.getResult()); + isDone = true; + break; + case RANDOM_TAGLINE: + movie.setRandomTagline((String) task.getResult()); + isDone = true; + break; + case GENRES: + movie.setGenres((List) task.getResult()); + isDone = true; + break; + case CERTIFICATE: + movie.setCertificate((String) task.getResult()); + isDone = true; + break; + case OFFICIAL_SITES: + movie.setOfficialSites((List) task.getResult()); + isDone = true; + break; + case COUNTRIES: + movie.setCountries((List) task.getResult()); + isDone = true; + break; + case LANGUAGES: + movie.setLanguages((List) task.getResult()); + isDone = true; + break; + case RELEASE_DATE: + movie.setReleaseDate((String) task.getResult()); + isDone = true; + break; + case BUDGET: + movie.setBudget((String) task.getResult()); + isDone = true; + break; + case CUMULATIVE_WORLDWIDE_GROSS: + movie.setCumulativeWorldwideGross((String) task.getResult()); + isDone = true; + break; + case RUNTIME: + movie.setRuntime((String) task.getResult()); + isDone = true; + break; + case SOUND_MIXES: + movie.setSoundMixes((List) task.getResult()); + isDone = true; + break; + case COLOR: + movie.setColor((String) task.getResult()); + isDone = true; + break; + case ASPECT_RATIO: + movie.setAspectRatio((String) task.getResult()); + isDone = true; + break; + case TAGLINES: + isDone = true; + } + + if (isDone) { + movie.getRetrievedDataTypes().add((MovieDataType) task.getDataType()); + } + } + + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/TaglineListProcessor_1_0.java b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/TaglineListProcessor_1_0.java new file mode 100644 index 0000000..048046e --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/TaglineListProcessor_1_0.java @@ -0,0 +1,57 @@ +package ru.bvn13.imdbspider.spider.api.v1_0; + +import org.jsoup.nodes.Element; +import ru.bvn13.imdbspider.imdb.TaglineDataType; +import ru.bvn13.imdbspider.imdb.TaglineList; +import 
ru.bvn13.imdbspider.imdb.TaglineListDataType; +import ru.bvn13.imdbspider.spider.tasker.Task; + +import java.util.concurrent.atomic.AtomicInteger; + +/** + * Processor for {@link TaglineListDataType}: builds the tagline-list task and fills {@link TaglineList} objects from finished tasks. + * + * @author boyko_vn at 15.01.2019 + */ +public class TaglineListProcessor_1_0 extends AbstractApiProcessor_1_0 { + + public TaglineListProcessor_1_0(ApiFactory_1_0 apiFactory) { + super(apiFactory); + } + /** + * Creates the task for the given tagline-list data type. For ELEMENTS the post-process walks every selected element, adds to the list task an ID task whose result is a running counter (created once, when this task is built) rendered as a decimal string, and nests under that ID task a TEXT task whose result is the element text; both reuse the URL of the list task. + * NOTE(review): the TEXT task is added to the nested tasks of the ID task but its parent is set to the list task; confirm that is intended. + */ + Task taskByTaglineListDataType(TaglineListDataType taglineListDataType) { + Task t = new Task(); + t.setDataType(taglineListDataType); + switch (taglineListDataType) { + case ELEMENTS: + t.setCssSelector("#taglines_content > div.soda"); + AtomicInteger i = new AtomicInteger(0); + t.setPostprocess((task, s) -> { + for (Element element : task.getCssSelectorResult()) { + Task newTaskId = getApiFactory().getTaglineProcessor().taskByTaglineDataType(TaglineDataType.ID) + .setParentTask(task) + .setUrl(task.getUrl()) + .setResult(String.format("%d", i.getAndAdd(1))); + task.getNestedTasks().add(newTaskId); + + Task newTaskText = getApiFactory().getTaglineProcessor().taskByTaglineDataType(TaglineDataType.TEXT) + .setParentTask(task) + .setUrl(task.getUrl()) + .setResult(element.text()); + newTaskId.getNestedTasks().add(newTaskText); + } + }); + break; + } + return t; + } + /** + * Copies the task URL into the tagline list and records the data type as retrieved; only ELEMENTS is handled. + */ + void fillUpTaglineList(TaglineList taglineList, Task task) { + switch ((TaglineListDataType) task.getDataType()) { + case ELEMENTS: + taglineList.setUrl(task.getUrl()); + taglineList.getRetrievedDataTypes().add((TaglineListDataType) task.getDataType()); + break; + } + } + + +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/TaglineProcessor_1_0.java b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/TaglineProcessor_1_0.java new file mode 100644 index 0000000..a971d69 --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/TaglineProcessor_1_0.java @@ -0,0 +1,49 @@ +package ru.bvn13.imdbspider.spider.api.v1_0; + +import ru.bvn13.imdbspider.imdb.Tagline; +import ru.bvn13.imdbspider.imdb.TaglineDataType; 
+import ru.bvn13.imdbspider.spider.tasker.Task; + +/** + * Processor for {@link TaglineDataType}: builds per-tagline tasks and fills {@link Tagline} objects from finished tasks. + * + * @author boyko_vn at 15.01.2019 + */ +public class TaglineProcessor_1_0 extends AbstractApiProcessor_1_0 { + + public TaglineProcessor_1_0(ApiFactory_1_0 apiFactory) { + super(apiFactory); + } + /** + * Creates the task for one tagline field. ID gets no selector and no post-process; TEXT gets a post-process that trims the String already stored as the task result, so the result is expected to be set by whoever creates the task (TaglineListProcessor_1_0 does this via setResult). + */ + Task taskByTaglineDataType(TaglineDataType taglineDataType) { + Task t = new Task(); + t.setDataType(taglineDataType); + switch (taglineDataType) { + case ID: + // + break; + case TEXT: + t.setPostprocess((task, s) -> { + task.setResult(((String)task.getResult()).trim()); + }); + break; + } + return t; + } + + /** + * Copies the task URL plus either the id (ID) or the text (TEXT) from the task result into the tagline and records the data type as retrieved. + */ + void fillUpTagline(Tagline tagline, Task task) { + switch ((TaglineDataType) task.getDataType()) { + case ID: + tagline.setUrl(task.getUrl()); + tagline.setId((String) task.getResult()); + tagline.getRetrievedDataTypes().add((TaglineDataType) task.getDataType()); + break; + case TEXT: + tagline.setUrl(task.getUrl()); + tagline.setText((String) task.getResult()); + tagline.getRetrievedDataTypes().add((TaglineDataType) task.getDataType()); + break; + } + } + + +}