From 0d0abda561585eb4bac2417711826db9d02d2831 Mon Sep 17 00:00:00 2001 From: Vyacheslav Boyko Date: Mon, 14 Jan 2019 18:18:42 +0300 Subject: [PATCH] implemented almost all simple movie params. test included. --- .../java/ru/bvn13/imdbspider/ImdbSpider.java | 24 +- .../java/ru/bvn13/imdbspider/imdb/Movie.java | 141 +++++++++ .../bvn13/imdbspider/imdb/MovieDataType.java | 16 + .../imdbspider/imdb/accessories/Link.java | 28 ++ .../imdbspider/imdb/accessories/SoundMix.java | 28 ++ .../imdbspider/spider/api/ApiFactory.java | 4 - .../spider/api/v1_0/ApiFactory_1_0.java | 277 +++++++++++++++++- .../spider/extractor/HtmlExtractor.java | 5 +- .../imdbspider/spider/tasker/Manager.java | 12 +- .../imdbspider/spider/tasker/Worker.java | 11 +- .../imdbspider/runner/MovieSearchTest.java | 109 ++++++- 11 files changed, 609 insertions(+), 46 deletions(-) create mode 100644 core/src/main/java/ru/bvn13/imdbspider/imdb/accessories/Link.java create mode 100644 core/src/main/java/ru/bvn13/imdbspider/imdb/accessories/SoundMix.java diff --git a/core/src/main/java/ru/bvn13/imdbspider/ImdbSpider.java b/core/src/main/java/ru/bvn13/imdbspider/ImdbSpider.java index 12672e3..3454ba0 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/ImdbSpider.java +++ b/core/src/main/java/ru/bvn13/imdbspider/ImdbSpider.java @@ -6,15 +6,14 @@ import ru.bvn13.imdbspider.imdb.MovieList; import ru.bvn13.imdbspider.spider.api.v1_0.ApiFactory_1_0; import ru.bvn13.imdbspider.spider.composer.ImdbObjectComposerFactory; import ru.bvn13.imdbspider.spider.composer.MovieListComposer; +import ru.bvn13.imdbspider.spider.processor.JsoupHtmlProcessor; import ru.bvn13.imdbspider.spider.tasker.Manager; import ru.bvn13.imdbspider.spider.tasker.Task; import ru.bvn13.imdbspider.spider.api.ApiFactory; import java.time.Duration; import java.time.LocalDateTime; -import java.util.EnumSet; -import java.util.List; -import java.util.concurrent.ExecutionException; +import java.util.*; /** * @author boyko_vn at 09.01.2019 @@ -27,7 +26,7 
@@ public class ImdbSpider { private ImdbObjectComposerFactory imdbObjectComposerFactory; public static ImdbSpider withApi_1_0() { - ApiFactory apiFactory = new ApiFactory_1_0(); + ApiFactory apiFactory = new ApiFactory_1_0(new JsoupHtmlProcessor()); return new ImdbSpider(apiFactory, new ImdbObjectComposerFactory(apiFactory)); } @@ -39,6 +38,11 @@ public class ImdbSpider { manager = new Manager(); } + public ImdbSpider addHttpRequestHeader(String key, String value) { + manager.addHttpRequestHeader(key, value); + return this; + } + public MovieList searchMovieByTitle(String title) throws ImdbSpiderException { return searchMovieByTitle(title, 0); } @@ -47,16 +51,16 @@ public class ImdbSpider { return searchMovieByTitle(title, maxCount, EnumSet.of(MovieDataType.TITLE)); } + public MovieList searchMovieByTitle(String title, int maxCount, MovieDataType... dataTypes) throws ImdbSpiderException { + return searchMovieByTitle(title, maxCount, EnumSet.copyOf(Arrays.asList(dataTypes))); + } + public MovieList searchMovieByTitle(String title, int maxCount, EnumSet dataTypes) throws ImdbSpiderException { List tasks = apiFactory.createTasksForSearchMovieByTitle(title, maxCount, dataTypes); LocalDateTime dateStart = LocalDateTime.now(); - try { - manager.processTasks(tasks); - } catch (ExecutionException | InterruptedException e) { - throw new ImdbSpiderException("Error has been occurred!", e); - } + manager.processTasks(tasks); LocalDateTime dateEnd = LocalDateTime.now(); Duration diff = Duration.between(dateStart, dateEnd); System.out.println("TIME SPENT: "+(diff.toMillis())+" msec"); @@ -69,4 +73,6 @@ public class ImdbSpider { } + + } diff --git a/core/src/main/java/ru/bvn13/imdbspider/imdb/Movie.java b/core/src/main/java/ru/bvn13/imdbspider/imdb/Movie.java index c39ba34..06e0643 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/imdb/Movie.java +++ b/core/src/main/java/ru/bvn13/imdbspider/imdb/Movie.java @@ -1,6 +1,10 @@ package ru.bvn13.imdbspider.imdb; +import 
ru.bvn13.imdbspider.imdb.accessories.Link; +import ru.bvn13.imdbspider.imdb.accessories.SoundMix; + import java.util.EnumSet; +import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; @@ -12,6 +16,23 @@ public class Movie extends ImdbObject { private String title; private String originalTitle; private Integer year; + private String posterLink; + private String storyline; + private String randomTagline; + private List genres; + private String certificate; + private List officialSites; + private List countries; + private List languages; + private String releaseDate; + private String budget; + private String cumulativeWorldwideGross; + private String runtime; + private List soundMixes; + private String color; + private String aspectRatio; + + //private List taglines; private Map akas = new ConcurrentHashMap<>(50); @Override @@ -43,6 +64,126 @@ public class Movie extends ImdbObject { this.year = year; } + public String getPosterLink() { + return posterLink; + } + + public void setPosterLink(String posterLink) { + this.posterLink = posterLink; + } + + public String getStoryline() { + return storyline; + } + + public void setStoryline(String storyline) { + this.storyline = storyline; + } + + public String getRandomTagline() { + return randomTagline; + } + + public void setRandomTagline(String randomTagline) { + this.randomTagline = randomTagline; + } + + public List getGenres() { + return genres; + } + + public void setGenres(List genres) { + this.genres = genres; + } + + public String getCertificate() { + return certificate; + } + + public void setCertificate(String certificate) { + this.certificate = certificate; + } + + public List getOfficialSites() { + return officialSites; + } + + public void setOfficialSites(List officialSites) { + this.officialSites = officialSites; + } + + public List getCountries() { + return countries; + } + + public void setCountries(List countries) { + this.countries = countries; + } + + public List 
getLanguages() { + return languages; + } + + public void setLanguages(List languages) { + this.languages = languages; + } + + public String getReleaseDate() { + return releaseDate; + } + + public void setReleaseDate(String releaseDate) { + this.releaseDate = releaseDate; + } + + public String getBudget() { + return budget; + } + + public void setBudget(String budget) { + this.budget = budget; + } + + public String getCumulativeWorldwideGross() { + return cumulativeWorldwideGross; + } + + public void setCumulativeWorldwideGross(String cumulativeWorldwideGross) { + this.cumulativeWorldwideGross = cumulativeWorldwideGross; + } + + public String getRuntime() { + return runtime; + } + + public void setRuntime(String runtime) { + this.runtime = runtime; + } + + public List getSoundMixes() { + return soundMixes; + } + + public void setSoundMixes(List soundMixes) { + this.soundMixes = soundMixes; + } + + public String getColor() { + return color; + } + + public void setColor(String color) { + this.color = color; + } + + public String getAspectRatio() { + return aspectRatio; + } + + public void setAspectRatio(String aspectRatio) { + this.aspectRatio = aspectRatio; + } + public Map getAkas() { return akas; } diff --git a/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieDataType.java b/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieDataType.java index d055dd4..a733c93 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieDataType.java +++ b/core/src/main/java/ru/bvn13/imdbspider/imdb/MovieDataType.java @@ -11,6 +11,22 @@ public enum MovieDataType implements DataType { TITLE("title"), ORIGINAL_TITLE("original_title"), YEAR("year"), + POSTER("poster"), + STORYLINE("storyline"), + RANDOM_TAGLINE("random_tagline"), + GENRES("genres"), + CERTIFICATE("certificate"), + OFFICIAL_SITES("official_sites"), + COUNTRIES("countries"), + LANGUAGES("languages"), + RELEASE_DATE("release_date"), + BUDGET("budget"), + CUMULATIVE_WORLDWIDE_GROSS("cumulative worldwide gross"), + 
RUNTIME("runtime"), + SOUND_MIXES("sound_mixes"), + COLOR("color"), + ASPECT_RATIO("aspect_ratio"), + TAGLINES("taglines"), AKAS("akas") ; diff --git a/core/src/main/java/ru/bvn13/imdbspider/imdb/accessories/Link.java b/core/src/main/java/ru/bvn13/imdbspider/imdb/accessories/Link.java new file mode 100644 index 0000000..1fbe41b --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/imdb/accessories/Link.java @@ -0,0 +1,28 @@ +package ru.bvn13.imdbspider.imdb.accessories; + +/** + * @author boyko_vn at 14.01.2019 + */ +public class Link { + + private String url; + private String title; + + public String getUrl() { + return url; + } + + public Link setUrl(String url) { + this.url = url; + return this; + } + + public String getTitle() { + return title; + } + + public Link setTitle(String title) { + this.title = title; + return this; + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/imdb/accessories/SoundMix.java b/core/src/main/java/ru/bvn13/imdbspider/imdb/accessories/SoundMix.java new file mode 100644 index 0000000..8bae2cb --- /dev/null +++ b/core/src/main/java/ru/bvn13/imdbspider/imdb/accessories/SoundMix.java @@ -0,0 +1,28 @@ +package ru.bvn13.imdbspider.imdb.accessories; + +/** + * @author boyko_vn at 14.01.2019 + */ +public class SoundMix { + + private String name; + private String description; + + public String getName() { + return name; + } + + public SoundMix setName(String name) { + this.name = name; + return this; + } + + public String getDescription() { + return description; + } + + public SoundMix setDescription(String description) { + this.description = description; + return this; + } +} diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/api/ApiFactory.java b/core/src/main/java/ru/bvn13/imdbspider/spider/api/ApiFactory.java index 63e89c0..887931b 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/spider/api/ApiFactory.java +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/api/ApiFactory.java @@ -18,10 +18,6 @@ public interface 
ApiFactory { List createTasksForSearchMovieByTitle(String title, int maxCount, EnumSet dataTypes) throws ImdbSpiderException; - default List createTasksForSearchMovieByTitle(String title, int maxCount, MovieDataType... dataTypes) throws ImdbSpiderException { - return createTasksForSearchMovieByTitle(title, maxCount, EnumSet.copyOf(Arrays.asList(dataTypes))); - } - Task taskByDataType(DataType dataType) throws DataTypeNotSupportedException; void fillUpImdbObject(ImdbObject imdbObject, Task task); diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java index e335899..47c3b51 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/api/v1_0/ApiFactory_1_0.java @@ -1,10 +1,15 @@ package ru.bvn13.imdbspider.spider.api.v1_0; import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; import ru.bvn13.imdbspider.exceptions.api.DataTypeNotSupportedException; +import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException; import ru.bvn13.imdbspider.imdb.*; +import ru.bvn13.imdbspider.imdb.accessories.Link; +import ru.bvn13.imdbspider.imdb.accessories.SoundMix; import ru.bvn13.imdbspider.spider.api.ApiFactory; +import ru.bvn13.imdbspider.spider.processor.HtmlProcessor; import ru.bvn13.imdbspider.spider.tasker.Task; import java.net.URLEncoder; @@ -12,15 +17,18 @@ import java.nio.charset.Charset; import java.util.ArrayList; import java.util.EnumSet; import java.util.List; +import java.util.function.BiConsumer; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * @author boyko_vn at 09.01.2019 + * + * IMDB :: SPIDER :: API :: version 1.0 (started 09.01.2019) */ public class ApiFactory_1_0 implements ApiFactory { - private static final String URL_MAIN = "https://www.imdb.com"; + public static final 
String URL_MAIN = "https://www.imdb.com"; private static final String URL_SEARCH_TITLE = "https://www.imdb.com/find?ref_=nv_sr_fn&q={{title}}&s=tt"; @@ -29,6 +37,90 @@ public class ApiFactory_1_0 implements ApiFactory { private EnumSet defaultMovieDataTypeSet = EnumSet.of(MovieDataType.ID, MovieDataType.TITLE, MovieDataType.YEAR); private EnumSet movieDataTypeSet; + private HtmlProcessor htmlProcessor; + + private static class POSTPROCESS { + + static final BiConsumer GET_TEXT_OF_FIRST_ELEMENT = (task, s) -> { + task.setResultType(String.class); + if (task.getCssSelectorResult().size() > 0) { + task.setResult(task.getCssSelectorResult().first().text().trim()); + } else { + task.setResult(""); + } + }; + + static final BiConsumer GET_OWN_TEXT_OF_FIRST_ELEMENT = (task, s) -> { + task.setResultType(String.class); + if (task.getCssSelectorResult().size() > 0) { + task.setResult(task.getCssSelectorResult().first().ownText().trim()); + } else { + task.setResult(""); + } + }; + + static final BiConsumer GET_WHOLE_TEXT_OF_FIRST_ELEMENT = (task, s) -> { + task.setResultType(String.class); + if (task.getCssSelectorResult().size() > 0) { + task.setResult(task.getCssSelectorResult().first().wholeText().trim()); + } else { + task.setResult(""); + } + }; + + static final BiConsumer GET_OWN_TEXT_OF_PARENT_MODE = (task, s) -> { + task.setResultType(String.class); + if (task.getCssSelectorResult().size() > 0) { + task.setResult(task.getCssSelectorResult().first().parent().ownText().trim()); + } else { + task.setResult(""); + } + }; + + static final BiConsumer COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE = (task, s) -> { + task.setResultType(List.class); + List titles = new ArrayList<>(); + if (task.getCssSelectorResult().size() > 0) { + for (Element title : task.getCssSelectorResult().first().parent().select("a")) { + titles.add(title.text().trim()); + } + } + task.setResult(titles); + }; + + static final BiConsumer COLLECT_ALL_NESTED_LINKS_OF_PARENT_NODE = (task, s) -> { + 
task.setResultType(List.class); + List titles = new ArrayList<>(); + if (task.getCssSelectorResult().size() > 0) { + for (Element link : task.getCssSelectorResult().first().parent().select("a")) { + final String url = link.attr("href").trim(); + titles.add(new Link() + .setTitle(link.text().trim()) + .setUrl((url.startsWith("/") ? String.format("%s%s", URL_MAIN, url) : url)) + ); + } + } + task.setResult(titles); + }; + + static final BiConsumer GET_TITLE_OF_FIRST_LINK_IN_PARENT_MODE = (task, s) -> { + task.setResultType(String.class); + task.setResult(""); + if (task.getCssSelectorResult().size() > 0) { + Elements links = task.getCssSelectorResult().first().parent().select("a"); + if (links.size() > 0) { + task.setResult(links.first().text().trim()); + } + } + }; + + + } + + public ApiFactory_1_0(HtmlProcessor htmlProcessor) { + this.htmlProcessor = htmlProcessor; + } + @Override public List createTasksForSearchMovieByTitle(String title, int maxCount, EnumSet dataTypes) throws ImdbSpiderException { @@ -92,21 +184,11 @@ public class ApiFactory_1_0 implements ApiFactory { break; case TITLE: t.setCssSelector("#title-overview-widget > div.vital > div.title_block > div > div.titleBar > div.title_wrapper > h1"); - t.setPostprocess((task, s) -> { - task.setResultType(String.class); - task.setResult(task.getCssSelectorResult().first().wholeText().trim()); - }); + t.setPostprocess(POSTPROCESS.GET_WHOLE_TEXT_OF_FIRST_ELEMENT); break; case ORIGINAL_TITLE: t.setCssSelector("#title-overview-widget > div.vital > div.title_block > div > div.titleBar > div.title_wrapper > div.originalTitle"); - t.setPostprocess((task, s) -> { - task.setResultType(String.class); - if (task.getCssSelectorResult().size() > 0) { - task.setResult(task.getCssSelectorResult().first().ownText()); - } else { - task.setResult(""); - } - }); + t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_FIRST_ELEMENT); break; case YEAR: t.setCssSelector("#titleYear > a"); @@ -123,6 +205,115 @@ public class ApiFactory_1_0 
implements ApiFactory { } }); break; + case POSTER: + t.setCssSelector("#title-overview-widget > div.vital > div.slate_wrapper > div.poster > a > img"); + t.setPostprocess((task, s) -> { + task.setResultType(String.class); + if (task.getCssSelectorResult().size() > 0) { + task.setResult(task.getCssSelectorResult().first().attr("src")); + } else { + task.setResult(""); + } + }); + break; + case STORYLINE: + t.setCssSelector("#titleStoryLine > div:nth-child(3) > p > span"); + t.setPostprocess(POSTPROCESS.GET_TEXT_OF_FIRST_ELEMENT); + break; + case RANDOM_TAGLINE: + t.setCssSelector("#titleStoryLine > div > h4:contains(Taglines)"); //#titleStoryLine > div:nth-child(8) > h4 + t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); + break; + case GENRES: + t.setCssSelector("#titleStoryLine > div > h4:contains(Genres)"); + t.setPostprocess(POSTPROCESS.COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE); + break; + case CERTIFICATE: + t.setCssSelector("#titleStoryLine > div > h4:contains(Certificate)"); + t.setPostprocess((task, s) -> { + task.setResultType(String.class); + if (task.getCssSelectorResult().size() > 0) { + task.setResult(task.getCssSelectorResult().first().parent().select("span:nth-child(2)").first().text().trim()); + } + }); + break; + case OFFICIAL_SITES: + t.setCssSelector("#titleDetails > div > h4:contains(Official Sites)"); + t.setPostprocess(POSTPROCESS.COLLECT_ALL_NESTED_LINKS_OF_PARENT_NODE); + break; + case COUNTRIES: + t.setCssSelector("#titleDetails > div > h4:contains(Country)"); + t.setPostprocess(POSTPROCESS.COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE); + break; + case LANGUAGES: + t.setCssSelector("#titleDetails > div > h4:contains(Language)"); + t.setPostprocess(POSTPROCESS.COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE); + break; + case RELEASE_DATE: + t.setCssSelector("#titleDetails > div > h4:contains(Release Date)"); + t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); + break; + case BUDGET: + 
t.setCssSelector("#titleDetails > div > h4:contains(Budget)"); + t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); + break; + case CUMULATIVE_WORLDWIDE_GROSS: + t.setCssSelector("#titleDetails > div > h4:contains(Cumulative Worldwide Gross)"); + t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); + break; + case RUNTIME: + t.setCssSelector("#titleDetails > div > h4:contains(Runtime)"); + t.setPostprocess((task, s) -> { + task.setResultType(String.class); + task.setResult(""); + if (task.getCssSelectorResult().size() > 0) { + task.setResult(task.getCssSelectorResult().first().parent().text().replace("Runtime:", "").trim()); + } + }); + break; + case SOUND_MIXES: + t.setCssSelector("#titleDetails > div > h4:contains(Sound Mix)"); + t.setPostprocess((task, s) -> { + task.setResultType(List.class); + List titles = new ArrayList<>(); + if (task.getCssSelectorResult().size() > 0) { + String html = task.getCssSelectorResult().first().parent().html(); + html = html.replace("\r", ""); + html = html.replace("\n", ""); + html = html.replace("|", "|"); + + // remove header:

<h4>Sound Mix:</h4>

+ html = html.replaceAll("()", ""); + + String[] lines = html.split("\\|"); + + for (int i=0; i%s", lines[i]), "div"); + if (els.size() > 0) { + Element div = els.first(); + Element link = div.selectFirst("a"); + titles.add(new SoundMix() + .setName(link.text().trim()) + .setDescription(div.ownText()) + ); + } + } catch (HtmlProcessorException e) { + e.printStackTrace(); + } + } + } + task.setResult(titles); + }); + break; + case COLOR: + t.setCssSelector("#titleDetails > div > h4:contains(Color)"); + t.setPostprocess(POSTPROCESS.GET_TITLE_OF_FIRST_LINK_IN_PARENT_MODE); + break; + case ASPECT_RATIO: + t.setCssSelector("#titleDetails > div > h4:contains(Aspect Ratio)"); + t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE); + break; } return t; } @@ -184,6 +375,66 @@ public class ApiFactory_1_0 implements ApiFactory { movie.setYear((Integer) task.getResult()); isDone = true; break; + case POSTER: + movie.setPosterLink((String) task.getResult()); + isDone = true; + break; + case STORYLINE: + movie.setStoryline((String) task.getResult()); + isDone = true; + break; + case RANDOM_TAGLINE: + movie.setRandomTagline((String) task.getResult()); + isDone = true; + break; + case GENRES: + movie.setGenres((List) task.getResult()); + isDone = true; + break; + case CERTIFICATE: + movie.setCertificate((String) task.getResult()); + isDone = true; + break; + case OFFICIAL_SITES: + movie.setOfficialSites((List) task.getResult()); + isDone = true; + break; + case COUNTRIES: + movie.setCountries((List) task.getResult()); + isDone = true; + break; + case LANGUAGES: + movie.setLanguages((List) task.getResult()); + isDone = true; + break; + case RELEASE_DATE: + movie.setReleaseDate((String) task.getResult()); + isDone = true; + break; + case BUDGET: + movie.setBudget((String) task.getResult()); + isDone = true; + break; + case CUMULATIVE_WORLDWIDE_GROSS: + movie.setCumulativeWorldwideGross((String) task.getResult()); + isDone = true; + break; + case RUNTIME: + 
movie.setRuntime((String) task.getResult()); + isDone = true; + break; + case SOUND_MIXES: + movie.setSoundMixes((List) task.getResult()); + isDone = true; + break; + case COLOR: + movie.setColor((String) task.getResult()); + isDone = true; + break; + case ASPECT_RATIO: + movie.setAspectRatio((String) task.getResult()); + isDone = true; + break; } if (isDone) { diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/extractor/HtmlExtractor.java b/core/src/main/java/ru/bvn13/imdbspider/spider/extractor/HtmlExtractor.java index d0a42a5..488cca6 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/spider/extractor/HtmlExtractor.java +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/extractor/HtmlExtractor.java @@ -34,7 +34,7 @@ public class HtmlExtractor { : resultString; } - public String getHtml(String url) throws HtmlExtractorException { + public String getHtml(String url, Map headers) throws HtmlExtractorException { URL obj = null; @@ -52,6 +52,9 @@ public class HtmlExtractor { } connection.setRequestProperty("Accept", "text/html"); + for (Map.Entry header : headers.entrySet()) { + connection.setRequestProperty(header.getKey(), header.getValue()); + } try { connection.setRequestMethod("GET"); diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Manager.java b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Manager.java index a0d9960..97c7329 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Manager.java +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Manager.java @@ -3,6 +3,7 @@ package ru.bvn13.imdbspider.spider.tasker; import ru.bvn13.imdbspider.exceptions.extractor.HtmlExtractorException; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.*; @@ -12,14 +13,13 @@ import java.util.concurrent.*; */ public class Manager { - private ExecutorService executor; + private Map httpRequestHeaders = new HashMap<>(); - public Manager() { - 
this.executor = Executors.newCachedThreadPool(); + public void addHttpRequestHeader(String key, String value) { + this.httpRequestHeaders.put(key, value); } - - public void processTasks(List allTasks) throws ExecutionException, InterruptedException { + public void processTasks(List allTasks) { Map> groupedTasks = new ConcurrentHashMap<>(allTasks.size()); @@ -39,7 +39,7 @@ public class Manager { groupedTasks.entrySet().parallelStream().forEach(stringListEntry -> { Worker w = new Worker(stringListEntry.getKey(), stringListEntry.getValue()); try { - w.run(); + w.run(httpRequestHeaders); } catch (HtmlExtractorException e) { e.printStackTrace(); } diff --git a/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Worker.java b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Worker.java index fd7c3bb..fbdb32d 100644 --- a/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Worker.java +++ b/core/src/main/java/ru/bvn13/imdbspider/spider/tasker/Worker.java @@ -7,7 +7,9 @@ import ru.bvn13.imdbspider.spider.extractor.HtmlExtractor; import ru.bvn13.imdbspider.spider.processor.HtmlProcessor; import ru.bvn13.imdbspider.spider.processor.JsoupHtmlProcessor; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.concurrent.*; /** @@ -21,22 +23,17 @@ public class Worker { private final HtmlExtractor htmlExtractor; private final HtmlProcessor htmlProcessor; - private final ExecutorService executor; - public Worker(String url, List tasks) { this.url = url; this.tasks = tasks; this.htmlExtractor = new HtmlExtractor(); this.htmlProcessor = new JsoupHtmlProcessor(); - - this.executor = Executors.newCachedThreadPool(); } + public Boolean run(Map httpRequestHeaders) throws HtmlExtractorException { - public Boolean run() throws HtmlExtractorException { - - final String html = htmlExtractor.getHtml(url); + final String html = htmlExtractor.getHtml(url, httpRequestHeaders); tasks.parallelStream().forEach(task -> { diff --git 
a/core/src/test/java/ru/bvn13/imdbspider/runner/MovieSearchTest.java b/core/src/test/java/ru/bvn13/imdbspider/runner/MovieSearchTest.java index 9d633f4..0f70671 100644 --- a/core/src/test/java/ru/bvn13/imdbspider/runner/MovieSearchTest.java +++ b/core/src/test/java/ru/bvn13/imdbspider/runner/MovieSearchTest.java @@ -10,26 +10,123 @@ import ru.bvn13.imdbspider.exceptions.ImdbSpiderException; import ru.bvn13.imdbspider.imdb.Movie; import ru.bvn13.imdbspider.imdb.MovieDataType; import ru.bvn13.imdbspider.imdb.MovieList; - -import java.util.EnumSet; +import ru.bvn13.imdbspider.imdb.accessories.SoundMix; -public class MovieSearchTest -{ +public class MovieSearchTest { + + private static final String TERMINATOR_STORYLINE = "A cyborg is sent from the future on a deadly mission. He has to kill Sarah Connor, a young woman whose life will have a great significance in years to come. Sarah has only one protector - Kyle Reese - also sent from the future. The Terminator uses his exceptional intelligence and strength to find Sarah, but is there any way to stop the seemingly indestructible cyborg ?"; + private static final String TERMINATOR_POSTER_LINK = "https://m.media-amazon.com/images/M/MV5BYTViNzMxZjEtZGEwNy00MDNiLWIzNGQtZDY2MjQ1OWViZjFmXkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_UX182_CR0,0,182,268_AL_.jpg"; + private static ImdbSpider spider; @BeforeClass public static void initClass() { - spider = ImdbSpider.withApi_1_0(); + spider = ImdbSpider.withApi_1_0() + .addHttpRequestHeader("Content-Language", "ru-RU"); } @Test public void testSearchTerminator() throws ImdbSpiderException { - MovieList result = spider.searchMovieByTitle("Терминатор", 5, EnumSet.of(MovieDataType.ID, MovieDataType.TITLE, MovieDataType.ORIGINAL_TITLE, MovieDataType.YEAR)); + MovieList result = spider.searchMovieByTitle("Терминатор", 5, + MovieDataType.ID, + MovieDataType.TITLE, + MovieDataType.ORIGINAL_TITLE, + MovieDataType.YEAR, + MovieDataType.STORYLINE, + MovieDataType.RANDOM_TAGLINE, + MovieDataType.GENRES, 
+ MovieDataType.CERTIFICATE, + MovieDataType.OFFICIAL_SITES, + MovieDataType.COUNTRIES, + MovieDataType.LANGUAGES, + MovieDataType.RELEASE_DATE, + MovieDataType.BUDGET, + MovieDataType.CUMULATIVE_WORLDWIDE_GROSS, + MovieDataType.RUNTIME, + MovieDataType.SOUND_MIXES, + MovieDataType.COLOR, + MovieDataType.ASPECT_RATIO, + MovieDataType.POSTER + ); + + assertTrue(result.getMovies().size() > 0); Movie movie = result.getMovies().get(0); + + assertTrue(movie.isDataTypeRetrieved(MovieDataType.ID)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.TITLE)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.ORIGINAL_TITLE)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.YEAR)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.STORYLINE)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.RANDOM_TAGLINE)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.GENRES)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.CERTIFICATE)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.OFFICIAL_SITES)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.COUNTRIES)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.LANGUAGES)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.RELEASE_DATE)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.BUDGET)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.CUMULATIVE_WORLDWIDE_GROSS)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.RUNTIME)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.SOUND_MIXES)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.COLOR)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.ASPECT_RATIO)); + assertTrue(movie.isDataTypeRetrieved(MovieDataType.POSTER)); + assertEquals("0088247", movie.getId()); assertEquals("The Terminator", movie.getOriginalTitle()); assertEquals(Integer.valueOf(1984), movie.getYear()); + + assertEquals(TERMINATOR_STORYLINE, movie.getStoryline()); + //assertEquals(TERMINATOR_TAGLINES, movie.getRandomTagline()); + 
assertTrue(movie.getGenres().contains("Action")); + assertTrue(movie.getGenres().contains("Sci-Fi")); + + assertEquals("16+", movie.getCertificate()); + + //assertTrue(movie.getOfficialSites().contains("Facebook")); + assertTrue(movie.getOfficialSites().size() > 0); + assertEquals("Facebook", movie.getOfficialSites().get(0).getTitle()); + //assertEquals(TERMINATOR_FACEBOOK_URL, movie.getOfficialSites().get(0).getUrl()); //not comparable, dynamic link + + assertTrue(movie.getCountries().size() > 0); + assertTrue(movie.getCountries().contains("UK")); + assertTrue(movie.getCountries().contains("USA")); + + assertTrue(movie.getLanguages().size() > 0); + assertTrue(movie.getLanguages().contains("English")); + assertTrue(movie.getLanguages().contains("Spanish")); + + assertEquals("26 October 1984 (USA)", movie.getReleaseDate()); + + assertEquals("$6,400,000", movie.getBudget()); + assertEquals("$40,000,000", movie.getCumulativeWorldwideGross()); + + assertEquals("107 min", movie.getRuntime()); + + //sound mixes + assertTrue(movie.getSoundMixes().size() > 0); + boolean hasMono=false, hasDolby=false, hasDTS=false; + String descrMono="", descrDolby="", descrDTS=""; + for (SoundMix soundMix : movie.getSoundMixes()) { + switch (soundMix.getName()) { + case "Mono" : hasMono = true; descrMono = soundMix.getDescription(); break; + case "Dolby" : hasDolby = true; descrDolby = soundMix.getDescription(); break; + case "DTS" : hasDTS = true; descrDTS = soundMix.getDescription(); break; + } + } + assertTrue(hasMono); + assertTrue(hasDolby); + assertTrue(hasDTS); + assertEquals("(original release)", descrMono); + assertEquals("(DVD Re-Release)", descrDolby); + assertEquals("(DTS HD Master Audio)", descrDTS); + + assertEquals("Color", movie.getColor()); + + assertEquals("1.85 : 1", movie.getAspectRatio()); + + assertEquals(TERMINATOR_POSTER_LINK, movie.getPosterLink()); } }