mirror of https://github.com/bvn13/imdb-spider.git
Implemented almost all simple movie parameters. Test included.
parent
9898b9113d
commit
0d0abda561
|
@ -6,15 +6,14 @@ import ru.bvn13.imdbspider.imdb.MovieList;
|
|||
import ru.bvn13.imdbspider.spider.api.v1_0.ApiFactory_1_0;
|
||||
import ru.bvn13.imdbspider.spider.composer.ImdbObjectComposerFactory;
|
||||
import ru.bvn13.imdbspider.spider.composer.MovieListComposer;
|
||||
import ru.bvn13.imdbspider.spider.processor.JsoupHtmlProcessor;
|
||||
import ru.bvn13.imdbspider.spider.tasker.Manager;
|
||||
import ru.bvn13.imdbspider.spider.tasker.Task;
|
||||
import ru.bvn13.imdbspider.spider.api.ApiFactory;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 09.01.2019
|
||||
|
@ -27,7 +26,7 @@ public class ImdbSpider {
|
|||
private ImdbObjectComposerFactory imdbObjectComposerFactory;
|
||||
|
||||
public static ImdbSpider withApi_1_0() {
|
||||
ApiFactory apiFactory = new ApiFactory_1_0();
|
||||
ApiFactory apiFactory = new ApiFactory_1_0(new JsoupHtmlProcessor());
|
||||
return new ImdbSpider(apiFactory, new ImdbObjectComposerFactory(apiFactory));
|
||||
}
|
||||
|
||||
|
@ -39,6 +38,11 @@ public class ImdbSpider {
|
|||
manager = new Manager();
|
||||
}
|
||||
|
||||
public ImdbSpider addHttpRequestHeader(String key, String value) {
|
||||
manager.addHttpRequestHeader(key, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
public MovieList searchMovieByTitle(String title) throws ImdbSpiderException {
|
||||
return searchMovieByTitle(title, 0);
|
||||
}
|
||||
|
@ -47,16 +51,16 @@ public class ImdbSpider {
|
|||
return searchMovieByTitle(title, maxCount, EnumSet.of(MovieDataType.TITLE));
|
||||
}
|
||||
|
||||
public MovieList searchMovieByTitle(String title, int maxCount, MovieDataType... dataTypes) throws ImdbSpiderException {
|
||||
return searchMovieByTitle(title, maxCount, EnumSet.copyOf(Arrays.asList(dataTypes)));
|
||||
}
|
||||
|
||||
public MovieList searchMovieByTitle(String title, int maxCount, EnumSet<MovieDataType> dataTypes) throws ImdbSpiderException {
|
||||
|
||||
List<Task> tasks = apiFactory.createTasksForSearchMovieByTitle(title, maxCount, dataTypes);
|
||||
|
||||
LocalDateTime dateStart = LocalDateTime.now();
|
||||
try {
|
||||
manager.processTasks(tasks);
|
||||
} catch (ExecutionException | InterruptedException e) {
|
||||
throw new ImdbSpiderException("Error has been occurred!", e);
|
||||
}
|
||||
manager.processTasks(tasks);
|
||||
LocalDateTime dateEnd = LocalDateTime.now();
|
||||
Duration diff = Duration.between(dateStart, dateEnd);
|
||||
System.out.println("TIME SPENT: "+(diff.toMillis())+" msec");
|
||||
|
@ -69,4 +73,6 @@ public class ImdbSpider {
|
|||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
package ru.bvn13.imdbspider.imdb;
|
||||
|
||||
import ru.bvn13.imdbspider.imdb.accessories.Link;
|
||||
import ru.bvn13.imdbspider.imdb.accessories.SoundMix;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
|
@ -12,6 +16,23 @@ public class Movie extends ImdbObject<MovieDataType> {
|
|||
private String title;
|
||||
private String originalTitle;
|
||||
private Integer year;
|
||||
private String posterLink;
|
||||
private String storyline;
|
||||
private String randomTagline;
|
||||
private List<String> genres;
|
||||
private String certificate;
|
||||
private List<Link> officialSites;
|
||||
private List<String> countries;
|
||||
private List<String> languages;
|
||||
private String releaseDate;
|
||||
private String budget;
|
||||
private String cumulativeWorldwideGross;
|
||||
private String runtime;
|
||||
private List<SoundMix> soundMixes;
|
||||
private String color;
|
||||
private String aspectRatio;
|
||||
|
||||
//private List<String> taglines;
|
||||
private Map<String, String> akas = new ConcurrentHashMap<>(50);
|
||||
|
||||
@Override
|
||||
|
@ -43,6 +64,126 @@ public class Movie extends ImdbObject<MovieDataType> {
|
|||
this.year = year;
|
||||
}
|
||||
|
||||
public String getPosterLink() {
|
||||
return posterLink;
|
||||
}
|
||||
|
||||
public void setPosterLink(String posterLink) {
|
||||
this.posterLink = posterLink;
|
||||
}
|
||||
|
||||
public String getStoryline() {
|
||||
return storyline;
|
||||
}
|
||||
|
||||
public void setStoryline(String storyline) {
|
||||
this.storyline = storyline;
|
||||
}
|
||||
|
||||
public String getRandomTagline() {
|
||||
return randomTagline;
|
||||
}
|
||||
|
||||
public void setRandomTagline(String randomTagline) {
|
||||
this.randomTagline = randomTagline;
|
||||
}
|
||||
|
||||
public List<String> getGenres() {
|
||||
return genres;
|
||||
}
|
||||
|
||||
public void setGenres(List<String> genres) {
|
||||
this.genres = genres;
|
||||
}
|
||||
|
||||
public String getCertificate() {
|
||||
return certificate;
|
||||
}
|
||||
|
||||
public void setCertificate(String certificate) {
|
||||
this.certificate = certificate;
|
||||
}
|
||||
|
||||
public List<Link> getOfficialSites() {
|
||||
return officialSites;
|
||||
}
|
||||
|
||||
public void setOfficialSites(List<Link> officialSites) {
|
||||
this.officialSites = officialSites;
|
||||
}
|
||||
|
||||
public List<String> getCountries() {
|
||||
return countries;
|
||||
}
|
||||
|
||||
public void setCountries(List<String> countries) {
|
||||
this.countries = countries;
|
||||
}
|
||||
|
||||
public List<String> getLanguages() {
|
||||
return languages;
|
||||
}
|
||||
|
||||
public void setLanguages(List<String> languages) {
|
||||
this.languages = languages;
|
||||
}
|
||||
|
||||
public String getReleaseDate() {
|
||||
return releaseDate;
|
||||
}
|
||||
|
||||
public void setReleaseDate(String releaseDate) {
|
||||
this.releaseDate = releaseDate;
|
||||
}
|
||||
|
||||
public String getBudget() {
|
||||
return budget;
|
||||
}
|
||||
|
||||
public void setBudget(String budget) {
|
||||
this.budget = budget;
|
||||
}
|
||||
|
||||
public String getCumulativeWorldwideGross() {
|
||||
return cumulativeWorldwideGross;
|
||||
}
|
||||
|
||||
public void setCumulativeWorldwideGross(String cumulativeWorldwideGross) {
|
||||
this.cumulativeWorldwideGross = cumulativeWorldwideGross;
|
||||
}
|
||||
|
||||
public String getRuntime() {
|
||||
return runtime;
|
||||
}
|
||||
|
||||
public void setRuntime(String runtime) {
|
||||
this.runtime = runtime;
|
||||
}
|
||||
|
||||
public List<SoundMix> getSoundMixes() {
|
||||
return soundMixes;
|
||||
}
|
||||
|
||||
public void setSoundMixes(List<SoundMix> soundMixes) {
|
||||
this.soundMixes = soundMixes;
|
||||
}
|
||||
|
||||
public String getColor() {
|
||||
return color;
|
||||
}
|
||||
|
||||
public void setColor(String color) {
|
||||
this.color = color;
|
||||
}
|
||||
|
||||
public String getAspectRatio() {
|
||||
return aspectRatio;
|
||||
}
|
||||
|
||||
public void setAspectRatio(String aspectRatio) {
|
||||
this.aspectRatio = aspectRatio;
|
||||
}
|
||||
|
||||
public Map<String, String> getAkas() {
|
||||
return akas;
|
||||
}
|
||||
|
|
|
@ -11,6 +11,22 @@ public enum MovieDataType implements DataType {
|
|||
TITLE("title"),
|
||||
ORIGINAL_TITLE("original_title"),
|
||||
YEAR("year"),
|
||||
POSTER("poster"),
|
||||
STORYLINE("storyline"),
|
||||
RANDOM_TAGLINE("random_tagline"),
|
||||
GENRES("genres"),
|
||||
CERTIFICATE("certificate"),
|
||||
OFFICIAL_SITES("official_sites"),
|
||||
COUNTRIES("countries"),
|
||||
LANGUAGES("languages"),
|
||||
RELEASE_DATE("release_date"),
|
||||
BUDGET("budget"),
|
||||
CUMULATIVE_WORLDWIDE_GROSS("cumulative worldwide gross"),
|
||||
RUNTIME("runtime"),
|
||||
SOUND_MIXES("sound_mixes"),
|
||||
COLOR("color"),
|
||||
ASPECT_RATIO("aspect_ratio"),
|
||||
TAGLINES("taglines"),
|
||||
AKAS("akas")
|
||||
|
||||
;
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
package ru.bvn13.imdbspider.imdb.accessories;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 14.01.2019
|
||||
*/
|
||||
public class Link {
|
||||
|
||||
private String url;
|
||||
private String title;
|
||||
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public Link setUrl(String url) {
|
||||
this.url = url;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public Link setTitle(String title) {
|
||||
this.title = title;
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
package ru.bvn13.imdbspider.imdb.accessories;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 14.01.2019
|
||||
*/
|
||||
public class SoundMix {
|
||||
|
||||
private String name;
|
||||
private String description;
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public SoundMix setName(String name) {
|
||||
this.name = name;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
public SoundMix setDescription(String description) {
|
||||
this.description = description;
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -18,10 +18,6 @@ public interface ApiFactory {
|
|||
|
||||
List<Task> createTasksForSearchMovieByTitle(String title, int maxCount, EnumSet<MovieDataType> dataTypes) throws ImdbSpiderException;
|
||||
|
||||
default List<Task> createTasksForSearchMovieByTitle(String title, int maxCount, MovieDataType... dataTypes) throws ImdbSpiderException {
|
||||
return createTasksForSearchMovieByTitle(title, maxCount, EnumSet.copyOf(Arrays.asList(dataTypes)));
|
||||
}
|
||||
|
||||
Task taskByDataType(DataType dataType) throws DataTypeNotSupportedException;
|
||||
|
||||
void fillUpImdbObject(ImdbObject imdbObject, Task task);
|
||||
|
|
|
@ -1,10 +1,15 @@
|
|||
package ru.bvn13.imdbspider.spider.api.v1_0;
|
||||
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import ru.bvn13.imdbspider.exceptions.ImdbSpiderException;
|
||||
import ru.bvn13.imdbspider.exceptions.api.DataTypeNotSupportedException;
|
||||
import ru.bvn13.imdbspider.exceptions.processor.HtmlProcessorException;
|
||||
import ru.bvn13.imdbspider.imdb.*;
|
||||
import ru.bvn13.imdbspider.imdb.accessories.Link;
|
||||
import ru.bvn13.imdbspider.imdb.accessories.SoundMix;
|
||||
import ru.bvn13.imdbspider.spider.api.ApiFactory;
|
||||
import ru.bvn13.imdbspider.spider.processor.HtmlProcessor;
|
||||
import ru.bvn13.imdbspider.spider.tasker.Task;
|
||||
|
||||
import java.net.URLEncoder;
|
||||
|
@ -12,15 +17,18 @@ import java.nio.charset.Charset;
|
|||
import java.util.ArrayList;
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* @author boyko_vn at 09.01.2019
|
||||
*
|
||||
* IMDB :: SPIDER :: API :: version 1.0 (started 09.01.2019)
|
||||
*/
|
||||
public class ApiFactory_1_0 implements ApiFactory {
|
||||
|
||||
private static final String URL_MAIN = "https://www.imdb.com";
|
||||
public static final String URL_MAIN = "https://www.imdb.com";
|
||||
|
||||
private static final String URL_SEARCH_TITLE = "https://www.imdb.com/find?ref_=nv_sr_fn&q={{title}}&s=tt";
|
||||
|
||||
|
@ -29,6 +37,90 @@ public class ApiFactory_1_0 implements ApiFactory {
|
|||
private EnumSet<MovieDataType> defaultMovieDataTypeSet = EnumSet.of(MovieDataType.ID, MovieDataType.TITLE, MovieDataType.YEAR);
|
||||
private EnumSet<MovieDataType> movieDataTypeSet;
|
||||
|
||||
private HtmlProcessor htmlProcessor;
|
||||
|
||||
private static class POSTPROCESS {
|
||||
|
||||
static final BiConsumer<Task, String> GET_TEXT_OF_FIRST_ELEMENT = (task, s) -> {
|
||||
task.setResultType(String.class);
|
||||
if (task.getCssSelectorResult().size() > 0) {
|
||||
task.setResult(task.getCssSelectorResult().first().text().trim());
|
||||
} else {
|
||||
task.setResult("");
|
||||
}
|
||||
};
|
||||
|
||||
static final BiConsumer<Task, String> GET_OWN_TEXT_OF_FIRST_ELEMENT = (task, s) -> {
|
||||
task.setResultType(String.class);
|
||||
if (task.getCssSelectorResult().size() > 0) {
|
||||
task.setResult(task.getCssSelectorResult().first().ownText().trim());
|
||||
} else {
|
||||
task.setResult("");
|
||||
}
|
||||
};
|
||||
|
||||
static final BiConsumer<Task, String> GET_WHOLE_TEXT_OF_FIRST_ELEMENT = (task, s) -> {
|
||||
task.setResultType(String.class);
|
||||
if (task.getCssSelectorResult().size() > 0) {
|
||||
task.setResult(task.getCssSelectorResult().first().wholeText().trim());
|
||||
} else {
|
||||
task.setResult("");
|
||||
}
|
||||
};
|
||||
|
||||
static final BiConsumer<Task, String> GET_OWN_TEXT_OF_PARENT_MODE = (task, s) -> {
|
||||
task.setResultType(String.class);
|
||||
if (task.getCssSelectorResult().size() > 0) {
|
||||
task.setResult(task.getCssSelectorResult().first().parent().ownText().trim());
|
||||
} else {
|
||||
task.setResult("");
|
||||
}
|
||||
};
|
||||
|
||||
static final BiConsumer<Task, String> COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE = (task, s) -> {
|
||||
task.setResultType(List.class);
|
||||
List<String> titles = new ArrayList<>();
|
||||
if (task.getCssSelectorResult().size() > 0) {
|
||||
for (Element title : task.getCssSelectorResult().first().parent().select("a")) {
|
||||
titles.add(title.text().trim());
|
||||
}
|
||||
}
|
||||
task.setResult(titles);
|
||||
};
|
||||
|
||||
static final BiConsumer<Task, String> COLLECT_ALL_NESTED_LINKS_OF_PARENT_NODE = (task, s) -> {
|
||||
task.setResultType(List.class);
|
||||
List<Link> titles = new ArrayList<>();
|
||||
if (task.getCssSelectorResult().size() > 0) {
|
||||
for (Element link : task.getCssSelectorResult().first().parent().select("a")) {
|
||||
final String url = link.attr("href").trim();
|
||||
titles.add(new Link()
|
||||
.setTitle(link.text().trim())
|
||||
.setUrl((url.startsWith("/") ? String.format("%s%s", URL_MAIN, url) : url))
|
||||
);
|
||||
}
|
||||
}
|
||||
task.setResult(titles);
|
||||
};
|
||||
|
||||
static final BiConsumer<Task, String> GET_TITLE_OF_FIRST_LINK_IN_PARENT_MODE = (task, s) -> {
|
||||
task.setResultType(String.class);
|
||||
task.setResult("");
|
||||
if (task.getCssSelectorResult().size() > 0) {
|
||||
Elements links = task.getCssSelectorResult().first().parent().select("a");
|
||||
if (links.size() > 0) {
|
||||
task.setResult(links.first().text().trim());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
|
||||
public ApiFactory_1_0(HtmlProcessor htmlProcessor) {
|
||||
this.htmlProcessor = htmlProcessor;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Task> createTasksForSearchMovieByTitle(String title, int maxCount, EnumSet<MovieDataType> dataTypes) throws ImdbSpiderException {
|
||||
|
||||
|
@ -92,21 +184,11 @@ public class ApiFactory_1_0 implements ApiFactory {
|
|||
break;
|
||||
case TITLE:
|
||||
t.setCssSelector("#title-overview-widget > div.vital > div.title_block > div > div.titleBar > div.title_wrapper > h1");
|
||||
t.setPostprocess((task, s) -> {
|
||||
task.setResultType(String.class);
|
||||
task.setResult(task.getCssSelectorResult().first().wholeText().trim());
|
||||
});
|
||||
t.setPostprocess(POSTPROCESS.GET_WHOLE_TEXT_OF_FIRST_ELEMENT);
|
||||
break;
|
||||
case ORIGINAL_TITLE:
|
||||
t.setCssSelector("#title-overview-widget > div.vital > div.title_block > div > div.titleBar > div.title_wrapper > div.originalTitle");
|
||||
t.setPostprocess((task, s) -> {
|
||||
task.setResultType(String.class);
|
||||
if (task.getCssSelectorResult().size() > 0) {
|
||||
task.setResult(task.getCssSelectorResult().first().ownText());
|
||||
} else {
|
||||
task.setResult("");
|
||||
}
|
||||
});
|
||||
t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_FIRST_ELEMENT);
|
||||
break;
|
||||
case YEAR:
|
||||
t.setCssSelector("#titleYear > a");
|
||||
|
@ -123,6 +205,115 @@ public class ApiFactory_1_0 implements ApiFactory {
|
|||
}
|
||||
});
|
||||
break;
|
||||
case POSTER:
|
||||
t.setCssSelector("#title-overview-widget > div.vital > div.slate_wrapper > div.poster > a > img");
|
||||
t.setPostprocess((task, s) -> {
|
||||
task.setResultType(String.class);
|
||||
if (task.getCssSelectorResult().size() > 0) {
|
||||
task.setResult(task.getCssSelectorResult().first().attr("src"));
|
||||
} else {
|
||||
task.setResult("");
|
||||
}
|
||||
});
|
||||
break;
|
||||
case STORYLINE:
|
||||
t.setCssSelector("#titleStoryLine > div:nth-child(3) > p > span");
|
||||
t.setPostprocess(POSTPROCESS.GET_TEXT_OF_FIRST_ELEMENT);
|
||||
break;
|
||||
case RANDOM_TAGLINE:
|
||||
t.setCssSelector("#titleStoryLine > div > h4:contains(Taglines)"); //#titleStoryLine > div:nth-child(8) > h4
|
||||
t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE);
|
||||
break;
|
||||
case GENRES:
|
||||
t.setCssSelector("#titleStoryLine > div > h4:contains(Genres)");
|
||||
t.setPostprocess(POSTPROCESS.COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE);
|
||||
break;
|
||||
case CERTIFICATE:
|
||||
t.setCssSelector("#titleStoryLine > div > h4:contains(Certificate)");
|
||||
t.setPostprocess((task, s) -> {
|
||||
task.setResultType(String.class);
|
||||
if (task.getCssSelectorResult().size() > 0) {
|
||||
task.setResult(task.getCssSelectorResult().first().parent().select("span:nth-child(2)").first().text().trim());
|
||||
}
|
||||
});
|
||||
break;
|
||||
case OFFICIAL_SITES:
|
||||
t.setCssSelector("#titleDetails > div > h4:contains(Official Sites)");
|
||||
t.setPostprocess(POSTPROCESS.COLLECT_ALL_NESTED_LINKS_OF_PARENT_NODE);
|
||||
break;
|
||||
case COUNTRIES:
|
||||
t.setCssSelector("#titleDetails > div > h4:contains(Country)");
|
||||
t.setPostprocess(POSTPROCESS.COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE);
|
||||
break;
|
||||
case LANGUAGES:
|
||||
t.setCssSelector("#titleDetails > div > h4:contains(Language)");
|
||||
t.setPostprocess(POSTPROCESS.COLLECT_TITLES_OF_ALL_NESTED_LINKS_OF_PARENT_NODE);
|
||||
break;
|
||||
case RELEASE_DATE:
|
||||
t.setCssSelector("#titleDetails > div > h4:contains(Release Date)");
|
||||
t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE);
|
||||
break;
|
||||
case BUDGET:
|
||||
t.setCssSelector("#titleDetails > div > h4:contains(Budget)");
|
||||
t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE);
|
||||
break;
|
||||
case CUMULATIVE_WORLDWIDE_GROSS:
|
||||
t.setCssSelector("#titleDetails > div > h4:contains(Cumulative Worldwide Gross)");
|
||||
t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE);
|
||||
break;
|
||||
case RUNTIME:
|
||||
t.setCssSelector("#titleDetails > div > h4:contains(Runtime)");
|
||||
t.setPostprocess((task, s) -> {
|
||||
task.setResultType(String.class);
|
||||
task.setResult("");
|
||||
if (task.getCssSelectorResult().size() > 0) {
|
||||
task.setResult(task.getCssSelectorResult().first().parent().text().replace("Runtime:", "").trim());
|
||||
}
|
||||
});
|
||||
break;
|
||||
case SOUND_MIXES:
|
||||
t.setCssSelector("#titleDetails > div > h4:contains(Sound Mix)");
|
||||
t.setPostprocess((task, s) -> {
|
||||
task.setResultType(List.class);
|
||||
List<SoundMix> titles = new ArrayList<>();
|
||||
if (task.getCssSelectorResult().size() > 0) {
|
||||
String html = task.getCssSelectorResult().first().parent().html();
|
||||
html = html.replace("\r", "");
|
||||
html = html.replace("\n", "");
|
||||
html = html.replace("<span class=\"ghost\">|</span>", "|");
|
||||
|
||||
// remove header: <h4 class="inline">Sound Mix:</h4>
|
||||
html = html.replaceAll("(<h4.+\\/h4>)", "");
|
||||
|
||||
String[] lines = html.split("\\|");
|
||||
|
||||
for (int i=0; i<lines.length; i++) {
|
||||
try {
|
||||
Elements els = htmlProcessor.process(String.format("<div>%s</div>", lines[i]), "div");
|
||||
if (els.size() > 0) {
|
||||
Element div = els.first();
|
||||
Element link = div.selectFirst("a");
|
||||
titles.add(new SoundMix()
|
||||
.setName(link.text().trim())
|
||||
.setDescription(div.ownText())
|
||||
);
|
||||
}
|
||||
} catch (HtmlProcessorException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
task.setResult(titles);
|
||||
});
|
||||
break;
|
||||
case COLOR:
|
||||
t.setCssSelector("#titleDetails > div > h4:contains(Color)");
|
||||
t.setPostprocess(POSTPROCESS.GET_TITLE_OF_FIRST_LINK_IN_PARENT_MODE);
|
||||
break;
|
||||
case ASPECT_RATIO:
|
||||
t.setCssSelector("#titleDetails > div > h4:contains(Aspect Ratio)");
|
||||
t.setPostprocess(POSTPROCESS.GET_OWN_TEXT_OF_PARENT_MODE);
|
||||
break;
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
@ -184,6 +375,66 @@ public class ApiFactory_1_0 implements ApiFactory {
|
|||
movie.setYear((Integer) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case POSTER:
|
||||
movie.setPosterLink((String) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case STORYLINE:
|
||||
movie.setStoryline((String) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case RANDOM_TAGLINE:
|
||||
movie.setRandomTagline((String) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case GENRES:
|
||||
movie.setGenres((List<String>) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case CERTIFICATE:
|
||||
movie.setCertificate((String) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case OFFICIAL_SITES:
|
||||
movie.setOfficialSites((List<Link>) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case COUNTRIES:
|
||||
movie.setCountries((List<String>) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case LANGUAGES:
|
||||
movie.setLanguages((List<String>) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case RELEASE_DATE:
|
||||
movie.setReleaseDate((String) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case BUDGET:
|
||||
movie.setBudget((String) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case CUMULATIVE_WORLDWIDE_GROSS:
|
||||
movie.setCumulativeWorldwideGross((String) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case RUNTIME:
|
||||
movie.setRuntime((String) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case SOUND_MIXES:
|
||||
movie.setSoundMixes((List<SoundMix>) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case COLOR:
|
||||
movie.setColor((String) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
case ASPECT_RATIO:
|
||||
movie.setAspectRatio((String) task.getResult());
|
||||
isDone = true;
|
||||
break;
|
||||
}
|
||||
|
||||
if (isDone) {
|
||||
|
|
|
@ -34,7 +34,7 @@ public class HtmlExtractor {
|
|||
: resultString;
|
||||
}
|
||||
|
||||
public String getHtml(String url) throws HtmlExtractorException {
|
||||
public String getHtml(String url, Map<String, String> headers) throws HtmlExtractorException {
|
||||
|
||||
URL obj = null;
|
||||
|
||||
|
@ -52,6 +52,9 @@ public class HtmlExtractor {
|
|||
}
|
||||
|
||||
connection.setRequestProperty("Accept", "text/html");
|
||||
for (Map.Entry<String, String> header : headers.entrySet()) {
|
||||
connection.setRequestProperty(header.getKey(), header.getValue());
|
||||
}
|
||||
|
||||
try {
|
||||
connection.setRequestMethod("GET");
|
||||
|
|
|
@ -3,6 +3,7 @@ package ru.bvn13.imdbspider.spider.tasker;
|
|||
import ru.bvn13.imdbspider.exceptions.extractor.HtmlExtractorException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.*;
|
||||
|
@ -12,14 +13,13 @@ import java.util.concurrent.*;
|
|||
*/
|
||||
public class Manager {
|
||||
|
||||
private ExecutorService executor;
|
||||
private Map<String, String> httpRequestHeaders = new HashMap<>();
|
||||
|
||||
public Manager() {
|
||||
this.executor = Executors.newCachedThreadPool();
|
||||
public void addHttpRequestHeader(String key, String value) {
|
||||
this.httpRequestHeaders.put(key, value);
|
||||
}
|
||||
|
||||
|
||||
public void processTasks(List<Task> allTasks) throws ExecutionException, InterruptedException {
|
||||
public void processTasks(List<Task> allTasks) {
|
||||
|
||||
Map<String, List<Task>> groupedTasks = new ConcurrentHashMap<>(allTasks.size());
|
||||
|
||||
|
@ -39,7 +39,7 @@ public class Manager {
|
|||
groupedTasks.entrySet().parallelStream().forEach(stringListEntry -> {
|
||||
Worker w = new Worker(stringListEntry.getKey(), stringListEntry.getValue());
|
||||
try {
|
||||
w.run();
|
||||
w.run(httpRequestHeaders);
|
||||
} catch (HtmlExtractorException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
|
|
@ -7,7 +7,9 @@ import ru.bvn13.imdbspider.spider.extractor.HtmlExtractor;
|
|||
import ru.bvn13.imdbspider.spider.processor.HtmlProcessor;
|
||||
import ru.bvn13.imdbspider.spider.processor.JsoupHtmlProcessor;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.*;
|
||||
|
||||
/**
|
||||
|
@ -21,22 +23,17 @@ public class Worker {
|
|||
private final HtmlExtractor htmlExtractor;
|
||||
private final HtmlProcessor htmlProcessor;
|
||||
|
||||
private final ExecutorService executor;
|
||||
|
||||
public Worker(String url, List<Task> tasks) {
|
||||
this.url = url;
|
||||
this.tasks = tasks;
|
||||
|
||||
this.htmlExtractor = new HtmlExtractor();
|
||||
this.htmlProcessor = new JsoupHtmlProcessor();
|
||||
|
||||
this.executor = Executors.newCachedThreadPool();
|
||||
}
|
||||
|
||||
public Boolean run(Map<String, String> httpRequestHeaders) throws HtmlExtractorException {
|
||||
|
||||
public Boolean run() throws HtmlExtractorException {
|
||||
|
||||
final String html = htmlExtractor.getHtml(url);
|
||||
final String html = htmlExtractor.getHtml(url, httpRequestHeaders);
|
||||
|
||||
tasks.parallelStream().forEach(task -> {
|
||||
|
||||
|
|
|
@ -10,26 +10,123 @@ import ru.bvn13.imdbspider.exceptions.ImdbSpiderException;
|
|||
import ru.bvn13.imdbspider.imdb.Movie;
|
||||
import ru.bvn13.imdbspider.imdb.MovieDataType;
|
||||
import ru.bvn13.imdbspider.imdb.MovieList;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import ru.bvn13.imdbspider.imdb.accessories.SoundMix;
|
||||
|
||||
|
||||
public class MovieSearchTest
|
||||
{
|
||||
public class MovieSearchTest {
|
||||
|
||||
private static final String TERMINATOR_STORYLINE = "A cyborg is sent from the future on a deadly mission. He has to kill Sarah Connor, a young woman whose life will have a great significance in years to come. Sarah has only one protector - Kyle Reese - also sent from the future. The Terminator uses his exceptional intelligence and strength to find Sarah, but is there any way to stop the seemingly indestructible cyborg ?";
|
||||
private static final String TERMINATOR_POSTER_LINK = "https://m.media-amazon.com/images/M/MV5BYTViNzMxZjEtZGEwNy00MDNiLWIzNGQtZDY2MjQ1OWViZjFmXkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_UX182_CR0,0,182,268_AL_.jpg";
|
||||
|
||||
private static ImdbSpider spider;
|
||||
|
||||
@BeforeClass
|
||||
public static void initClass() {
|
||||
spider = ImdbSpider.withApi_1_0();
|
||||
spider = ImdbSpider.withApi_1_0()
|
||||
.addHttpRequestHeader("Content-Language", "ru-RU");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSearchTerminator() throws ImdbSpiderException {
|
||||
MovieList result = spider.searchMovieByTitle("Терминатор", 5, EnumSet.of(MovieDataType.ID, MovieDataType.TITLE, MovieDataType.ORIGINAL_TITLE, MovieDataType.YEAR));
|
||||
MovieList result = spider.searchMovieByTitle("Терминатор", 5,
|
||||
MovieDataType.ID,
|
||||
MovieDataType.TITLE,
|
||||
MovieDataType.ORIGINAL_TITLE,
|
||||
MovieDataType.YEAR,
|
||||
MovieDataType.STORYLINE,
|
||||
MovieDataType.RANDOM_TAGLINE,
|
||||
MovieDataType.GENRES,
|
||||
MovieDataType.CERTIFICATE,
|
||||
MovieDataType.OFFICIAL_SITES,
|
||||
MovieDataType.COUNTRIES,
|
||||
MovieDataType.LANGUAGES,
|
||||
MovieDataType.RELEASE_DATE,
|
||||
MovieDataType.BUDGET,
|
||||
MovieDataType.CUMULATIVE_WORLDWIDE_GROSS,
|
||||
MovieDataType.RUNTIME,
|
||||
MovieDataType.SOUND_MIXES,
|
||||
MovieDataType.COLOR,
|
||||
MovieDataType.ASPECT_RATIO,
|
||||
MovieDataType.POSTER
|
||||
);
|
||||
|
||||
|
||||
assertTrue(result.getMovies().size() > 0);
|
||||
Movie movie = result.getMovies().get(0);
|
||||
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.ID));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.TITLE));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.ORIGINAL_TITLE));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.YEAR));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.STORYLINE));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.RANDOM_TAGLINE));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.GENRES));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.CERTIFICATE));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.OFFICIAL_SITES));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.COUNTRIES));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.LANGUAGES));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.RELEASE_DATE));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.BUDGET));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.CUMULATIVE_WORLDWIDE_GROSS));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.RUNTIME));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.SOUND_MIXES));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.COLOR));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.ASPECT_RATIO));
|
||||
assertTrue(movie.isDataTypeRetrieved(MovieDataType.POSTER));
|
||||
|
||||
assertEquals("0088247", movie.getId());
|
||||
assertEquals("The Terminator", movie.getOriginalTitle());
|
||||
assertEquals(Integer.valueOf(1984), movie.getYear());
|
||||
|
||||
assertEquals(TERMINATOR_STORYLINE, movie.getStoryline());
|
||||
//assertEquals(TERMINATOR_TAGLINES, movie.getRandomTagline());
|
||||
assertTrue(movie.getGenres().contains("Action"));
|
||||
assertTrue(movie.getGenres().contains("Sci-Fi"));
|
||||
|
||||
assertEquals("16+", movie.getCertificate());
|
||||
|
||||
//assertTrue(movie.getOfficialSites().contains("Facebook"));
|
||||
assertTrue(movie.getOfficialSites().size() > 0);
|
||||
assertEquals("Facebook", movie.getOfficialSites().get(0).getTitle());
|
||||
//assertEquals(TERMINATOR_FACEBOOK_URL, movie.getOfficialSites().get(0).getUrl()); //not comparable, dynamic link
|
||||
|
||||
assertTrue(movie.getCountries().size() > 0);
|
||||
assertTrue(movie.getCountries().contains("UK"));
|
||||
assertTrue(movie.getCountries().contains("USA"));
|
||||
|
||||
assertTrue(movie.getLanguages().size() > 0);
|
||||
assertTrue(movie.getLanguages().contains("English"));
|
||||
assertTrue(movie.getLanguages().contains("Spanish"));
|
||||
|
||||
assertEquals("26 October 1984 (USA)", movie.getReleaseDate());
|
||||
|
||||
assertEquals("$6,400,000", movie.getBudget());
|
||||
assertEquals("$40,000,000", movie.getCumulativeWorldwideGross());
|
||||
|
||||
assertEquals("107 min", movie.getRuntime());
|
||||
|
||||
//sound mixes
|
||||
assertTrue(movie.getSoundMixes().size() > 0);
|
||||
boolean hasMono=false, hasDolby=false, hasDTS=false;
|
||||
String descrMono="", descrDolby="", descrDTS="";
|
||||
for (SoundMix soundMix : movie.getSoundMixes()) {
|
||||
switch (soundMix.getName()) {
|
||||
case "Mono" : hasMono = true; descrMono = soundMix.getDescription(); break;
|
||||
case "Dolby" : hasDolby = true; descrDolby = soundMix.getDescription(); break;
|
||||
case "DTS" : hasDTS = true; descrDTS = soundMix.getDescription(); break;
|
||||
}
|
||||
}
|
||||
assertTrue(hasMono);
|
||||
assertTrue(hasDolby);
|
||||
assertTrue(hasDTS);
|
||||
assertEquals("(original release)", descrMono);
|
||||
assertEquals("(DVD Re-Release)", descrDolby);
|
||||
assertEquals("(DTS HD Master Audio)", descrDTS);
|
||||
|
||||
assertEquals("Color", movie.getColor());
|
||||
|
||||
assertEquals("1.85 : 1", movie.getAspectRatio());
|
||||
|
||||
assertEquals(TERMINATOR_POSTER_LINK, movie.getPosterLink());
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue