diff --git a/build.gradle b/build.gradle index 7675eed..41140f4 100644 --- a/build.gradle +++ b/build.gradle @@ -35,7 +35,7 @@ subprojects { } group = 'com.bvn13.covid19' - version = '0.1.1' + version = '0.1.2' apply plugin: 'java' apply plugin: 'idea' diff --git a/buildit b/buildit index c442435..0b0e3bb 100644 --- a/buildit +++ b/buildit @@ -1,6 +1,6 @@ #!/bin/bash -version=0.1.1 +version=0.1.2 ./gradlew :covid19-db-migrator:clean :covid19-db-migrator:assemble ./gradlew :covid19-site:clean :covid19-site:assemble diff --git a/covid19-scheduler/build.gradle b/covid19-scheduler/build.gradle index 045a60e..97f0940 100644 --- a/covid19-scheduler/build.gradle +++ b/covid19-scheduler/build.gradle @@ -7,6 +7,12 @@ dependencies { implementation "org.apache.camel.springboot:camel-spring-boot-starter:${camelVersion}" implementation 'org.springframework.boot:spring-boot-starter-data-jpa' + // https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-java + implementation 'org.seleniumhq.selenium:selenium-java:3.141.59' + // https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-firefox-driver + implementation 'org.seleniumhq.selenium:selenium-firefox-driver:3.141.59' + + compileOnly "org.springframework.boot:spring-boot-configuration-processor:${springBootVersion}" annotationProcessor "org.springframework.boot:spring-boot-configuration-processor:${springBootVersion}" diff --git a/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/Covid19SchedulerApplication.java b/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/Covid19SchedulerApplication.java index 02c3de5..464e037 100644 --- a/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/Covid19SchedulerApplication.java +++ b/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/Covid19SchedulerApplication.java @@ -23,6 +23,10 @@ import org.springframework.boot.autoconfigure.SpringBootApplication; import 
org.springframework.boot.context.properties.EnableConfigurationProperties;
 import org.springframework.context.annotation.Import;
 import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
+import org.springframework.util.Assert;
+
+import java.io.File;
+import java.nio.file.Files;
 
 @EnableJpaRepositories("com.bvn13.covid19.scheduler")
 @SpringBootApplication
@@ -33,6 +37,17 @@ import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
 public class Covid19SchedulerApplication {
 
     public static void main(String[] args) {
+
+        String geckoFilename = ResourceManager.extract("/files/geckodriver");
+        Assert.notNull(geckoFilename, "geckodriver not found inside JAR");
+
+        File file = new File(geckoFilename);
+        if (!file.canExecute()) {
+            Assert.isTrue(file.setExecutable(true, true), "Could not make executable: "+geckoFilename);
+        }
+
+        System.setProperty("webdriver.gecko.driver", geckoFilename);
+
         SpringApplication.run(Covid19SchedulerApplication.class, args);
     }
 
diff --git a/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/ResourceManager.java b/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/ResourceManager.java
new file mode 100644
index 0000000..5e11af3
--- /dev/null
+++ b/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/ResourceManager.java
@@ -0,0 +1,72 @@
+package com.bvn13.covid19.scheduler;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Extracts classpath resources (for example binaries bundled inside the JAR) into temp files.
+ *
+ * @see 'https://coderwall.com/p/d0kssw/resource-extraction-from-jar-to-local-file-system'
+ */
+public class ResourceManager {
+
+    // Maps the resource path inside the jar to the absolute path of its extracted temp file
+    private static final Map<String, String> fileCache = new ConcurrentHashMap<>();
+
+    /**
+     * Extract the specified resource from inside the jar to the local file system.
+     * Repeated calls with the same path return the file extracted by the first call.
+     *
+     * @param jarFilePath absolute path to the resource
+     * @return full file system path if file successfully extracted, else null on error
+     */
+    public static String extract(String jarFilePath) {
+
+        if (jarFilePath == null) {
+            return null;
+        }
+
+        // See if we already have the file. The lookup must go by key:
+        // Hashtable#contains(Object) searches the values, so the old check never hit.
+        String cachedPath = fileCache.get(jarFilePath);
+        if (cachedPath != null) {
+            return cachedPath;
+        }
+
+        // try-with-resources closes the resource stream even if the copy fails half-way
+        try (InputStream fileStream = ResourceManager.class.getResourceAsStream(jarFilePath)) {
+
+            // Was the resource found?
+            if (fileStream == null) {
+                return null;
+            }
+
+            // Grab the file name (everything after the last slash)
+            String fileName = jarFilePath.substring(jarFilePath.lastIndexOf('/') + 1);
+
+            // Create our temp file; the resource name becomes the suffix
+            File tempFile = File.createTempFile("covid19-", fileName);
+
+            // Set this file to be deleted on VM exit
+            tempFile.deleteOnExit();
+
+            // Write the resource to the temp file (createTempFile already created it empty)
+            Files.copy(fileStream, tempFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
+
+            // Store this file in the cache only after it was written completely
+            String extractedPath = tempFile.getAbsolutePath();
+            fileCache.put(jarFilePath, extractedPath);
+
+            return extractedPath;
+
+        } catch (IOException e) {
+            return null;
+        }
+    }
+
+}
diff --git a/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/updater/stopcoronovirusrf/StopcoronovirusRfDataRetriever.java b/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/updater/stopcoronovirusrf/StopcoronovirusRfDataRetriever.java
index 7b4674b..e39d572 100644
--- a/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/updater/stopcoronovirusrf/StopcoronovirusRfDataRetriever.java
+++ 
b/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/updater/stopcoronovirusrf/StopcoronovirusRfDataRetriever.java
@@ -1,75 +1,16 @@
-/*
-Copyright [2020] [bvn13]
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
- */
-
 package com.bvn13.covid19.scheduler.updater.stopcoronovirusrf;
 
-import com.bvn13.covid19.scheduler.updater.stopcoronovirusrf.model.RowData;
 import org.apache.camel.Exchange;
-import org.apache.camel.Handler;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
-import org.springframework.beans.factory.annotation.Value;
-import org.springframework.stereotype.Component;
-
-import java.util.ArrayList;
-import java.util.List;
-
-@Component
-public class StopcoronovirusRfDataRetriever {
-
-    //private static final String URL = "https://стопкоронавирус.рф/";
-    private static final String HOST = "xn--80aesfpebagmfblc0a.xn--p1ai";
-    private static final String URL = "https://xn--80aesfpebagmfblc0a.xn--p1ai/";
-
-    @Value("${app.user-agent}")
-    private String userAgent;
-
-    @Handler
-    public void retrieveData(Exchange exchange) throws Exception {
-        Document doc = Jsoup.connect(URL)
-                .userAgent(userAgent)
-                .timeout(30*1000)
-                //.referrer("http://google.com")
-//                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
-//                .header("Accept-Encoding", "gzip, deflate, br")
-//                .header("Accept-Language", "ru-RU,ru;q=0.5")
-//                .header("Cache-Control", "no-cache")
-//                .header("Connection", "keep-alive")
-//                .header("Pragma", "no-cache")
-//                .header("Host", HOST)
-                .get();
-
-        Elements tableData = doc.select("div.d-map__list > table > tbody > tr");
-
-        List rows = new ArrayList<>(tableData.size());
-
-        for (Element row : tableData) {
-            rows.add(RowData.builder()
-                    .region(row.selectFirst("th").text())
-                    .sick(row.selectFirst("td > span.d-map__indicator_sick").parent().ownText())
-                    .healed(row.selectFirst("td > span.d-map__indicator_healed").parent().ownText())
-                    .died(row.selectFirst("td > span.d-map__indicator_die").parent().ownText())
-                    .build());
-        }
-
-        exchange.getIn().setHeader(StopcoronovirusRfUpdater.HEADER_DATE_OF_DATA, doc.selectFirst(".d-map__title span").ownText());
-        exchange.getIn().setBody(rows);
-
-    }
 
+/**
+ * Contract shared by the retrievers of the stopcoronavirus.rf data (Jsoup and Selenium variants).
+ */
+public interface StopcoronovirusRfDataRetriever {
+    /**
+     * Retrieves the data and publishes it on the given exchange.
+     *
+     * @param exchange Camel exchange handed to the retriever
+     * @throws Exception if the retrieval fails
+     */
+    void retrieveData(Exchange exchange) throws Exception;
 }
diff --git a/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/updater/stopcoronovirusrf/StopcoronovirusRfJsoupDataRetriever.java b/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/updater/stopcoronovirusrf/StopcoronovirusRfJsoupDataRetriever.java
new file mode 100644
index 0000000..eec7fa5
--- /dev/null
+++ b/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/updater/stopcoronovirusrf/StopcoronovirusRfJsoupDataRetriever.java
@@ -0,0 +1,85 @@
+/*
+Copyright [2020] [bvn13]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+
+package com.bvn13.covid19.scheduler.updater.stopcoronovirusrf;
+
+import com.bvn13.covid19.scheduler.updater.stopcoronovirusrf.model.RowData;
+import org.apache.camel.Exchange;
+import org.apache.camel.Handler;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Component;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Jsoup-based implementation: downloads the page at URL and parses the regional table from the
+ * returned HTML.
+ * NOTE(review): no @Component here although the import remains - presumably this variant is
+ * intentionally not registered as a bean; confirm.
+ */
+public class StopcoronovirusRfJsoupDataRetriever implements StopcoronovirusRfDataRetriever {
+
+    //private static final String URL = "https://стопкоронавирус.рф/";
+    private static final String HOST = "xn--80aesfpebagmfblc0a.xn--p1ai";
+    private static final String URL = "https://xn--80aesfpebagmfblc0a.xn--p1ai/";
+
+    @Value("${app.user-agent}")
+    private String userAgent;
+
+    /**
+     * Fetches the page (configured user agent, timeout(30*1000)), builds one RowData per row
+     * matched by "div.d-map__list > table > tbody > tr", then sets the date header and the
+     * row list as message body on the exchange.
+     */
+    @Handler
+    public void retrieveData(Exchange exchange) throws Exception {
+        Document doc = Jsoup.connect(URL)
+                .userAgent(userAgent)
+                .timeout(30*1000)
+                //.referrer("http://google.com")
+//                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
+//                .header("Accept-Encoding", "gzip, deflate, br")
+//                .header("Accept-Language", "ru-RU,ru;q=0.5")
+//                .header("Cache-Control", "no-cache")
+//                .header("Connection", "keep-alive")
+//                .header("Pragma", "no-cache")
+//                .header("Host", HOST)
+                .get();
+
+        Elements tableData = doc.select("div.d-map__list > table > tbody > tr");
+
+        List rows = new ArrayList<>(tableData.size());
+
+        for (Element row : tableData) {
+            rows.add(RowData.builder()
+                    .region(row.selectFirst("th").text())
+                    .sick(row.selectFirst("td > span.d-map__indicator_sick").parent().ownText())
+                    .healed(row.selectFirst("td > span.d-map__indicator_healed").parent().ownText())
+                    .died(row.selectFirst("td > span.d-map__indicator_die").parent().ownText())
+                    .build());
+        }
+
+        
exchange.getIn().setHeader(StopcoronovirusRfUpdater.HEADER_DATE_OF_DATA, doc.selectFirst(".d-map__title span").ownText());
+        exchange.getIn().setBody(rows);
+
+    }
+
+}
diff --git a/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/updater/stopcoronovirusrf/StopcoronovirusRfSeleniumDataRetriever.java b/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/updater/stopcoronovirusrf/StopcoronovirusRfSeleniumDataRetriever.java
new file mode 100644
index 0000000..22088d8
--- /dev/null
+++ b/covid19-scheduler/src/main/java/com/bvn13/covid19/scheduler/updater/stopcoronovirusrf/StopcoronovirusRfSeleniumDataRetriever.java
@@ -0,0 +1,106 @@
+/*
+Copyright [2020] [bvn13]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ */
+
+package com.bvn13.covid19.scheduler.updater.stopcoronovirusrf;
+
+import com.bvn13.covid19.scheduler.updater.stopcoronovirusrf.model.RowData;
+import org.apache.camel.Exchange;
+import org.apache.camel.Handler;
+import org.apache.commons.lang3.StringUtils;
+import org.openqa.selenium.By;
+import org.openqa.selenium.JavascriptExecutor;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.firefox.FirefoxBinary;
+import org.openqa.selenium.firefox.FirefoxDriver;
+import org.openqa.selenium.firefox.FirefoxOptions;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Component;
+import org.springframework.util.Assert;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Selenium-based implementation: renders the page in headless Firefox and reads the regional
+ * table from the live DOM.
+ */
+@Component
+public class StopcoronovirusRfSeleniumDataRetriever implements StopcoronovirusRfDataRetriever {
+
+    //private static final String URL = "https://стопкоронавирус.рф/";
+    private static final String HOST = "xn--80aesfpebagmfblc0a.xn--p1ai";
+    private static final String URL = "https://xn--80aesfpebagmfblc0a.xn--p1ai/information/";
+
+    @Value("${app.user-agent}")
+    private String userAgent;
+
+    /**
+     * Loads the page, converts every table row into a {@link RowData} and publishes the result
+     * on the exchange: the rows as message body, the date caption as HEADER_DATE_OF_DATA header.
+     *
+     * @param exchange Camel exchange that receives the header and the body
+     * @throws IllegalStateException if the table has no rows
+     * @throws IllegalArgumentException if a row has a blank region name
+     */
+    @Override
+    @Handler
+    public void retrieveData(Exchange exchange) throws Exception {
+        FirefoxBinary firefoxBinary = new FirefoxBinary();
+        FirefoxOptions options = new FirefoxOptions();
+        options.setBinary(firefoxBinary);
+        options.setHeadless(true); // <-- headless set here
+        WebDriver driver = new FirefoxDriver(options);
+        try {
+            driver.manage().window().maximize();
+            driver.get(URL);
+            List<WebElement> tableData = driver.findElements(By.cssSelector(".d-map__list > table > tbody > tr"));
+
+            if (tableData.isEmpty()) {
+                throw new IllegalStateException("Data not found!");
+            }
+
+            WebElement lastRow = driver.findElement(By.cssSelector(".d-map__list > table > tbody > tr:last-child"));
+
+            JavascriptExecutor js = (JavascriptExecutor) driver;
+            js.executeScript("window.scrollBy(0,10000)");
+            js.executeScript("arguments[0].scrollIntoView()",lastRow);
+//            js.executeScript("$(\".d-map__list > table > tbody\").animate({ scrollTop: \"10000px\" });");
+
+            List<RowData> rows = new ArrayList<>(tableData.size());
+
+            for (WebElement row : tableData) {
+                RowData rowData = RowData.builder()
+                        .region(row.findElement(By.cssSelector("th")).getAttribute("innerText"))
+                        .sick(row.findElement(By.cssSelector("td.col-sick")).getAttribute("innerText"))
+                        .healed(row.findElement(By.cssSelector("td.col-healed")).getAttribute("innerText"))
+                        .died(row.findElement(By.cssSelector("td.col-died")).getAttribute("innerText"))
+                        .build();
+                Assert.isTrue(StringUtils.isNotBlank(rowData.getRegion()), "Broken data found after " + rows.size() + " rows");
+                // Collect the validated row; without this the message body was always an empty list
+                rows.add(rowData);
+            }
+
+            exchange.getIn().setHeader(StopcoronovirusRfUpdater.HEADER_DATE_OF_DATA, driver.findElement(By.cssSelector(".cv-section__title small")).getText());
+            exchange.getIn().setBody(rows);
+
+        } finally {
+            driver.quit();
+        }
+    }
+
+}
diff --git a/covid19-scheduler/src/main/resources/application.yaml b/covid19-scheduler/src/main/resources/application.yaml
index 56ea950..421275e 100644
--- a/covid19-scheduler/src/main/resources/application.yaml
+++ b/covid19-scheduler/src/main/resources/application.yaml
@@ -44,6 +44,11 @@ spring:
         jdbc:
           lob:
             non_contextual_creation: true
+
+webdriver:
+  gecko:
+    driver: /home/bvn13/soft/geckodriver
+
 logging:
   level:
     root: info
diff --git a/covid19-scheduler/src/main/resources/files/geckodriver b/covid19-scheduler/src/main/resources/files/geckodriver
new file mode 100755
index 0000000..ff08a41
Binary files /dev/null and b/covid19-scheduler/src/main/resources/files/geckodriver differ