mirror of https://github.com/bvn13/covid19-ru.git
COVID-19 data retrieval changed from the JSoup engine to the Selenium (Firefox geckodriver) engine.
parent
3e377aa63d
commit
a3482987ed
|
@ -35,7 +35,7 @@ subprojects {
|
||||||
}
|
}
|
||||||
|
|
||||||
group = 'com.bvn13.covid19'
|
group = 'com.bvn13.covid19'
|
||||||
version = '0.1.1'
|
version = '0.1.2'
|
||||||
|
|
||||||
apply plugin: 'java'
|
apply plugin: 'java'
|
||||||
apply plugin: 'idea'
|
apply plugin: 'idea'
|
||||||
|
|
2
buildit
2
buildit
|
@ -1,6 +1,6 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
version=0.1.1
|
version=0.1.2
|
||||||
|
|
||||||
./gradlew :covid19-db-migrator:clean :covid19-db-migrator:assemble
|
./gradlew :covid19-db-migrator:clean :covid19-db-migrator:assemble
|
||||||
./gradlew :covid19-site:clean :covid19-site:assemble
|
./gradlew :covid19-site:clean :covid19-site:assemble
|
||||||
|
|
|
@ -7,6 +7,12 @@ dependencies {
|
||||||
implementation "org.apache.camel.springboot:camel-spring-boot-starter:${camelVersion}"
|
implementation "org.apache.camel.springboot:camel-spring-boot-starter:${camelVersion}"
|
||||||
implementation 'org.springframework.boot:spring-boot-starter-data-jpa'
|
implementation 'org.springframework.boot:spring-boot-starter-data-jpa'
|
||||||
|
|
||||||
|
// https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-java
|
||||||
|
implementation 'org.seleniumhq.selenium:selenium-java:3.141.59'
|
||||||
|
// https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-firefox-driver
|
||||||
|
implementation 'org.seleniumhq.selenium:selenium-firefox-driver:3.141.59'
|
||||||
|
|
||||||
|
|
||||||
compileOnly "org.springframework.boot:spring-boot-configuration-processor:${springBootVersion}"
|
compileOnly "org.springframework.boot:spring-boot-configuration-processor:${springBootVersion}"
|
||||||
annotationProcessor "org.springframework.boot:spring-boot-configuration-processor:${springBootVersion}"
|
annotationProcessor "org.springframework.boot:spring-boot-configuration-processor:${springBootVersion}"
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,10 @@ import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||||
import org.springframework.context.annotation.Import;
|
import org.springframework.context.annotation.Import;
|
||||||
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
||||||
|
import org.springframework.util.Assert;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
|
||||||
@EnableJpaRepositories("com.bvn13.covid19.scheduler")
|
@EnableJpaRepositories("com.bvn13.covid19.scheduler")
|
||||||
@SpringBootApplication
|
@SpringBootApplication
|
||||||
|
@ -33,6 +37,17 @@ import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
||||||
public class Covid19SchedulerApplication {
|
public class Covid19SchedulerApplication {
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
|
|
||||||
|
String geckoFilename = ResourceManager.extract("/files/geckodriver");
|
||||||
|
Assert.notNull(geckoFilename, "geckodriver not found inside JAR");
|
||||||
|
|
||||||
|
File file = new File(geckoFilename);
|
||||||
|
if (!file.canExecute()) {
|
||||||
|
Assert.isTrue(file.setExecutable(true, true), "Could not make executable: "+geckoFilename);
|
||||||
|
}
|
||||||
|
|
||||||
|
System.setProperty("webdriver.gecko.driver", geckoFilename);
|
||||||
|
|
||||||
SpringApplication.run(Covid19SchedulerApplication.class, args);
|
SpringApplication.run(Covid19SchedulerApplication.class, args);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,73 @@
|
||||||
|
package com.bvn13.covid19.scheduler;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.Hashtable;
|
||||||
|
|
||||||
|
/**
 * Extracts resources bundled on the classpath (e.g. inside the application JAR) into
 * temporary files on the local file system, remembering each extracted copy so that a
 * resource is only extracted once per JVM.
 *
 * @see 'https://coderwall.com/p/d0kssw/resource-extraction-from-jar-to-local-file-system'
 */
public class ResourceManager {

    // Maps a resource path (as passed to extract) to the absolute path of its extracted temp file
    private static Hashtable<String, String> fileCache = new Hashtable<String, String>();

    /**
     * Extract the specified resource from inside the jar to the local file system.
     *
     * <p>The extracted copy is a temp file that is scheduled for deletion on VM exit.
     * Repeated calls with the same path return the path of the first extracted copy.
     *
     * @param jarFilePath absolute path to the resource
     * @return full file system path if file successfully extracted, else null on error
     *         (null argument, resource not found, or I/O failure while copying)
     */
    public static String extract(String jarFilePath) {

        if (jarFilePath == null) {
            return null;
        }

        // See if we already have the file. The lookup must be by KEY:
        // Hashtable.contains(Object) searches the values and would never match here.
        String cachedPath = fileCache.get(jarFilePath);
        if (cachedPath != null) {
            return cachedPath;
        }

        // Not extracted yet. try-with-resources closes both streams even if copying fails.
        try (InputStream fileStream = ResourceManager.class.getResourceAsStream(jarFilePath)) {

            // Was the resource found?
            if (fileStream == null) {
                return null;
            }

            // The last path segment becomes the temp file's suffix
            String[] chopped = jarFilePath.split("\\/");
            if (chopped.length == 0) {
                // e.g. "/" - there is no file name to extract
                return null;
            }
            String fileName = chopped[chopped.length - 1];

            // Temp file named "covid19-<random><fileName>", removed on VM exit
            File tempFile = File.createTempFile("covid19-", fileName);
            tempFile.deleteOnExit();

            try (OutputStream out = new FileOutputStream(tempFile)) {
                byte[] buffer = new byte[1024];
                int len;
                while ((len = fileStream.read(buffer)) != -1) {
                    out.write(buffer, 0, len);
                }
            }

            // Cache only after the copy completed, so a partial file is never handed out
            String extractedPath = tempFile.getAbsolutePath();
            fileCache.put(jarFilePath, extractedPath);
            return extractedPath;

        } catch (IOException e) {
            // Documented contract: signal any extraction failure with null
            return null;
        }
    }

}
|
|
@ -1,75 +1,7 @@
|
||||||
/*
|
|
||||||
Copyright [2020] [bvn13]
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package com.bvn13.covid19.scheduler.updater.stopcoronovirusrf;
|
package com.bvn13.covid19.scheduler.updater.stopcoronovirusrf;
|
||||||
|
|
||||||
import com.bvn13.covid19.scheduler.updater.stopcoronovirusrf.model.RowData;
|
|
||||||
import org.apache.camel.Exchange;
|
import org.apache.camel.Exchange;
|
||||||
import org.apache.camel.Handler;
|
|
||||||
import org.jsoup.Jsoup;
|
|
||||||
import org.jsoup.nodes.Document;
|
|
||||||
import org.jsoup.nodes.Element;
|
|
||||||
import org.jsoup.select.Elements;
|
|
||||||
import org.springframework.beans.factory.annotation.Value;
|
|
||||||
import org.springframework.stereotype.Component;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@Component
|
|
||||||
public class StopcoronovirusRfDataRetriever {
|
|
||||||
|
|
||||||
//private static final String URL = "https://стопкоронавирус.рф/";
|
|
||||||
private static final String HOST = "xn--80aesfpebagmfblc0a.xn--p1ai";
|
|
||||||
private static final String URL = "https://xn--80aesfpebagmfblc0a.xn--p1ai/";
|
|
||||||
|
|
||||||
@Value("${app.user-agent}")
|
|
||||||
private String userAgent;
|
|
||||||
|
|
||||||
@Handler
|
|
||||||
public void retrieveData(Exchange exchange) throws Exception {
|
|
||||||
Document doc = Jsoup.connect(URL)
|
|
||||||
.userAgent(userAgent)
|
|
||||||
.timeout(30*1000)
|
|
||||||
//.referrer("http://google.com")
|
|
||||||
// .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
|
|
||||||
// .header("Accept-Encoding", "gzip, deflate, br")
|
|
||||||
// .header("Accept-Language", "ru-RU,ru;q=0.5")
|
|
||||||
// .header("Cache-Control", "no-cache")
|
|
||||||
// .header("Connection", "keep-alive")
|
|
||||||
// .header("Pragma", "no-cache")
|
|
||||||
// .header("Host", HOST)
|
|
||||||
.get();
|
|
||||||
|
|
||||||
Elements tableData = doc.select("div.d-map__list > table > tbody > tr");
|
|
||||||
|
|
||||||
List<RowData> rows = new ArrayList<>(tableData.size());
|
|
||||||
|
|
||||||
for (Element row : tableData) {
|
|
||||||
rows.add(RowData.builder()
|
|
||||||
.region(row.selectFirst("th").text())
|
|
||||||
.sick(row.selectFirst("td > span.d-map__indicator_sick").parent().ownText())
|
|
||||||
.healed(row.selectFirst("td > span.d-map__indicator_healed").parent().ownText())
|
|
||||||
.died(row.selectFirst("td > span.d-map__indicator_die").parent().ownText())
|
|
||||||
.build());
|
|
||||||
}
|
|
||||||
|
|
||||||
exchange.getIn().setHeader(StopcoronovirusRfUpdater.HEADER_DATE_OF_DATA, doc.selectFirst(".d-map__title span").ownText());
|
|
||||||
exchange.getIn().setBody(rows);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
/**
 * Contract for components that fetch the per-region COVID-19 table and publish it
 * on a Camel {@link Exchange}.
 */
public interface StopcoronovirusRfDataRetriever {
|
||||||
|
/**
 * Retrieves the data and stores it on the given exchange.
 *
 * <p>The implementations visible alongside this interface set the
 * {@code StopcoronovirusRfUpdater.HEADER_DATE_OF_DATA} header and replace the
 * in-message body with a list of {@code RowData}.
 *
 * @param exchange the Camel exchange to populate
 * @throws Exception if retrieval or parsing fails
 */
void retrieveData(Exchange exchange) throws Exception;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,74 @@
|
||||||
|
/*
|
||||||
|
Copyright [2020] [bvn13]
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.bvn13.covid19.scheduler.updater.stopcoronovirusrf;
|
||||||
|
|
||||||
|
import com.bvn13.covid19.scheduler.updater.stopcoronovirusrf.model.RowData;
|
||||||
|
import org.apache.camel.Exchange;
|
||||||
|
import org.apache.camel.Handler;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
|
import org.springframework.beans.factory.annotation.Value;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class StopcoronovirusRfJsoupDataRetriever implements StopcoronovirusRfDataRetriever {
|
||||||
|
|
||||||
|
//private static final String URL = "https://стопкоронавирус.рф/";
|
||||||
|
private static final String HOST = "xn--80aesfpebagmfblc0a.xn--p1ai";
|
||||||
|
private static final String URL = "https://xn--80aesfpebagmfblc0a.xn--p1ai/";
|
||||||
|
|
||||||
|
@Value("${app.user-agent}")
|
||||||
|
private String userAgent;
|
||||||
|
|
||||||
|
@Handler
|
||||||
|
public void retrieveData(Exchange exchange) throws Exception {
|
||||||
|
Document doc = Jsoup.connect(URL)
|
||||||
|
.userAgent(userAgent)
|
||||||
|
.timeout(30*1000)
|
||||||
|
//.referrer("http://google.com")
|
||||||
|
// .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
|
||||||
|
// .header("Accept-Encoding", "gzip, deflate, br")
|
||||||
|
// .header("Accept-Language", "ru-RU,ru;q=0.5")
|
||||||
|
// .header("Cache-Control", "no-cache")
|
||||||
|
// .header("Connection", "keep-alive")
|
||||||
|
// .header("Pragma", "no-cache")
|
||||||
|
// .header("Host", HOST)
|
||||||
|
.get();
|
||||||
|
|
||||||
|
Elements tableData = doc.select("div.d-map__list > table > tbody > tr");
|
||||||
|
|
||||||
|
List<RowData> rows = new ArrayList<>(tableData.size());
|
||||||
|
|
||||||
|
for (Element row : tableData) {
|
||||||
|
rows.add(RowData.builder()
|
||||||
|
.region(row.selectFirst("th").text())
|
||||||
|
.sick(row.selectFirst("td > span.d-map__indicator_sick").parent().ownText())
|
||||||
|
.healed(row.selectFirst("td > span.d-map__indicator_healed").parent().ownText())
|
||||||
|
.died(row.selectFirst("td > span.d-map__indicator_die").parent().ownText())
|
||||||
|
.build());
|
||||||
|
}
|
||||||
|
|
||||||
|
exchange.getIn().setHeader(StopcoronovirusRfUpdater.HEADER_DATE_OF_DATA, doc.selectFirst(".d-map__title span").ownText());
|
||||||
|
exchange.getIn().setBody(rows);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,91 @@
|
||||||
|
/*
|
||||||
|
Copyright [2020] [bvn13]
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.bvn13.covid19.scheduler.updater.stopcoronovirusrf;
|
||||||
|
|
||||||
|
import com.bvn13.covid19.scheduler.updater.stopcoronovirusrf.model.RowData;
|
||||||
|
import org.apache.camel.Exchange;
|
||||||
|
import org.apache.camel.Handler;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.openqa.selenium.By;
|
||||||
|
import org.openqa.selenium.JavascriptExecutor;
|
||||||
|
import org.openqa.selenium.WebDriver;
|
||||||
|
import org.openqa.selenium.WebElement;
|
||||||
|
import org.openqa.selenium.chrome.ChromeDriver;
|
||||||
|
import org.openqa.selenium.firefox.FirefoxBinary;
|
||||||
|
import org.openqa.selenium.firefox.FirefoxDriver;
|
||||||
|
import org.openqa.selenium.firefox.FirefoxOptions;
|
||||||
|
import org.springframework.beans.factory.annotation.Value;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.util.Assert;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
public class StopcoronovirusRfSeleniumDataRetriever implements StopcoronovirusRfDataRetriever {
|
||||||
|
|
||||||
|
//private static final String URL = "https://стопкоронавирус.рф/";
|
||||||
|
private static final String HOST = "xn--80aesfpebagmfblc0a.xn--p1ai";
|
||||||
|
private static final String URL = "https://xn--80aesfpebagmfblc0a.xn--p1ai/information/";
|
||||||
|
|
||||||
|
@Value("${app.user-agent}")
|
||||||
|
private String userAgent;
|
||||||
|
|
||||||
|
@Handler
|
||||||
|
public void retrieveData(Exchange exchange) throws Exception {
|
||||||
|
FirefoxBinary firefoxBinary = new FirefoxBinary();
|
||||||
|
FirefoxOptions options = new FirefoxOptions();
|
||||||
|
options.setBinary(firefoxBinary);
|
||||||
|
options.setHeadless(true); // <-- headless set here
|
||||||
|
WebDriver driver = new FirefoxDriver(options);
|
||||||
|
try {
|
||||||
|
driver.manage().window().maximize();
|
||||||
|
driver.get(URL);
|
||||||
|
List<WebElement> tableData = driver.findElements(By.cssSelector(".d-map__list > table > tbody > tr"));
|
||||||
|
|
||||||
|
if (tableData.size() <= 0) {
|
||||||
|
throw new IllegalStateException("Data not found!");
|
||||||
|
}
|
||||||
|
|
||||||
|
WebElement lastRow = driver.findElement(By.cssSelector(".d-map__list > table > tbody > tr:last-child"));
|
||||||
|
|
||||||
|
JavascriptExecutor js = (JavascriptExecutor) driver;
|
||||||
|
js.executeScript("window.scrollBy(0,10000)");
|
||||||
|
js.executeScript("arguments[0].scrollIntoView()",lastRow);
|
||||||
|
// js.executeScript("$(\".d-map__list > table > tbody\").animate({ scrollTop: \"10000px\" });");
|
||||||
|
|
||||||
|
List<RowData> rows = new ArrayList<>(tableData.size());
|
||||||
|
|
||||||
|
for (WebElement row : tableData) {
|
||||||
|
RowData rowData = RowData.builder()
|
||||||
|
.region(row.findElement(By.cssSelector("th")).getAttribute("innerText"))
|
||||||
|
.sick(row.findElement(By.cssSelector("td.col-sick")).getAttribute("innerText"))
|
||||||
|
.healed(row.findElement(By.cssSelector("td.col-healed")).getAttribute("innerText"))
|
||||||
|
.died(row.findElement(By.cssSelector("td.col-died")).getAttribute("innerText"))
|
||||||
|
.build();
|
||||||
|
Assert.isTrue(StringUtils.isNotBlank(rowData.getRegion()), "Broken data found after " + rows.size() + " rows");
|
||||||
|
}
|
||||||
|
|
||||||
|
exchange.getIn().setHeader(StopcoronovirusRfUpdater.HEADER_DATE_OF_DATA, driver.findElement(By.cssSelector(".cv-section__title small")).getText());
|
||||||
|
exchange.getIn().setBody(rows);
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
driver.quit();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -44,6 +44,11 @@ spring:
|
||||||
jdbc:
|
jdbc:
|
||||||
lob:
|
lob:
|
||||||
non_contextual_creation: true
|
non_contextual_creation: true
|
||||||
|
|
||||||
|
webdriver:
|
||||||
|
gecko:
|
||||||
|
driver: /home/bvn13/soft/geckodriver
|
||||||
|
|
||||||
logging:
|
logging:
|
||||||
level:
|
level:
|
||||||
root: info
|
root: info
|
||||||
|
|
Binary file not shown.
Loading…
Reference in New Issue