mirror of https://github.com/bvn13/covid19-ru.git
COVID-19 data retrieval changed from the JSoup engine to the Selenium (Firefox geckodriver) engine.
parent
3e377aa63d
commit
a3482987ed
|
@ -35,7 +35,7 @@ subprojects {
|
|||
}
|
||||
|
||||
group = 'com.bvn13.covid19'
|
||||
version = '0.1.1'
|
||||
version = '0.1.2'
|
||||
|
||||
apply plugin: 'java'
|
||||
apply plugin: 'idea'
|
||||
|
|
2
buildit
2
buildit
|
@ -1,6 +1,6 @@
|
|||
#!/bin/bash
|
||||
|
||||
version=0.1.1
|
||||
version=0.1.2
|
||||
|
||||
./gradlew :covid19-db-migrator:clean :covid19-db-migrator:assemble
|
||||
./gradlew :covid19-site:clean :covid19-site:assemble
|
||||
|
|
|
@ -7,6 +7,12 @@ dependencies {
|
|||
implementation "org.apache.camel.springboot:camel-spring-boot-starter:${camelVersion}"
|
||||
implementation 'org.springframework.boot:spring-boot-starter-data-jpa'
|
||||
|
||||
// https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-java
|
||||
implementation 'org.seleniumhq.selenium:selenium-java:3.141.59'
|
||||
// https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-firefox-driver
|
||||
implementation 'org.seleniumhq.selenium:selenium-firefox-driver:3.141.59'
|
||||
|
||||
|
||||
compileOnly "org.springframework.boot:spring-boot-configuration-processor:${springBootVersion}"
|
||||
annotationProcessor "org.springframework.boot:spring-boot-configuration-processor:${springBootVersion}"
|
||||
|
||||
|
|
|
@ -23,6 +23,10 @@ import org.springframework.boot.autoconfigure.SpringBootApplication;
|
|||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.context.annotation.Import;
|
||||
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
||||
import org.springframework.util.Assert;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Files;
|
||||
|
||||
@EnableJpaRepositories("com.bvn13.covid19.scheduler")
|
||||
@SpringBootApplication
|
||||
|
@ -33,6 +37,17 @@ import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
|||
public class Covid19SchedulerApplication {
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
||||
String geckoFilename = ResourceManager.extract("/files/geckodriver");
|
||||
Assert.notNull(geckoFilename, "geckodriver not found inside JAR");
|
||||
|
||||
File file = new File(geckoFilename);
|
||||
if (!file.canExecute()) {
|
||||
Assert.isTrue(file.setExecutable(true, true), "Could not make executable: "+geckoFilename);
|
||||
}
|
||||
|
||||
System.setProperty("webdriver.gecko.driver", geckoFilename);
|
||||
|
||||
SpringApplication.run(Covid19SchedulerApplication.class, args);
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
package com.bvn13.covid19.scheduler;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Hashtable;
|
||||
|
||||
/**
|
||||
* @see 'https://coderwall.com/p/d0kssw/resource-extraction-from-jar-to-local-file-system'
|
||||
*/
|
||||
public class ResourceManager {
|
||||
|
||||
// Stores paths to files with the global jarFilePath as the key
|
||||
private static Hashtable<String, String> fileCache = new Hashtable<String, String>();
|
||||
|
||||
/**
|
||||
* Extract the specified resource from inside the jar to the local file system.
|
||||
* @param jarFilePath absolute path to the resource
|
||||
* @return full file system path if file successfully extracted, else null on error
|
||||
*/
|
||||
public static String extract(String jarFilePath){
|
||||
|
||||
if(jarFilePath == null)
|
||||
return null;
|
||||
|
||||
// See if we already have the file
|
||||
if(fileCache.contains(jarFilePath))
|
||||
return fileCache.get(jarFilePath);
|
||||
|
||||
// Alright, we don't have the file, let's extract it
|
||||
try {
|
||||
// Read the file we're looking for
|
||||
InputStream fileStream = ResourceManager.class.getResourceAsStream(jarFilePath);
|
||||
|
||||
// Was the resource found?
|
||||
if(fileStream == null)
|
||||
return null;
|
||||
|
||||
// Grab the file name
|
||||
String[] chopped = jarFilePath.split("\\/");
|
||||
String fileName = chopped[chopped.length-1];
|
||||
|
||||
// Create our temp file (first param is just random bits)
|
||||
File tempFile = File.createTempFile("covid19-", fileName);
|
||||
|
||||
// Set this file to be deleted on VM exit
|
||||
tempFile.deleteOnExit();
|
||||
|
||||
// Create an output stream to barf to the temp file
|
||||
OutputStream out = new FileOutputStream(tempFile);
|
||||
|
||||
// Write the file to the temp file
|
||||
byte[] buffer = new byte[1024];
|
||||
int len = fileStream.read(buffer);
|
||||
while (len != -1) {
|
||||
out.write(buffer, 0, len);
|
||||
len = fileStream.read(buffer);
|
||||
}
|
||||
|
||||
// Store this file in the cache list
|
||||
fileCache.put(jarFilePath, tempFile.getAbsolutePath());
|
||||
|
||||
// Close the streams
|
||||
fileStream.close();
|
||||
out.close();
|
||||
|
||||
// Return the path of this sweet new file
|
||||
return tempFile.getAbsolutePath();
|
||||
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,75 +1,7 @@
|
|||
/*
|
||||
Copyright [2020] [bvn13]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package com.bvn13.covid19.scheduler.updater.stopcoronovirusrf;
|
||||
|
||||
import com.bvn13.covid19.scheduler.updater.stopcoronovirusrf.model.RowData;
|
||||
import org.apache.camel.Exchange;
|
||||
import org.apache.camel.Handler;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Component
|
||||
public class StopcoronovirusRfDataRetriever {
|
||||
|
||||
//private static final String URL = "https://стопкоронавирус.рф/";
|
||||
private static final String HOST = "xn--80aesfpebagmfblc0a.xn--p1ai";
|
||||
private static final String URL = "https://xn--80aesfpebagmfblc0a.xn--p1ai/";
|
||||
|
||||
@Value("${app.user-agent}")
|
||||
private String userAgent;
|
||||
|
||||
@Handler
|
||||
public void retrieveData(Exchange exchange) throws Exception {
|
||||
Document doc = Jsoup.connect(URL)
|
||||
.userAgent(userAgent)
|
||||
.timeout(30*1000)
|
||||
//.referrer("http://google.com")
|
||||
// .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
|
||||
// .header("Accept-Encoding", "gzip, deflate, br")
|
||||
// .header("Accept-Language", "ru-RU,ru;q=0.5")
|
||||
// .header("Cache-Control", "no-cache")
|
||||
// .header("Connection", "keep-alive")
|
||||
// .header("Pragma", "no-cache")
|
||||
// .header("Host", HOST)
|
||||
.get();
|
||||
|
||||
Elements tableData = doc.select("div.d-map__list > table > tbody > tr");
|
||||
|
||||
List<RowData> rows = new ArrayList<>(tableData.size());
|
||||
|
||||
for (Element row : tableData) {
|
||||
rows.add(RowData.builder()
|
||||
.region(row.selectFirst("th").text())
|
||||
.sick(row.selectFirst("td > span.d-map__indicator_sick").parent().ownText())
|
||||
.healed(row.selectFirst("td > span.d-map__indicator_healed").parent().ownText())
|
||||
.died(row.selectFirst("td > span.d-map__indicator_die").parent().ownText())
|
||||
.build());
|
||||
}
|
||||
|
||||
exchange.getIn().setHeader(StopcoronovirusRfUpdater.HEADER_DATE_OF_DATA, doc.selectFirst(".d-map__title span").ownText());
|
||||
exchange.getIn().setBody(rows);
|
||||
|
||||
}
|
||||
|
||||
public interface StopcoronovirusRfDataRetriever {
|
||||
void retrieveData(Exchange exchange) throws Exception;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,74 @@
|
|||
/*
|
||||
Copyright [2020] [bvn13]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package com.bvn13.covid19.scheduler.updater.stopcoronovirusrf;
|
||||
|
||||
import com.bvn13.covid19.scheduler.updater.stopcoronovirusrf.model.RowData;
|
||||
import org.apache.camel.Exchange;
|
||||
import org.apache.camel.Handler;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class StopcoronovirusRfJsoupDataRetriever implements StopcoronovirusRfDataRetriever {
|
||||
|
||||
//private static final String URL = "https://стопкоронавирус.рф/";
|
||||
private static final String HOST = "xn--80aesfpebagmfblc0a.xn--p1ai";
|
||||
private static final String URL = "https://xn--80aesfpebagmfblc0a.xn--p1ai/";
|
||||
|
||||
@Value("${app.user-agent}")
|
||||
private String userAgent;
|
||||
|
||||
@Handler
|
||||
public void retrieveData(Exchange exchange) throws Exception {
|
||||
Document doc = Jsoup.connect(URL)
|
||||
.userAgent(userAgent)
|
||||
.timeout(30*1000)
|
||||
//.referrer("http://google.com")
|
||||
// .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
|
||||
// .header("Accept-Encoding", "gzip, deflate, br")
|
||||
// .header("Accept-Language", "ru-RU,ru;q=0.5")
|
||||
// .header("Cache-Control", "no-cache")
|
||||
// .header("Connection", "keep-alive")
|
||||
// .header("Pragma", "no-cache")
|
||||
// .header("Host", HOST)
|
||||
.get();
|
||||
|
||||
Elements tableData = doc.select("div.d-map__list > table > tbody > tr");
|
||||
|
||||
List<RowData> rows = new ArrayList<>(tableData.size());
|
||||
|
||||
for (Element row : tableData) {
|
||||
rows.add(RowData.builder()
|
||||
.region(row.selectFirst("th").text())
|
||||
.sick(row.selectFirst("td > span.d-map__indicator_sick").parent().ownText())
|
||||
.healed(row.selectFirst("td > span.d-map__indicator_healed").parent().ownText())
|
||||
.died(row.selectFirst("td > span.d-map__indicator_die").parent().ownText())
|
||||
.build());
|
||||
}
|
||||
|
||||
exchange.getIn().setHeader(StopcoronovirusRfUpdater.HEADER_DATE_OF_DATA, doc.selectFirst(".d-map__title span").ownText());
|
||||
exchange.getIn().setBody(rows);
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
/*
|
||||
Copyright [2020] [bvn13]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package com.bvn13.covid19.scheduler.updater.stopcoronovirusrf;
|
||||
|
||||
import com.bvn13.covid19.scheduler.updater.stopcoronovirusrf.model.RowData;
|
||||
import org.apache.camel.Exchange;
|
||||
import org.apache.camel.Handler;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.openqa.selenium.By;
|
||||
import org.openqa.selenium.JavascriptExecutor;
|
||||
import org.openqa.selenium.WebDriver;
|
||||
import org.openqa.selenium.WebElement;
|
||||
import org.openqa.selenium.chrome.ChromeDriver;
|
||||
import org.openqa.selenium.firefox.FirefoxBinary;
|
||||
import org.openqa.selenium.firefox.FirefoxDriver;
|
||||
import org.openqa.selenium.firefox.FirefoxOptions;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.util.Assert;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Component
|
||||
public class StopcoronovirusRfSeleniumDataRetriever implements StopcoronovirusRfDataRetriever {
|
||||
|
||||
//private static final String URL = "https://стопкоронавирус.рф/";
|
||||
private static final String HOST = "xn--80aesfpebagmfblc0a.xn--p1ai";
|
||||
private static final String URL = "https://xn--80aesfpebagmfblc0a.xn--p1ai/information/";
|
||||
|
||||
@Value("${app.user-agent}")
|
||||
private String userAgent;
|
||||
|
||||
@Handler
|
||||
public void retrieveData(Exchange exchange) throws Exception {
|
||||
FirefoxBinary firefoxBinary = new FirefoxBinary();
|
||||
FirefoxOptions options = new FirefoxOptions();
|
||||
options.setBinary(firefoxBinary);
|
||||
options.setHeadless(true); // <-- headless set here
|
||||
WebDriver driver = new FirefoxDriver(options);
|
||||
try {
|
||||
driver.manage().window().maximize();
|
||||
driver.get(URL);
|
||||
List<WebElement> tableData = driver.findElements(By.cssSelector(".d-map__list > table > tbody > tr"));
|
||||
|
||||
if (tableData.size() <= 0) {
|
||||
throw new IllegalStateException("Data not found!");
|
||||
}
|
||||
|
||||
WebElement lastRow = driver.findElement(By.cssSelector(".d-map__list > table > tbody > tr:last-child"));
|
||||
|
||||
JavascriptExecutor js = (JavascriptExecutor) driver;
|
||||
js.executeScript("window.scrollBy(0,10000)");
|
||||
js.executeScript("arguments[0].scrollIntoView()",lastRow);
|
||||
// js.executeScript("$(\".d-map__list > table > tbody\").animate({ scrollTop: \"10000px\" });");
|
||||
|
||||
List<RowData> rows = new ArrayList<>(tableData.size());
|
||||
|
||||
for (WebElement row : tableData) {
|
||||
RowData rowData = RowData.builder()
|
||||
.region(row.findElement(By.cssSelector("th")).getAttribute("innerText"))
|
||||
.sick(row.findElement(By.cssSelector("td.col-sick")).getAttribute("innerText"))
|
||||
.healed(row.findElement(By.cssSelector("td.col-healed")).getAttribute("innerText"))
|
||||
.died(row.findElement(By.cssSelector("td.col-died")).getAttribute("innerText"))
|
||||
.build();
|
||||
Assert.isTrue(StringUtils.isNotBlank(rowData.getRegion()), "Broken data found after " + rows.size() + " rows");
|
||||
}
|
||||
|
||||
exchange.getIn().setHeader(StopcoronovirusRfUpdater.HEADER_DATE_OF_DATA, driver.findElement(By.cssSelector(".cv-section__title small")).getText());
|
||||
exchange.getIn().setBody(rows);
|
||||
|
||||
} finally {
|
||||
driver.quit();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -44,6 +44,11 @@ spring:
|
|||
jdbc:
|
||||
lob:
|
||||
non_contextual_creation: true
|
||||
|
||||
webdriver:
|
||||
gecko:
|
||||
driver: /home/bvn13/soft/geckodriver
|
||||
|
||||
logging:
|
||||
level:
|
||||
root: info
|
||||
|
|
Binary file not shown.
Loading…
Reference in New Issue