covid19 data retrieval changed from JSoup to the Selenium (Firefox geckodriver) engine.

develop
bvn13 2020-05-04 14:02:54 +03:00
parent 3e377aa63d
commit a3482987ed
10 changed files with 268 additions and 72 deletions

View File

@ -35,7 +35,7 @@ subprojects {
}
group = 'com.bvn13.covid19'
version = '0.1.1'
version = '0.1.2'
apply plugin: 'java'
apply plugin: 'idea'

View File

@ -1,6 +1,6 @@
#!/bin/bash
version=0.1.1
version=0.1.2
./gradlew :covid19-db-migrator:clean :covid19-db-migrator:assemble
./gradlew :covid19-site:clean :covid19-site:assemble

View File

@ -7,6 +7,12 @@ dependencies {
implementation "org.apache.camel.springboot:camel-spring-boot-starter:${camelVersion}"
implementation 'org.springframework.boot:spring-boot-starter-data-jpa'
// https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-java
implementation 'org.seleniumhq.selenium:selenium-java:3.141.59'
// https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-firefox-driver
implementation 'org.seleniumhq.selenium:selenium-firefox-driver:3.141.59'
compileOnly "org.springframework.boot:spring-boot-configuration-processor:${springBootVersion}"
annotationProcessor "org.springframework.boot:spring-boot-configuration-processor:${springBootVersion}"

View File

@ -23,6 +23,10 @@ import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.Import;
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
import org.springframework.util.Assert;
import java.io.File;
import java.nio.file.Files;
@EnableJpaRepositories("com.bvn13.covid19.scheduler")
@SpringBootApplication
@ -33,6 +37,17 @@ import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
public class Covid19SchedulerApplication {
/**
 * Application entry point.
 *
 * Extracts the bundled {@code /files/geckodriver} resource to the local file system,
 * ensures the extracted file is executable, publishes its path through the
 * {@code webdriver.gecko.driver} system property and then boots the Spring application.
 *
 * @param args command line arguments forwarded to {@link SpringApplication#run}
 */
public static void main(String[] args) {
    // Unpack the driver binary shipped with the application
    final String driverPath = ResourceManager.extract("/files/geckodriver");
    Assert.notNull(driverPath, "geckodriver not found inside JAR");

    // A freshly extracted file may lack the executable bit; set it for the owner only
    final File driverFile = new File(driverPath);
    if (!driverFile.canExecute()) {
        final boolean madeExecutable = driverFile.setExecutable(true, true);
        Assert.isTrue(madeExecutable, "Could not make executable: " + driverPath);
    }

    // Must be set before the Spring context (and any driver instance) is created
    System.setProperty("webdriver.gecko.driver", driverPath);

    SpringApplication.run(Covid19SchedulerApplication.class, args);
}

View File

@ -0,0 +1,73 @@
package com.bvn13.covid19.scheduler;
import java.io.*;
import java.util.Hashtable;
/**
* @see 'https://coderwall.com/p/d0kssw/resource-extraction-from-jar-to-local-file-system'
*/
/**
 * Copies resources bundled on the classpath (e.g. inside the application JAR) into
 * temporary files so that they can be handed to code that needs a real file system path.
 * Each resource is extracted at most once per JVM; later requests return the cached path.
 *
 * @see 'https://coderwall.com/p/d0kssw/resource-extraction-from-jar-to-local-file-system'
 */
public class ResourceManager {

    // Maps the in-JAR resource path (key) to the absolute path of its extracted temp file (value)
    private static final Hashtable<String, String> fileCache = new Hashtable<String, String>();

    /**
     * Extract the specified resource from inside the jar to the local file system.
     *
     * @param jarFilePath absolute path to the resource
     * @return full file system path if file successfully extracted, else null on error
     *         (null argument, resource not found, or an I/O failure while copying)
     */
    public static String extract(String jarFilePath) {
        if (jarFilePath == null) {
            return null;
        }

        // Look up by KEY. Hashtable.contains() tests values, so using it here would
        // never find a previously extracted resource and would re-extract on every call.
        String cachedPath = fileCache.get(jarFilePath);
        if (cachedPath != null) {
            return cachedPath;
        }

        // Not extracted yet: copy the resource into a temp file.
        // try-with-resources tolerates a null resource and closes the stream on every path.
        try (InputStream fileStream = ResourceManager.class.getResourceAsStream(jarFilePath)) {
            // Was the resource found?
            if (fileStream == null) {
                return null;
            }

            // Last path segment is used as the temp file suffix
            String fileName = jarFilePath.substring(jarFilePath.lastIndexOf('/') + 1);

            File tempFile = File.createTempFile("covid19-", fileName);
            // Remove the extracted copy when the VM exits
            tempFile.deleteOnExit();

            try (OutputStream out = new FileOutputStream(tempFile)) {
                byte[] buffer = new byte[1024];
                int len;
                while ((len = fileStream.read(buffer)) != -1) {
                    out.write(buffer, 0, len);
                }
            }

            // Cache only after the file has been completely written and closed
            String extractedPath = tempFile.getAbsolutePath();
            fileCache.put(jarFilePath, extractedPath);
            return extractedPath;
        } catch (IOException e) {
            // Contract of this method: signal any extraction failure with null
            return null;
        }
    }
}

View File

@ -1,75 +1,7 @@
/*
Copyright [2020] [bvn13]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.bvn13.covid19.scheduler.updater.stopcoronovirusrf;
import com.bvn13.covid19.scheduler.updater.stopcoronovirusrf.model.RowData;
import org.apache.camel.Exchange;
import org.apache.camel.Handler;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.List;
@Component
public class StopcoronovirusRfDataRetriever {
//private static final String URL = "https://стопкоронавирус.рф/";
private static final String HOST = "xn--80aesfpebagmfblc0a.xn--p1ai";
private static final String URL = "https://xn--80aesfpebagmfblc0a.xn--p1ai/";
@Value("${app.user-agent}")
private String userAgent;
/**
 * Downloads the page at {@code URL} with Jsoup and converts every row of the
 * {@code div.d-map__list} table into a {@link RowData}.
 * The list of rows becomes the message body; the text of {@code .d-map__title span}
 * is stored in the {@code HEADER_DATE_OF_DATA} header.
 *
 * @param exchange Camel exchange whose in-message receives the header and body
 * @throws Exception if the page cannot be fetched or an expected element is missing
 */
@Handler
public void retrieveData(Exchange exchange) throws Exception {
    final int timeoutMillis = 30 * 1000;
    final Document page = Jsoup.connect(URL)
            .userAgent(userAgent)
            .timeout(timeoutMillis)
            .get();

    final Elements tableRows = page.select("div.d-map__list > table > tbody > tr");
    final List<RowData> result = new ArrayList<>(tableRows.size());
    for (Element tableRow : tableRows) {
        // Each number is the own text of the cell that contains the indicator span
        final String region = tableRow.selectFirst("th").text();
        final String sick = tableRow.selectFirst("td > span.d-map__indicator_sick").parent().ownText();
        final String healed = tableRow.selectFirst("td > span.d-map__indicator_healed").parent().ownText();
        final String died = tableRow.selectFirst("td > span.d-map__indicator_die").parent().ownText();
        result.add(RowData.builder()
                .region(region)
                .sick(sick)
                .healed(healed)
                .died(died)
                .build());
    }

    final String dateOfData = page.selectFirst(".d-map__title span").ownText();
    exchange.getIn().setHeader(StopcoronovirusRfUpdater.HEADER_DATE_OF_DATA, dateOfData);
    exchange.getIn().setBody(result);
}
/**
 * Common contract for the data retrievers of this package.
 * The implementations visible alongside this interface (Jsoup- and Selenium-based)
 * put a list of {@code RowData} into the message body and the date of the data
 * into the {@code StopcoronovirusRfUpdater.HEADER_DATE_OF_DATA} header.
 */
public interface StopcoronovirusRfDataRetriever {
/**
 * Retrieves the data and stores the result on the given exchange.
 *
 * @param exchange Camel exchange that receives the retrieved data
 * @throws Exception if the data cannot be retrieved
 */
void retrieveData(Exchange exchange) throws Exception;
}

View File

@ -0,0 +1,74 @@
/*
Copyright [2020] [bvn13]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.bvn13.covid19.scheduler.updater.stopcoronovirusrf;
import com.bvn13.covid19.scheduler.updater.stopcoronovirusrf.model.RowData;
import org.apache.camel.Exchange;
import org.apache.camel.Handler;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.List;
/**
 * Jsoup-based implementation of {@link StopcoronovirusRfDataRetriever}: fetches the
 * page as static HTML and reads the regional table from it.
 */
public class StopcoronovirusRfJsoupDataRetriever implements StopcoronovirusRfDataRetriever {

    // Punycode form of https://стопкоронавирус.рф/
    private static final String HOST = "xn--80aesfpebagmfblc0a.xn--p1ai";
    private static final String URL = "https://xn--80aesfpebagmfblc0a.xn--p1ai/";

    @Value("${app.user-agent}")
    private String userAgent;

    /**
     * Downloads the page at {@code URL} and converts every row of the
     * {@code div.d-map__list} table into a {@link RowData}.
     * The rows become the message body; the text of {@code .d-map__title span}
     * is stored in the {@code HEADER_DATE_OF_DATA} header.
     *
     * @param exchange Camel exchange whose in-message receives the header and body
     * @throws Exception if the page cannot be fetched or an expected element is missing
     */
    @Handler
    public void retrieveData(Exchange exchange) throws Exception {
        Document page = Jsoup.connect(URL)
                .userAgent(userAgent)
                .timeout(30 * 1000)
                .get();

        Elements tableRows = page.select("div.d-map__list > table > tbody > tr");
        List<RowData> result = new ArrayList<>(tableRows.size());
        for (Element tableRow : tableRows) {
            RowData parsed = RowData.builder()
                    .region(tableRow.selectFirst("th").text())
                    .sick(indicatorValue(tableRow, "td > span.d-map__indicator_sick"))
                    .healed(indicatorValue(tableRow, "td > span.d-map__indicator_healed"))
                    .died(indicatorValue(tableRow, "td > span.d-map__indicator_die"))
                    .build();
            result.add(parsed);
        }

        String dateOfData = page.selectFirst(".d-map__title span").ownText();
        exchange.getIn().setHeader(StopcoronovirusRfUpdater.HEADER_DATE_OF_DATA, dateOfData);
        exchange.getIn().setBody(result);
    }

    /** Returns the own text of the cell that contains the indicator span matched by the selector. */
    private static String indicatorValue(Element tableRow, String indicatorSelector) {
        return tableRow.selectFirst(indicatorSelector).parent().ownText();
    }
}

View File

@ -0,0 +1,91 @@
/*
Copyright [2020] [bvn13]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.bvn13.covid19.scheduler.updater.stopcoronovirusrf;
import com.bvn13.covid19.scheduler.updater.stopcoronovirusrf.model.RowData;
import org.apache.camel.Exchange;
import org.apache.camel.Handler;
import org.apache.commons.lang3.StringUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxBinary;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxOptions;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import org.springframework.util.Assert;
import java.util.ArrayList;
import java.util.List;
/**
 * Selenium-based implementation of {@link StopcoronovirusRfDataRetriever}: opens the
 * page in a headless Firefox instance and reads the regional table from the live DOM.
 */
@Component
public class StopcoronovirusRfSeleniumDataRetriever implements StopcoronovirusRfDataRetriever {

    //private static final String URL = "https://стопкоронавирус.рф/";
    private static final String HOST = "xn--80aesfpebagmfblc0a.xn--p1ai";
    private static final String URL = "https://xn--80aesfpebagmfblc0a.xn--p1ai/information/";

    @Value("${app.user-agent}")
    private String userAgent;

    /**
     * Loads {@code URL} in headless Firefox and converts every row of the
     * {@code .d-map__list} table into a {@link RowData}.
     * The collected rows become the message body; the text of
     * {@code .cv-section__title small} is stored in the {@code HEADER_DATE_OF_DATA} header.
     * The browser is always shut down, whether retrieval succeeds or fails.
     *
     * @param exchange Camel exchange whose in-message receives the header and body
     * @throws IllegalStateException if the table contains no rows
     * @throws IllegalArgumentException if a row has a blank region name
     * @throws Exception on any browser / element lookup failure
     */
    @Handler
    public void retrieveData(Exchange exchange) throws Exception {

        FirefoxBinary firefoxBinary = new FirefoxBinary();
        FirefoxOptions options = new FirefoxOptions();
        options.setBinary(firefoxBinary);
        options.setHeadless(true); // no visible browser window

        WebDriver driver = new FirefoxDriver(options);

        try {
            driver.manage().window().maximize();
            driver.get(URL);

            List<WebElement> tableData = driver.findElements(By.cssSelector(".d-map__list > table > tbody > tr"));
            if (tableData.isEmpty()) {
                throw new IllegalStateException("Data not found!");
            }

            // Scroll the page down and bring the last row into view before reading the cells
            // NOTE(review): presumably needed so that all rows are rendered - confirm
            WebElement lastRow = driver.findElement(By.cssSelector(".d-map__list > table > tbody > tr:last-child"));
            JavascriptExecutor js = (JavascriptExecutor) driver;
            js.executeScript("window.scrollBy(0,10000)");
            js.executeScript("arguments[0].scrollIntoView()", lastRow);

            List<RowData> rows = new ArrayList<>(tableData.size());
            for (WebElement row : tableData) {
                RowData rowData = RowData.builder()
                        .region(row.findElement(By.cssSelector("th")).getAttribute("innerText"))
                        .sick(row.findElement(By.cssSelector("td.col-sick")).getAttribute("innerText"))
                        .healed(row.findElement(By.cssSelector("td.col-healed")).getAttribute("innerText"))
                        .died(row.findElement(By.cssSelector("td.col-died")).getAttribute("innerText"))
                        .build();
                // Validate first so the message reports how many good rows preceded the broken one
                Assert.isTrue(StringUtils.isNotBlank(rowData.getRegion()), "Broken data found after " + rows.size() + " rows");
                // Without this the body handed to the route would always be an empty list
                rows.add(rowData);
            }

            exchange.getIn().setHeader(StopcoronovirusRfUpdater.HEADER_DATE_OF_DATA, driver.findElement(By.cssSelector(".cv-section__title small")).getText());
            exchange.getIn().setBody(rows);

        } finally {
            // Always terminate the browser and the driver process
            driver.quit();
        }
    }
}

View File

@ -44,6 +44,11 @@ spring:
jdbc:
lob:
non_contextual_creation: true
webdriver:
gecko:
driver: /home/bvn13/soft/geckodriver
logging:
level:
root: info