mirror of https://github.com/bvn13/JIrcBot.git
129 lines
4.6 KiB
Java
129 lines
4.6 KiB
Java
package ru.bvn13.jircbot.utilities;
|
|
|
|
import java.io.BufferedReader;
|
|
import java.io.IOException;
|
|
import java.io.InputStream;
|
|
import java.io.InputStreamReader;
|
|
import java.net.MalformedURLException;
|
|
import java.net.URL;
|
|
import java.net.URLConnection;
|
|
import java.nio.charset.Charset;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
/**
 * Utility that downloads the beginning of a web page and extracts the text of
 * its {@code <title>} element.
 *
 * @author bvn13
 * @since 05.10.2019
 */
public class WebTitleExtractor {

    /*
     * CASE_INSENSITIVE accounts for sites that use uppercase title tags.
     * DOTALL accounts for sites that have line feeds in the title text.
     * The capture group is reluctant so that matching stops at the FIRST
     * closing tag; a greedy group would swallow everything up to the last
     * "</title>" in the document (inline SVG icons carry their own titles).
     */
    private static final Pattern TITLE_TAG =
            Pattern.compile("<title(?:\\s[^>]*)?>(.*?)</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Upper bound on the number of characters read from the response body. */
    private static final int MAX_READ_CHARS = 10 * 1024;

    /** Maximum time to wait for the connection to be established. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10_000;

    /** Maximum time to wait for data once connected. */
    private static final int READ_TIMEOUT_MILLIS = 10_000;

    /**
     * Fetches the resource at the given URL and returns the text of its title tag.
     * Only the first {@link #MAX_READ_CHARS} characters of the body are inspected.
     *
     * @param url the HTML page
     * @return title text with whitespace runs collapsed (null if the document
     *         isn't HTML or lacks a title tag within the inspected prefix)
     * @throws IOException if the URL is malformed, the connection fails or times out,
     *                     or the body cannot be read
     */
    public static String getPageTitle(String url) throws IOException {
        URLConnection conn = new URL(url).openConnection();
        // without timeouts an unresponsive server would block the caller forever
        conn.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        conn.setReadTimeout(READ_TIMEOUT_MILLIS);

        // ContentType is a nested class defined below
        ContentType contentType = getContentTypeHeader(conn);
        if (contentType != null && !"text/html".equalsIgnoreCase(contentType.contentType)) {
            // don't continue if not HTML; release the connection, since the body is never read
            if (conn instanceof java.net.HttpURLConnection) {
                ((java.net.HttpURLConnection) conn).disconnect();
            }
            return null;
        }

        // determine the charset, or use the default
        Charset charset = getCharset(contentType);
        if (charset == null) {
            charset = Charset.defaultCharset();
        }

        // read the response body, using BufferedReader for performance
        try (InputStream in = conn.getInputStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset))) {

            char[] buf = new char[1024];
            StringBuilder content = new StringBuilder();
            int remaining = MAX_READ_CHARS;

            // read until EOF or until MAX_READ_CHARS characters have been collected
            while (remaining > 0) {
                int n = reader.read(buf, 0, Math.min(buf.length, remaining));
                if (n == -1) {
                    break;
                }
                content.append(buf, 0, n);
                remaining -= n;
            }

            // extract the title
            Matcher matcher = TITLE_TAG.matcher(content);
            if (!matcher.find()) {
                return null;
            }
            /* replace any occurrences of whitespace (which may
             * include line feeds and other uglies) as well
             * as HTML brackets with a space */
            return matcher.group(1).replaceAll("[\\s\\<>]+", " ").trim();
        }
    }

    /**
     * Loops through response headers until Content-Type is found.
     * The header name is compared case-insensitively because HTTP field
     * names are case-insensitive and servers do send {@code content-type}.
     *
     * @param conn the connection whose response headers are inspected
     * @return ContentType object representing the value of
     *         the Content-Type header, or null if the header is absent
     */
    private static ContentType getContentTypeHeader(URLConnection conn) {
        for (int i = 0; ; i++) {
            String headerName = conn.getHeaderFieldKey(i);
            String headerValue = conn.getHeaderField(i);
            if (headerName == null && headerValue == null) {
                // both null means there are no more headers
                return null;
            }
            if (headerValue != null && "Content-Type".equalsIgnoreCase(headerName)) {
                return new ContentType(headerValue);
            }
        }
    }

    /**
     * Resolves the charset named in the Content-Type header.
     *
     * @param contentType parsed header, may be null
     * @return the charset, or null if none was declared or it is unknown to this JVM
     */
    private static Charset getCharset(ContentType contentType) {
        if (contentType == null || contentType.charsetName == null) {
            return null;
        }
        try {
            return Charset.isSupported(contentType.charsetName)
                    ? Charset.forName(contentType.charsetName)
                    : null;
        } catch (IllegalArgumentException e) {
            // syntactically illegal charset name sent by the server: treat as undeclared
            return null;
        }
    }

    /**
     * Class holds the content type and charset (if present)
     */
    private static final class ContentType {
        // tolerates whitespace around '=' and a quoted value, e.g. charset="utf-8"
        private static final Pattern CHARSET_HEADER =
                Pattern.compile("charset\\s*=\\s*\"?([-_a-zA-Z0-9.:+]+)", Pattern.CASE_INSENSITIVE);

        /** Media type without parameters, e.g. {@code text/html}. */
        private final String contentType;
        /** Declared charset name, or null if the header has no charset parameter. */
        private final String charsetName;

        private ContentType(String headerValue) {
            if (headerValue == null) {
                throw new IllegalArgumentException("ContentType must be constructed with a not-null headerValue");
            }
            int n = headerValue.indexOf(';');
            String declaredCharset = null;
            if (n != -1) {
                // trimmed so that "text/html ; charset=..." still reads as text/html
                contentType = headerValue.substring(0, n).trim();
                Matcher matcher = CHARSET_HEADER.matcher(headerValue);
                if (matcher.find()) {
                    declaredCharset = matcher.group(1);
                }
            } else {
                contentType = headerValue.trim();
            }
            charsetName = declaredCharset;
        }
    }
}
|