#7 fixed, fixed title retrieving from URL

2019-10-05 17:21:41 +03:00 · 2019-10-05 17:21:41 +03:00 · 6cbf281698
parent adeaff4a01
commit 6cbf281698
9 changed files with 173 additions and 88 deletions
--- a/pom.xml
+++ b/pom.xml
@ -6,7 +6,7 @@

    <groupId>ru.bvn13</groupId>
    <artifactId>jircbot</artifactId>
-    <version>2.2.0</version>
+    <version>2.2.1</version>
    <packaging>jar</packaging>

    <parent>
@ -18,7 +18,7 @@


    <properties>
-        <bot.version>2.2.0</bot.version>
+        <bot.version>2.2.1</bot.version>

        <java.version>1.8</java.version>

--- a/src/main/java/ru/bvn13/jircbot/bot/JircBot.java
+++ b/src/main/java/ru/bvn13/jircbot/bot/JircBot.java
@ -14,6 +14,7 @@ import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Component;
 import ru.bvn13.jircbot.config.JircBotConfiguration;
+import ru.bvn13.jircbot.database.services.ChannelSettingsService;
 import ru.bvn13.jircbot.listeners.*;
 import ru.bvn13.jircbot.listeners.advices.AdviceListener;
 import ru.bvn13.jircbot.listeners.calculator.CalculatorListener;
@ -43,14 +44,16 @@ public class JircBot extends ListenerAdapter {
    }

    private JircBotConfiguration config;
+    private ChannelSettingsService channelSettingsService;

    private Map<String, PircBotX> bots = new HashMap<>();



    @Autowired
-    public JircBot(JircBotConfiguration config) {
+    public JircBot(JircBotConfiguration config, ChannelSettingsService channelSettingsService) {
        this.config = config;
+        this.channelSettingsService = channelSettingsService;
    }


@ -137,7 +140,7 @@ public class JircBot extends ListenerAdapter {

                    .setServers(servers)
                    .setAutoReconnect(true)
-                    .addAutoJoinChannels(c.getChannelsNames());
+                    .addAutoJoinChannels(channelSettingsService.getListeningChannels(c.getServer(), c.getChannelsNames()));

            if (c.getBotPassword() != null && !c.getBotPassword().isEmpty()) {
                confBuilder.setNickservPassword(c.getBotPassword());
--- a/src/main/java/ru/bvn13/jircbot/database/entities/ChannelSettings.java
+++ b/src/main/java/ru/bvn13/jircbot/database/entities/ChannelSettings.java
@ -40,6 +40,9 @@ public class ChannelSettings extends BaseModel {
    @Column(nullable = false)
    private Boolean autoRejoinEnabled = false;

+    @Column(nullable = false, columnDefinition = "Boolean DEFAULT false")
+    private Boolean joinOnStart = false;
+
    @Column(nullable = false)
    private Boolean linkPreviewEnabled = false;

--- a/src/main/java/ru/bvn13/jircbot/database/repositories/ChannelSettingsRepository.java
+++ b/src/main/java/ru/bvn13/jircbot/database/repositories/ChannelSettingsRepository.java
@ -1,11 +1,20 @@
 package ru.bvn13.jircbot.database.repositories;

 import org.springframework.data.jpa.repository.JpaRepository;
+import org.springframework.data.jpa.repository.Query;
+import org.springframework.data.repository.query.Param;
 import ru.bvn13.jircbot.database.entities.ChannelSettings;

+import java.util.List;
+
 /**
 * Created by bvn13 on 01.02.2018.
 */
 public interface ChannelSettingsRepository extends JpaRepository<ChannelSettings, Long> {
+
    ChannelSettings getFirstByServerHostAndChannelName(String serverHost, String channelName);
+
+    @Query(value = "select S from ChannelSettings S where S.joinOnStart = true and S.serverHost = :serverHost")
+    List<ChannelSettings> getAllChannelsToAutoJoinByServerHost(@Param("serverHost") String serverHost);
+
 }
--- a/src/main/java/ru/bvn13/jircbot/database/services/ChannelSettingsService.java
+++ b/src/main/java/ru/bvn13/jircbot/database/services/ChannelSettingsService.java
@ -5,6 +5,11 @@ import org.springframework.stereotype.Service;
 import ru.bvn13.jircbot.database.entities.ChannelSettings;
 import ru.bvn13.jircbot.database.repositories.ChannelSettingsRepository;

+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;
+
 /**
 * Created by bvn13 on 01.02.2018.
 */
@ -38,4 +43,23 @@ public class ChannelSettingsService {
        channelSettingsRepository.save(settings);
    }

+    /**
+     * Resolves the channels the bot should join on the given server at startup.
+     *
+     * @param serverHost      server host used to look up channels flagged with join-on-start
+     * @param defaultChannels fallback channels, used only when no channel is flagged for this server
+     * @return a mutable set of channel names; never null
+     */
+    public Set<String> getListeningChannels(String serverHost, List<String> defaultChannels) {
+        // Collectors.toSet() gives no mutability guarantee, so collect into an explicit HashSet
+        // because the fallback below adds to the result.
+        Set<String> channels = channelSettingsRepository.getAllChannelsToAutoJoinByServerHost(serverHost).stream()
+                .map(ChannelSettings::getChannelName)
+                .collect(Collectors.toCollection(java.util.HashSet::new));
+        if (channels.isEmpty() && defaultChannels != null) {
+            channels.addAll(defaultChannels);
+        }
+        return channels;
+    }
+
 }
--- a/src/main/java/ru/bvn13/jircbot/listeners/AdminListener.java
+++ b/src/main/java/ru/bvn13/jircbot/listeners/AdminListener.java
@ -92,6 +92,7 @@ public class AdminListener extends ImprovedListenerAdapter implements Descriptio
                        .command("set")
                        .description("set ON|OFF any of bot opportunity for channel\n\n"+
                                "Opportunities: \n\n"+
+                                "login | autologin | join-on-start - auto login mode on startup\n"+
                                "autorejoin | auto-rejoin - auto rejoin channel on kicking\n"+
                                "bash | bashorg - bach.org quoting\n"+
                                "deferredmessages | deferred-messages | tell - saving and delivering deferred messages\n"+
@ -380,6 +381,12 @@ public class AdminListener extends ImprovedListenerAdapter implements Descriptio
                case "quiz":
                    settings.setQuizEnabled(mode);
                    break;
+                case "join-on-start":
+                case "login":
+                case "autologin":
+                    settings.setJoinOnStart(mode);
+                    // without this break the case falls through to default and throws
+                    break;
                default:
                    throw new RuntimeException("Setting " + set + " not exist");
            }
--- a/src/main/java/ru/bvn13/jircbot/listeners/LinkPreviewListener.java
+++ b/src/main/java/ru/bvn13/jircbot/listeners/LinkPreviewListener.java
@ -10,6 +10,7 @@ import ru.bvn13.jircbot.documentation.DescriptionProvided;
 import ru.bvn13.jircbot.documentation.DocumentationProvider;
 import ru.bvn13.jircbot.documentation.ListenerDescription;
 import ru.bvn13.jircbot.services.InternetAccessor;
+import ru.bvn13.jircbot.utilities.WebTitleExtractor;

 import java.io.*;
 import java.util.ArrayList;
@ -25,8 +26,6 @@ import static ru.bvn13.jircbot.documentation.ListenerDescription.CommandDescript
@Component
 public class LinkPreviewListener extends ImprovedListenerAdapter implements DescriptionProvided {

-    private InternetAccessor internetAccessor;
-
    private static final Pattern REGEX = Pattern.compile("(?i)(?:(?:https?|ftp)://)(?:\\S+(?::\\S*)?@)?(?:(?!(?:10|127)(?:\\.\\d{1,3}){3})(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))|(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))\\.?)(?::\\d{2,5})?(?:[/?#]\\S*)?");

    private ChannelSettingsService channelSettingsService;
@ -60,10 +59,11 @@ public class LinkPreviewListener extends ImprovedListenerAdapter implements Desc

        List<String> links = findLink(event.getMessage());
        for (String link : links) {
-            String info = parseLink(link);
-            if (!info.isEmpty()) {
+            //String info = parseLink(link);
+            String title = WebTitleExtractor.getPageTitle(link);
+            if (title != null && !title.isEmpty()) {
                //event.respond(info);
-                event.getChannel().send().message(info);
+                event.getChannel().send().message("TITLE: "+title);
            }
        };

@ -79,84 +79,6 @@ public class LinkPreviewListener extends ImprovedListenerAdapter implements Desc
        return links;
    }

-
-    private String parseLink(String link) throws Exception {
-        String content = internetAccessor.retrieveContentByLink(link);
-
-        String encoding = null; //getCharsetFromHeaders(content.toString());
-//        if (encoding == null) {
-//            encoding = getCharsetFromBody(content.toString());
-//        }
-
-        String title = "";
-
-//        if (encoding != null && !encoding.isEmpty()) {
-//            content = internetAccessor.retrieveContentByLinkWithEncoding(link, encoding);
-//        }
-
-        title = content.substring(content.indexOf("<title>") + 7, content.indexOf("</title>"));
-
-        return "Title: "+title;
-    }
-
-
-    public String decodeTitle_buffered(String title, String encoding) throws IOException {
-        ByteArrayOutputStream out = new ByteArrayOutputStream();
-
-        Reader r = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(title.getBytes()), encoding));
-        Writer w = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
-
-        char[] buffer=new char[4096];
-        int len;
-        while((len=r.read(buffer)) != -1) {
-            w.write(buffer, 0, len);
-        }
-        r.close();
-        w.flush();
-        w.close();
-
-        return out.toString();
-    }
-
-    String decodeTitle(String title, String encoding) throws UnsupportedEncodingException {
-        return new String(title.getBytes("UTF-8"), encoding);
-    }
-
-    public String getCharsetFromHeaders(String contentType){
-        if (contentType != null && contentType.toLowerCase().trim().contains("charset=")){
-            String[] parts = contentType.toLowerCase().trim().split("=");
-            if (parts.length > 0)
-                return parts[1];
-        }
-        return null;
-    }
-
-    public static String getCharsetFromBody(String body) {
-        if (body != null) {
-            int headEnd = body.toLowerCase().trim().indexOf("</head>");
-
-            // return null if there's no head tags
-            if (headEnd == -1)
-                return null;
-
-            String body_head = body.toLowerCase().substring(0, headEnd);
-
-            Pattern p = Pattern.compile("charset=([\"\'a-z0-9A-Z-]+)");
-            Matcher m = p.matcher(body_head);
-            String str_match = "";
-            if (m.find()) {
-                str_match = m.toMatchResult().group(1);
-                return str_match.replaceAll("[\"']", "");
-            }
-        }
-        return null;
-    }
-
-    @Autowired
-    public void setInternetAccessor(InternetAccessor internetAccessor) {
-        this.internetAccessor = internetAccessor;
-    }
-
    @Autowired
    public void setChannelSettingsService(ChannelSettingsService channelSettingsService) {
        this.channelSettingsService = channelSettingsService;
--- a/src/main/java/ru/bvn13/jircbot/utilities/WebTitleExtractor.java
+++ b/src/main/java/ru/bvn13/jircbot/utilities/WebTitleExtractor.java
@ -0,0 +1,139 @@
+package ru.bvn13.jircbot.utilities;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLConnection;
+import java.nio.charset.Charset;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Fetches a web page and extracts the text of its first {@code <title>} element.
+ *
+ * @author bvn13
+ * @since 05.10.2019
+ */
+public class WebTitleExtractor {
+    /* the CASE_INSENSITIVE flag accounts for
+     * sites that use uppercase title tags.
+     * the DOTALL flag accounts for sites that have
+     * line feeds in the title text.
+     * the group is reluctant so the match ends at the first
+     * closing tag even if the page has further title elements */
+    private static final Pattern TITLE_TAG =
+            Pattern.compile("<title[^>]*>(.*?)</title>", Pattern.CASE_INSENSITIVE|Pattern.DOTALL);
+
+    /* upper bound on how much of the body is read, counted in decoded characters */
+    private static final int MAX_READ_BYTES = 10*1024;
+
+    /* keep an unresponsive host from blocking the caller indefinitely */
+    private static final int CONNECT_TIMEOUT_MILLIS = 10_000;
+    private static final int READ_TIMEOUT_MILLIS = 10_000;
+
+    /**
+     * Downloads the beginning of the page and returns its title.
+     *
+     * @param url the HTML page
+     * @return title text with runs of whitespace and angle brackets collapsed to single spaces
+     * (null if document isn't HTML or lacks a title tag)
+     * @throws IOException if the URL is malformed, or connecting or reading fails or times out
+     */
+    public static String getPageTitle(String url) throws IOException {
+        URLConnection conn = new URL(url).openConnection();
+        // timeouts must be set before the first header access, which opens the connection
+        conn.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
+        conn.setReadTimeout(READ_TIMEOUT_MILLIS);
+
+        // ContentType is an inner class defined below
+        ContentType contentType = getContentTypeHeader(conn);
+        if (contentType != null && !"text/html".equalsIgnoreCase(contentType.contentType)) {
+            return null; // don't continue if not HTML
+        }
+
+        // determine the charset, or use the default
+        Charset charset = getCharset(contentType);
+        if (charset == null) {
+            charset = Charset.defaultCharset();
+        }
+
+        // try-with-resources closes the reader and the underlying stream
+        try (InputStream in = conn.getInputStream();
+             BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset))) {
+
+            int n;
+            int totalRead = 0;
+            char[] buf = new char[1024];
+            StringBuilder content = new StringBuilder();
+
+            // read until EOF or until at least MAX_READ_BYTES characters have been collected
+            while (totalRead < MAX_READ_BYTES && (n = reader.read(buf, 0, buf.length)) != -1) {
+                content.append(buf, 0, n);
+                totalRead += n;
+            }
+
+            // extract the title
+            Matcher matcher = TITLE_TAG.matcher(content);
+            if (!matcher.find()) {
+                return null;
+            }
+            /* replace any occurrences of whitespace (which may
+             * include line feeds and other uglies) as well
+             * as HTML brackets with a space */
+            return matcher.group(1).replaceAll("[\\s\\<>]+", " ").trim();
+        }
+    }
+
+    /**
+     * Reads the Content-Type response header of the connection.
+     * @param conn connection to query
+     * @return ContentType object representing the value of
+     * the Content-Type header, or null if the response has none
+     */
+    private static ContentType getContentTypeHeader(URLConnection conn) {
+        // rely on the JDK's own header lookup rather than an exact-case scan for "Content-Type"
+        String headerValue = conn.getContentType();
+        return headerValue == null ? null : new ContentType(headerValue);
+    }
+
+    /**
+     * @param contentType parsed header, may be null
+     * @return the charset named in the header, or null if it is absent, malformed or unsupported
+     */
+    private static Charset getCharset(ContentType contentType) {
+        if (contentType == null || contentType.charsetName == null) {
+            return null;
+        }
+        try {
+            return Charset.isSupported(contentType.charsetName) ? Charset.forName(contentType.charsetName) : null;
+        } catch (IllegalArgumentException ignored) {
+            // isSupported rejects syntactically illegal names by throwing; treat them as unknown
+            return null;
+        }
+    }
+
+    /**
+     * Class holds the content type and charset (if present)
+     */
+    private static final class ContentType {
+        private static final Pattern CHARSET_HEADER = Pattern.compile("charset=\"?([-_a-zA-Z0-9]+)", Pattern.CASE_INSENSITIVE|Pattern.DOTALL);
+
+        private final String contentType;
+        private final String charsetName;
+
+        private ContentType(String headerValue) {
+            if (headerValue == null) {
+                throw new IllegalArgumentException("ContentType must be constructed with a not-null headerValue");
+            }
+            int n = headerValue.indexOf(";");
+            // the media type is everything before the first parameter; trim stray spaces
+            contentType = (n != -1 ? headerValue.substring(0, n) : headerValue).trim();
+            // a charset is only looked for when the header carries parameters (a ';' is present)
+            Matcher matcher = CHARSET_HEADER.matcher(headerValue);
+            charsetName = (n != -1 && matcher.find()) ? matcher.group(1) : null;
+        }
+    }
+}
--- a/src/main/resources/application.properties
+++ b/src/main/resources/application.properties
@ -1,5 +1,5 @@

-jircbot.version=2.2.0
+jircbot.version=2.2.1

 jircbot.config=config.json