urlinfo: check content-type before downloading + handle timeouts

Also improve code style and formatting.
2021-05-20 00:23:15 +00:00
parent e7b89e7fdb
commit 9c3cd2f816
1 changed file with 10 additions and 8 deletions
--- a/bot/urlinfo.py
+++ b/bot/urlinfo.py
@@ -16,6 +16,7 @@ class URLInfo(Plugin):
        "^https?:\/\/(?:(?:vid|img|thumb)\.)?pr0gramm\.com"
    ]

+    # cap the download at 2 MiB so we never fully fetch overly large resources
    SIZE_LIMIT = 2 * 1024 ** 2

    @irc3.event(r'(?i)^:\S+ PRIVMSG (?P<target>\S+) :.*(?P<url>https?:\/\/\S+\.\S+).*')
@@ -23,26 +24,27 @@ class URLInfo(Plugin):
        for regex in self.BLACKLIST:
            if re.match(regex, url):
                return
+
        bytes_io = io.BytesIO()
+
        try:
            with requests.get(url, timeout=10, stream=True) as r:
                r.raise_for_status()
+                mime_type = r.headers.get("content-type")
+                if mime_type is not None and mime_type.split(";")[0] != "text/html":
+                    return
                size = 0
                for chunk in r.iter_content(chunk_size=1024 ** 2):
                    size += len(chunk)
                    if size >= self.SIZE_LIMIT:
                        return
                    bytes_io.write(chunk)
-        except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as e:
+        except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError, requests.exceptions.ReadTimeout):
            return

-        mime_type = r.headers.get("content-type")
-        if mime_type is not None:
-            if mime_type.split(";")[0] != "text/html":
-                return
        bytes_io.seek(0)
        tree = etree.parse(bytes_io, etree.HTMLParser()).getroot()
-        title = tree.xpath("/html/head/title")
-        if len(title) > 0:
-            self.bot.privmsg(target, '\x02[URLInfo]\x02 ' + title[0].text.strip())
+        title_elements = tree.xpath("/html/head/title")
+        if len(title_elements) > 0:
+            self.bot.privmsg(target, '\x02[URLInfo]\x02 ' + title_elements[0].text.strip())