diff --git a/bot/urlinfo.py b/bot/urlinfo.py index dd0f04d..c780c67 100644 --- a/bot/urlinfo.py +++ b/bot/urlinfo.py @@ -16,6 +16,7 @@ class URLInfo(Plugin): "^https?:\/\/(?:(?:vid|img|thumb)\.)?pr0gramm\.com" ] + # set the size limit to 2 MB so we don't fully download too large resources SIZE_LIMIT = 2 * 1024 ** 2 @irc3.event(r'(?i)^:\S+ PRIVMSG (?P<target>\S+) :.*(?P<url>https?:\/\/\S+\.\S+).*') @@ -23,26 +24,27 @@ class URLInfo(Plugin): for regex in self.BLACKLIST: if re.match(regex, url): return + bytes_io = io.BytesIO() + try: with requests.get(url, timeout=10, stream=True) as r: r.raise_for_status() + mime_type = r.headers.get("content-type") + if mime_type is not None and mime_type.split(";")[0] != "text/html": + return size = 0 for chunk in r.iter_content(chunk_size=1024 ** 2): size += len(chunk) if size >= self.SIZE_LIMIT: return bytes_io.write(chunk) - except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as e: + except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError, requests.exceptions.ReadTimeout): return - mime_type = r.headers.get("content-type") - if mime_type is not None: - if mime_type.split(";")[0] != "text/html": - return bytes_io.seek(0) tree = etree.parse(bytes_io, etree.HTMLParser()).getroot() - title = tree.xpath("/html/head/title") - if len(title) > 0: - self.bot.privmsg(target, '\x02[URLInfo]\x02 ' + title[0].text.strip()) + title_elements = tree.xpath("/html/head/title") + if len(title_elements) > 0: + self.bot.privmsg(target, '\x02[URLInfo]\x02 ' + title_elements[0].text.strip())