# -*- coding: utf-8 -*-
import io
import re
from typing import Optional

import irc3
import requests
from lxml import etree

from . import Plugin


class URLInfo(Plugin):
    """Announce the HTML ``<title>`` of URLs posted in IRC messages.

    URLs matching one of the ``BLACKLIST`` patterns are ignored, as are
    responses that are not ``text/html`` or that reach ``SIZE_LIMIT`` bytes.
    """

    # URL patterns (applied with ``re.match``) for which no title is fetched.
    BLACKLIST = [
        r"^https?:\/\/(?:www\.)?youtube\.com",
        r"^https?:\/\/youtu\.be",
        r"^https?:\/\/w0bm\.com",
        r"^https?:\/\/f0ck\.me",
        r"^https?:\/\/(?:(?:vid|img|thumb)\.)?pr0gramm\.com"
    ]

    # set the size limit to 2 MB so we don't fully download too large resources
    SIZE_LIMIT = 2 * 1024 ** 2

    # The named groups must be called ``target`` and ``url`` so that they line
    # up with the handler's parameters.
    @irc3.event(r'(?i)^:\S+ PRIVMSG (?P<target>\S+) :.*?(?P<url>https?:\/\/\S+\.\S+).*')
    def url_parser(self, target: str, url: str):
        """Fetch *url* and send its page title to *target*.

        Nothing is sent when the URL is blacklisted, the request fails, the
        response is not HTML, the body reaches ``SIZE_LIMIT``, or no
        non-empty title can be extracted.

        Args:
            target: Value of the ``target`` group of the PRIVMSG line; the
                title is sent there via ``self.bot.privmsg``.
            url: The URL captured from the message text.
        """
        if any(re.match(pattern, url) for pattern in self.BLACKLIST):
            return

        html = self._fetch_html(url)
        if not html:
            # Request failed/was rejected, or the body was empty.
            return

        title = self._extract_title(html)
        if title:
            self.bot.privmsg(target, '\x02[URLInfo]\x02 ' + title)

    def _fetch_html(self, url: str) -> Optional[bytes]:
        """Return the body of *url* if it is HTML below ``SIZE_LIMIT``, else ``None``."""
        buffer = io.BytesIO()
        try:
            with requests.get(url, timeout=10, stream=True) as response:
                response.raise_for_status()

                # A missing Content-Type header is tolerated; a present one
                # must be text/html (parameters such as charset are ignored).
                mime_type = response.headers.get("content-type")
                if mime_type is not None:
                    media_type = mime_type.split(";")[0].strip().lower()
                    if media_type != "text/html":
                        return None

                # Stream the body so an oversized resource is abandoned
                # instead of being downloaded completely.
                size = 0
                for chunk in response.iter_content(chunk_size=1024 ** 2):
                    size += len(chunk)
                    if size >= self.SIZE_LIMIT:
                        return None
                    buffer.write(chunk)
        except requests.exceptions.RequestException:
            return None
        return buffer.getvalue()

    @staticmethod
    def _extract_title(html: bytes) -> Optional[str]:
        """Return the whitespace-normalised ``/html/head/title`` text of *html*, or ``None``."""
        try:
            root = etree.parse(io.BytesIO(html), etree.HTMLParser()).getroot()
        except etree.LxmlError:
            # Unparsable document: treat it like a page without a title.
            return None
        if root is None:
            return None

        title_elements = root.xpath("/html/head/title")
        if not title_elements:
            return None

        text = title_elements[0].text
        if text is None:
            return None

        # The title is untrusted input: collapse every whitespace run
        # (including CR/LF) into a single space so it stays on one line.
        return " ".join(text.split()) or None