nxy/bot/urlinfo.py

62 lines
1.9 KiB
Python

# -*- coding: utf-8 -*-
import re
import irc3
import requests
import io
from lxml import etree
from . import Plugin
class URLInfo(Plugin):
    """Announce the HTML ``<title>`` of URLs that appear in PRIVMSG lines.

    When a message contains an ``http://`` or ``https://`` URL that is not
    blacklisted, the page is downloaded (up to ``SIZE_LIMIT`` bytes), its
    ``/html/head/title`` element is extracted and the title is sent back to
    the message's target.
    """

    # URLs matching any of these patterns (anchored at the start of the URL)
    # are ignored. Raw strings keep the regex escapes from being treated as
    # (invalid) string escapes; the resulting pattern text is unchanged.
    BLACKLIST = [
        r"^https?:\/\/(?:www\.)?youtube\.com",
        r"^https?:\/\/youtu\.be",
        r"^https?:\/\/w0bm\.com",
        r"^https?:\/\/f0ck\.me",
        r"^https?:\/\/(?:(?:vid|img|thumb)\.)?pr0gramm\.com",
    ]

    # Set the size limit to 2 MiB so we don't fully download too large
    # resources.
    SIZE_LIMIT = 2 * 1024 ** 2

    # Timeout in seconds handed to ``requests.get``.
    TIMEOUT = 10

    @irc3.event(r'(?i)^:\S+ PRIVMSG (?P<target>\S+) :.*(?P<url>https?:\/\/\S+\.\S+).*')
    def url_parser(self, target: str, url: str):
        """Look up the title of *url* and post it to *target*.

        Because the ``.*`` in front of the ``url`` group is greedy, a message
        containing several URLs yields the last one.

        Args:
            target: The PRIVMSG target captured from the incoming line; the
                reply is sent there.
            url: The URL captured from the message text.

        Returns:
            None. Nothing is sent when the URL is blacklisted, cannot be
            fetched, is not HTML, exceeds ``SIZE_LIMIT`` or has no usable
            title.
        """
        # NOTE(review): the URL comes from untrusted chat input and is
        # fetched as-is, so it may point at internal hosts — consider
        # restricting reachable addresses if that matters for deployment.
        # NOTE(review): for a private message the target is presumably the
        # bot's own nick rather than the sender — confirm the reply
        # destination is what is wanted in that case.
        if self._is_blacklisted(url):
            return

        html = self._fetch_html(url)
        if html is None:
            return

        title = self._extract_title(html)
        if title:
            self.bot.privmsg(target, '\x02[URLInfo]\x02 ' + title)

    def _is_blacklisted(self, url: str) -> bool:
        """Return True if *url* matches any pattern in ``BLACKLIST``."""
        return any(re.match(pattern, url) for pattern in self.BLACKLIST)

    def _fetch_html(self, url: str):
        """Download *url* and return its body as bytes, or None if unusable.

        None is returned when the request fails, the response status is an
        error, the declared content type is something other than
        ``text/html``, or the body reaches ``SIZE_LIMIT`` bytes.
        """
        buffer = io.BytesIO()
        try:
            with requests.get(url, timeout=self.TIMEOUT, stream=True) as response:
                response.raise_for_status()

                content_type = response.headers.get("content-type")
                if content_type is not None:
                    # Media types are case-insensitive and may carry
                    # whitespace before the parameter separator.
                    mime_type = content_type.split(";")[0].strip().lower()
                    if mime_type != "text/html":
                        return None

                received = 0
                for chunk in response.iter_content(chunk_size=1024 ** 2):
                    received += len(chunk)
                    if received >= self.SIZE_LIMIT:
                        return None
                    buffer.write(chunk)
        except requests.exceptions.RequestException:
            # Base class of every requests error: covers connection errors,
            # HTTP errors, timeouts and redirect loops as before, plus
            # invalid URLs and failures while streaming the body.
            return None
        return buffer.getvalue()

    @staticmethod
    def _extract_title(html: bytes):
        """Return the whitespace-normalized page title from *html*, or None.

        None is returned when the document is empty or cannot be parsed,
        has no ``/html/head/title`` element, or the title is blank.
        """
        if not html:
            return None
        try:
            root = etree.parse(io.BytesIO(html), etree.HTMLParser()).getroot()
        except etree.LxmlError:
            return None
        if root is None:
            return None

        title_elements = root.xpath("/html/head/title")
        if not title_elements:
            return None

        text = title_elements[0].text
        if text is None:
            return None

        # Collapse every whitespace run (including CR/LF) to a single space
        # so the title is always sent as one line of text.
        return " ".join(text.split()) or None