nxy/bot/urlinfo.py

# -*- coding: utf-8 -*-
import re
import irc3
import requests
import io
from lxml import etree

from . import Plugin

class URLInfo(Plugin):
    """Announce the HTML ``<title>`` of URLs that appear in PRIVMSG lines.

    When a message contains an ``http(s)://`` URL that is not blacklisted,
    the resource is downloaded (up to ``SIZE_LIMIT`` bytes, ``text/html``
    only), its ``/html/head/title`` text is extracted and sent back to the
    message's target. Every failure along the way is silent: the handler
    simply returns without sending anything.
    """

    # URLs matching any of these patterns (anchored at the start of the URL)
    # are ignored and never fetched.
    BLACKLIST = [
        r"^https?:\/\/(?:www\.)?youtube\.com",
        r"^https?:\/\/youtu\.be",
        r"^https?:\/\/w0bm\.com",
        r"^https?:\/\/f0ck\.me",
        r"^https?:\/\/(?:(?:vid|img|thumb)\.)?pr0gramm\.com"
    ]

    # set the size limit to 2 MB so we don't fully download too large resources
    SIZE_LIMIT = 2 * 1024 ** 2

    @irc3.event(r'(?i)^:\S+ PRIVMSG (?P<target>\S+) :.*?(?P<url>https?:\/\/\S+\.\S+).*')
    def url_parser(self, target: str, url: str):
        """Look up the title of ``url`` and post it to ``target``.

        :param target: the PRIVMSG target captured from the IRC line; the
            title is sent back to this same target.
        :param url: the first URL captured from the message text.
        """
        if self._is_blacklisted(url):
            return

        document = self._fetch_html(url)
        if document is None:
            return

        title = self._extract_title(document)
        if title:
            self.bot.privmsg(target, '\x02[URLInfo]\x02 ' + title)

    def _is_blacklisted(self, url: str) -> bool:
        """Return True if ``url`` matches any pattern in ``BLACKLIST``."""
        # The event regex is case-insensitive, so the blacklist has to be as
        # well; otherwise e.g. "HTTPS://YouTube.com/..." would slip through.
        return any(
            re.match(pattern, url, re.IGNORECASE)
            for pattern in self.BLACKLIST
        )

    def _fetch_html(self, url: str):
        """Download ``url`` and return its body as a rewound ``io.BytesIO``.

        Returns ``None`` if the request fails, the response status is an
        error, the declared content type is not ``text/html``, or the body
        reaches ``SIZE_LIMIT`` bytes.
        """
        # NOTE(review): the URL comes straight from chat and is fetched
        # as-is, so users can make the bot request arbitrary hosts.
        buffer = io.BytesIO()

        try:
            # stream=True defers the body download so the headers can be
            # inspected first and the size limit enforced while reading.
            with requests.get(url, timeout=10, stream=True) as r:
                r.raise_for_status()

                mime_type = r.headers.get("content-type")
                if mime_type is not None:
                    # Media types are case-insensitive and may carry
                    # whitespace before the ";" that starts the parameters.
                    media_type = mime_type.split(";")[0].strip().lower()
                    if media_type != "text/html":
                        return None

                size = 0
                for chunk in r.iter_content(chunk_size=1024 ** 2):
                    size += len(chunk)
                    # Oversized resources are dropped entirely rather than
                    # parsed from a truncated prefix.
                    if size >= self.SIZE_LIMIT:
                        return None
                    buffer.write(chunk)
        except requests.exceptions.RequestException:
            return None

        buffer.seek(0)
        return buffer

    @staticmethod
    def _extract_title(document):
        """Return the whitespace-normalized page title, or ``None``.

        ``document`` is a binary file-like object containing the HTML.
        ``None`` is returned when the document cannot be parsed, has no
        ``/html/head/title`` element, or the title is empty.
        """
        try:
            tree = etree.parse(document, etree.HTMLParser()).getroot()
        except etree.LxmlError:
            # e.g. an empty or otherwise unparsable body
            return None
        if tree is None:
            return None

        title_elements = tree.xpath("/html/head/title")
        if not title_elements:
            return None

        title = title_elements[0].text
        if title is None:
            return None

        # Collapse every run of whitespace (including CR/LF and tabs) into a
        # single space: the title is untrusted input and must stay on one
        # line when it is sent to IRC.
        return " ".join(title.split()) or None
add urlinfo plugin 2021-05-14 19:20:19 +00:00			`# -*- coding: utf-8 -*-`
			`import re`
			`import irc3`
			`import requests`
			`import io`
			`from lxml import etree`

			`from . import Plugin`

			`class URLInfo(Plugin):`
			`BLACKLIST = [`
drugs, urlinfo, youtube: fix 'invalid escape sequence' SyntaxWarnings 2024-06-02 15:30:20 +00:00			`r"^https?:\/\/(?:www\.)?youtube\.com",`
			`r"^https?:\/\/youtu\.be",`
			`r"^https?:\/\/w0bm\.com",`
			`r"^https?:\/\/f0ck\.me",`
			`r"^https?:\/\/(?:(?:vid|img|thumb)\.)?pr0gramm\.com"`
add urlinfo plugin 2021-05-14 19:20:19 +00:00			`]`

urlinfo: check content-type before downloading + handle timeouts improve codestyle and formatting 2021-05-20 00:23:15 +00:00			`# set the size limit to 2 MB so we don't fully download too large resources`
urlinfo: limit download size + check content type 2021-05-14 22:13:30 +00:00			`SIZE_LIMIT = 2 * 1024 ** 2`

urlinfo: make .* in front of the URL non-greedy This is done because URLs may contain one or more other URLs that match the url pattern (e.g. https://web.archive.org links). Because .* is greedy by default, this caused only the last matching URL to be captured, instead of the full URL. 2023-08-05 13:42:46 +00:00			`@irc3.event(r'(?i)^:\S+ PRIVMSG (?P<target>\S+) :.*?(?P<url>https?:\/\/\S+\.\S+).*')`
add urlinfo plugin 2021-05-14 19:20:19 +00:00			`def url_parser(self, target: str, url: str):`
			`for regex in self.BLACKLIST:`
			`if re.match(regex, url):`
			`return`
urlinfo: check content-type before downloading + handle timeouts improve codestyle and formatting 2021-05-20 00:23:15 +00:00
urlinfo: limit download size + check content type 2021-05-14 22:13:30 +00:00			`bytes_io = io.BytesIO()`
urlinfo: check content-type before downloading + handle timeouts improve codestyle and formatting 2021-05-20 00:23:15 +00:00
add urlinfo plugin 2021-05-14 19:20:19 +00:00			`try:`
urlinfo: limit download size + check content type 2021-05-14 22:13:30 +00:00			`with requests.get(url, timeout=10, stream=True) as r:`
			`r.raise_for_status()`
urlinfo: check content-type before downloading + handle timeouts improve codestyle and formatting 2021-05-20 00:23:15 +00:00			`mime_type = r.headers.get("content-type")`
			`if mime_type is not None and mime_type.split(";")[0] != "text/html":`
			`return`
urlinfo: limit download size + check content type 2021-05-14 22:13:30 +00:00			`size = 0`
			`for chunk in r.iter_content(chunk_size=1024 ** 2):`
			`size += len(chunk)`
			`if size >= self.SIZE_LIMIT:`
			`return`
			`bytes_io.write(chunk)`
urlinfo: catch superclass RequestException instead of each exception individually 2023-07-21 11:44:20 +00:00			`except requests.exceptions.RequestException:`
add urlinfo plugin 2021-05-14 19:20:19 +00:00			`return`
urlinfo: limit download size + check content type 2021-05-14 22:13:30 +00:00
			`bytes_io.seek(0)`
			`tree = etree.parse(bytes_io, etree.HTMLParser()).getroot()`
urlinfo: fix some NoneType attribute errors 2021-07-20 21:40:38 +00:00			`if tree is None:`
			`return`

urlinfo: check content-type before downloading + handle timeouts improve codestyle and formatting 2021-05-20 00:23:15 +00:00			`title_elements = tree.xpath("/html/head/title")`
urlinfo: fix some NoneType attribute errors 2021-07-20 21:40:38 +00:00			`if len(title_elements) == 0:`
			`return`

			`title = title_elements[0].text`
			`if title is None:`
			`return`

			`title = title.strip()`
			`if title:`
			`self.bot.privmsg(target, '\x02[URLInfo]\x02 ' + title)`
add urlinfo plugin 2021-05-14 19:20:19 +00:00