urlbot-native/common.py

# -*- coding: utf-8 -*-
""" Common functions for urlbot """
import html.parser
import logging
import re
import time
import urllib.request
from collections import namedtuple

# Rate-limit classes. Every class is a distinct bit, so several classes can
# be OR-ed together and verified with a single rate_limit() call.
RATE_NO_LIMIT = 0x00
RATE_GLOBAL = 0x01
RATE_NO_SILENCE = 0x02
RATE_INTERACTIVE = 0x04
RATE_CHAT = 0x08
RATE_URL = 0x10
RATE_EVENT = 0x20
RATE_FUN = 0x40

BUFSIZ = 8192
EVENTLOOP_DELAY = 0.100  # seconds
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) ' \
             'Gecko/20100101 Firefox/31.0 Iceweasel/31.0'

# history:      timestamps (time.time()) of the latest allowed messages,
#               oldest first
# period:       length of the sliding window in seconds
# max_hist_len: number of messages allowed within one period
Bucket = namedtuple("BucketConfig", ["history", "period", "max_hist_len"])

buckets = {
    # everything else
    RATE_GLOBAL: Bucket(history=[], period=60, max_hist_len=10),
    # bot writes with no visible stimuli
    RATE_NO_SILENCE: Bucket(history=[], period=10, max_hist_len=5),
    # interactive stuff like ping
    RATE_INTERACTIVE: Bucket(history=[], period=30, max_hist_len=5),
    # chitty-chat, master volume control
    RATE_CHAT: Bucket(history=[], period=10, max_hist_len=5),
    # reacting on URLs
    RATE_URL: Bucket(history=[], period=10, max_hist_len=5),
    # triggering events
    RATE_EVENT: Bucket(history=[], period=60, max_hist_len=10),
    # bot blames people, produces cake and entertains
    RATE_FUN: Bucket(history=[], period=180, max_hist_len=5),
}

# dict view, not a copy: buckets added to `buckets` later are seen here too
rate_limit_classes = buckets.keys()


def rate_limit(rate_class=RATE_GLOBAL):
    """
    Check whether a message of the given class may be sent now and, if so,
    record it.

    Every bucket remembers the timestamps of its last max_hist_len allowed
    messages. A bucket blocks when it is full and its oldest remembered
    timestamp is newer than now() - period.

    If rate_class is not a bucket key itself it is treated as a bit mask:
    every bucket whose key shares a bit with it must allow the message.
    Timestamps are only recorded when *all* selected buckets allow it, so a
    blocked message never uses up quota in any bucket. A mask that selects
    no bucket (e.g. RATE_NO_LIMIT) is always allowed.

    :param rate_class: the type of message to verify (one RATE_* value or
                       several OR-ed together)
    :return: False if blocked, True if allowed
    """
    log = logging.getLogger(__name__)

    if rate_class in rate_limit_classes:
        selected = [rate_class]
    else:
        selected = [c for c in rate_limit_classes if c & rate_class]

    now = time.time()

    # Phase 1: look only. Nothing is modified until every bucket agreed.
    for cls in selected:
        bucket = buckets[cls]
        log.debug(
            "[ratelimit][bucket=%x][time=%s]%s",
            cls, now, bucket.history
        )
        is_full = bucket.history and len(bucket.history) >= bucket.max_hist_len
        if is_full and bucket.history[0] > (now - bucket.period):
            return False

    # Phase 2: record the message, keeping at most max_hist_len timestamps.
    # (Trimming on ">=" matters: with ">" the history settles at
    # max_hist_len + 1 entries and one message too many gets through.)
    for cls in selected:
        bucket = buckets[cls]
        while bucket.history and len(bucket.history) >= bucket.max_hist_len:
            bucket.history.pop(0)
        bucket.history.append(now)

    return True


def rate_limited(max_per_second):
    """
    Very simple flow control decorator: calls to the decorated function are
    delayed (by sleeping) so that a call starts no sooner than
    1/max_per_second seconds after the previous call returned.

    :param max_per_second: how many events per second may be executed - more are delayed
    :return: a decorator to apply to the function that shall be throttled
    """
    import functools

    min_interval = 1.0 / float(max_per_second)

    def decorate(func):
        # monotonic timestamp of the end of the previous call;
        # None until the function has been called once
        last_finished = None

        @functools.wraps(func)
        def ratelimitedfunction(*args, **kargs):
            nonlocal last_finished
            if last_finished is not None:
                # time.monotonic() is wall-clock based and never jumps
                # backwards; time.clock() no longer exists (Python >= 3.8)
                lefttowait = min_interval - (time.monotonic() - last_finished)
                if lefttowait > 0:
                    time.sleep(lefttowait)
            ret = func(*args, **kargs)
            last_finished = time.monotonic()
            return ret

        return ratelimitedfunction

    return decorate


def get_version_git():
    """
    Build a version string from the Git history of the current working
    directory by running `git log --oneline --abbrev-commit`.

    The number of output lines is used as revision counter and the first
    line (abbreviated hash and subject of the latest commit) as description.

    :return: "version (Git, <N>th rev) '<first log line>'" on success,
             "(unknown version)" if git exits with an error,
             "cannot determine version" if git could not be run at all
    """
    import subprocess

    cmd = ['git', 'log', '--oneline', '--abbrev-commit']

    try:
        # the context manager closes the pipe and reaps the child process
        with subprocess.Popen(cmd, stdout=subprocess.PIPE) as proc:
            first_line = proc.stdout.readline()
            line_count = sum(1 for _ in proc.stdout) + 1
            exit_code = proc.wait()
    except Exception:
        # e.g. git is not installed; never let this break the import
        return "cannot determine version"

    if 0 != exit_code:
        return "(unknown version)"

    # skip this 1st, 2nd, 3rd stuff and use always [0-9]th
    return "version (Git, %dth rev) '%s'" % (
        line_count, first_line.strip().decode('utf8', errors='replace')
    )


# Computed once, when this module is imported, by calling get_version_git().
VERSION = get_version_git()


def fetch_page(url):
    """
    Download the beginning (at most BUFSIZ bytes) of the given URL.

    Some servers send gzip data although it was not requested; if the body
    starts with the gzip magic number it is decompressed on a best-effort
    basis (on failure the raw bytes are returned).

    :param url: the URL to fetch
    :return: tuple (0, body as bytes, response headers) on success,
             tuple (1, error message, 'dummy') on any failure
    """
    log = logging.getLogger(__name__)
    log.info('fetching page %s', url)
    try:
        request = urllib.request.Request(url)
        request.add_header('User-Agent', USER_AGENT)
        # "with" closes the connection even if reading fails
        with urllib.request.urlopen(request) as response:
            html_text = response.read(BUFSIZ)  # ignore more than BUFSIZ
            headers = response.headers
    except Exception as e:
        log.warning('failed: %s', e)
        return 1, str(e), 'dummy'

    # slicing (instead of indexing) also works for bodies shorter than 2 bytes
    if html_text[:2] == b'\x1f\x8b':
        import zlib
        try:
            # The stream is most likely cut off at BUFSIZ. A decompress
            # object returns what it could decode so far, whereas
            # zlib.decompress() rejects incomplete streams.
            html_text = zlib.decompressobj(zlib.MAX_WBITS | 16).decompress(html_text)
        except zlib.error:
            pass  # not valid gzip after all: hand out the raw bytes

    return 0, html_text, headers


def extract_title(url):
    """
    Fetch the given URL and extract the content of its <title> element.

    :param url: the URL to inspect
    :return: tuple (status, text) where status is
             0: text is the page title (HTML entities expanded)
             1: the document is not of type text/*, text is its content-type
             2: no title found, text is 'no title'
             3: fetching failed (text is an error message) or the URL
                points to the bot's home repo
             -1: the response body was empty, text is 'error'
    """
    log = logging.getLogger(__name__)

    if 'repo/urlbot-native.git' in url:
        log.info('repo URL found: ' + url)
        return 3, 'wee, that looks like my home repo!'

    log.info('extracting title from ' + url)

    (code, html_text, headers) = fetch_page(url)

    if 1 == code:
        return 3, 'failed: %s for %s' % (html_text, url)

    if not html_text:
        return -1, 'error'

    charset = ''
    content_type = ''
    if 'content-type' in headers:
        content_type = headers['content-type']
        log.debug('content-type: ' + content_type)

        # media types are case-insensitive
        if not content_type.lower().startswith('text/'):
            return 1, content_type

        # only take a charset that is really there; stop at ';' and quotes
        found = re.search(
            r'charset\s*=\s*["\']?([^\s;"\']+)', content_type, re.IGNORECASE
        )
        if found:
            charset = found.group(1)

    if isinstance(html_text, bytes):
        # errors='replace': the body may be cut off in the middle of a
        # multi-byte character or simply not match the announced charset
        try:
            html_text = html_text.decode(charset or 'utf-8', errors='replace')
        except LookupError:
            log.warning("invalid charset in '%s': '%s'" % (content_type, charset))
            html_text = html_text.decode('utf-8', errors='replace')
    elif not isinstance(html_text, str):
        html_text = str(html_text)

    result = re.search(r'<title.*?>(.*?)</title>', html_text, re.S | re.IGNORECASE)
    if not result:
        return 2, 'no title'

    # html.unescape() replaces HTMLParser.unescape(), removed in Python 3.9
    return 0, html.unescape(result.group(1))
moved common stuff to common.py; import adjustments 2014-09-27 09:19:46 +02:00			`# -- coding: utf-8 --`
minor cleanup 2015-11-30 19:50:11 +01:00			`""" Common functions for urlbot """`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00			`import html.parser`
			`import logging`
			`import re`
fix ratelimiting the new ratelimiting: use ratelimit(ratelimit_class) to push and verify ratelimit buckets defined in common (can be extended during runtime). 2015-11-28 01:08:26 +01:00			`import time`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00			`import urllib.request`
fix ratelimiting the new ratelimiting: use ratelimit(ratelimit_class) to push and verify ratelimit buckets defined in common (can be extended during runtime). 2015-11-28 01:08:26 +01:00			`from collections import namedtuple`
moved common stuff to common.py; import adjustments 2014-09-27 09:19:46 +02:00
fix ratelimiting the new ratelimiting: use ratelimit(ratelimit_class) to push and verify ratelimit buckets defined in common (can be extended during runtime). 2015-11-28 01:08:26 +01:00			`RATE_NO_LIMIT = 0x00`
whitespace fixes according to pep8; cleanup pep8 --ignore=W191,E225,E501,E401,E302,E122,E123 *.py 2014-12-02 17:01:40 +01:00			`RATE_GLOBAL = 0x01`
			`RATE_NO_SILENCE = 0x02`
moved common stuff to common.py; import adjustments 2014-09-27 09:19:46 +02:00			`RATE_INTERACTIVE = 0x04`
whitespace fixes according to pep8; cleanup pep8 --ignore=W191,E225,E501,E401,E302,E122,E123 *.py 2014-12-02 17:01:40 +01:00			`RATE_CHAT = 0x08`
			`RATE_URL = 0x10`
fix ratelimiting the new ratelimiting: use ratelimit(ratelimit_class) to push and verify ratelimit buckets defined in common (can be extended during runtime). 2015-11-28 01:08:26 +01:00			`RATE_EVENT = 0x20`
			`RATE_FUN = 0x40`
fix totally wrong rate limit storage initialization 2015-11-20 22:23:31 +01:00
moved common stuff to common.py; import adjustments 2014-09-27 09:19:46 +02:00			`BUFSIZ = 8192`
using global uniq USER_AGENT 2015-08-21 23:35:28 +02:00			`EVENTLOOP_DELAY = 0.100 # seconds`
minor cleanup 2015-11-30 19:50:11 +01:00			`USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) ' \`
			`'Gecko/20100101 Firefox/31.0 Iceweasel/31.0'`
moved common stuff to common.py; import adjustments 2014-09-27 09:19:46 +02:00
fix ratelimiting the new ratelimiting: use ratelimit(ratelimit_class) to push and verify ratelimit buckets defined in common (can be extended during runtime). 2015-11-28 01:08:26 +01:00			`Bucket = namedtuple("BucketConfig", ["history", "period", "max_hist_len"])`

			`buckets = {`
all death to the tab character 2015-11-30 19:17:40 +01:00			`# everything else`
			`RATE_GLOBAL: Bucket(history=[], period=60, max_hist_len=10),`
			`# bot writes with no visible stimuli`
			`RATE_NO_SILENCE: Bucket(history=[], period=10, max_hist_len=5),`
			`# interactive stuff like ping`
			`RATE_INTERACTIVE: Bucket(history=[], period=30, max_hist_len=5),`
			`# chitty-chat, master volume control`
			`RATE_CHAT: Bucket(history=[], period=10, max_hist_len=5),`
			`# reacting on URLs`
			`RATE_URL: Bucket(history=[], period=10, max_hist_len=5),`
			`# triggering events`
			`RATE_EVENT: Bucket(history=[], period=60, max_hist_len=10),`
			`# bot blames people, produces cake and entertains`
			`RATE_FUN: Bucket(history=[], period=180, max_hist_len=5),`
fix ratelimiting the new ratelimiting: use ratelimit(ratelimit_class) to push and verify ratelimit buckets defined in common (can be extended during runtime). 2015-11-28 01:08:26 +01:00			`}`

			`rate_limit_classes = buckets.keys()`


			`def rate_limit(rate_class=RATE_GLOBAL):`
all death to the tab character 2015-11-30 19:17:40 +01:00			`"""`
			`Remember N timestamps,`
			`if N[0] newer than now()-T then do not output, do not append.`
			`else pop(0); append()`

			`:param rate_class: the type of message to verify`
			`:return: False if blocked, True if allowed`
			`"""`
			`if rate_class not in rate_limit_classes:`
			`return all(rate_limit(c) for c in rate_limit_classes if c & rate_class)`

			`now = time.time()`
			`bucket = buckets[rate_class]`
minor cleanup 2015-11-30 19:50:11 +01:00			`logging.getLogger(__name__).debug(`
			`"[ratelimit][bucket=%x][time=%s]%s",`
			`rate_class, now, bucket.history`
			`)`
all death to the tab character 2015-11-30 19:17:40 +01:00
			`if len(bucket.history) >= bucket.max_hist_len and bucket.history[0] > (now - bucket.period):`
			`# print("blocked")`
			`return False`
			`else:`
			`if bucket.history and len(bucket.history) > bucket.max_hist_len:`
			`bucket.history.pop(0)`
			`bucket.history.append(now)`
			`return True`
fix ratelimiting the new ratelimiting: use ratelimit(ratelimit_class) to push and verify ratelimit buckets defined in common (can be extended during runtime). 2015-11-28 01:08:26 +01:00

			`def rate_limited(max_per_second):`
all death to the tab character 2015-11-30 19:17:40 +01:00			`"""`
			`very simple flow control context manager`
			`:param max_per_second: how many events per second may be executed - more are delayed`
			`:return:`
			`"""`
			`min_interval = 1.0 / float(max_per_second)`
fix ratelimiting the new ratelimiting: use ratelimit(ratelimit_class) to push and verify ratelimit buckets defined in common (can be extended during runtime). 2015-11-28 01:08:26 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`def decorate(func):`
			`lasttimecalled = [0.0]`
fix ratelimiting the new ratelimiting: use ratelimit(ratelimit_class) to push and verify ratelimit buckets defined in common (can be extended during runtime). 2015-11-28 01:08:26 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`def ratelimitedfunction(args, *kargs):`
			`elapsed = time.clock() - lasttimecalled[0]`
			`lefttowait = min_interval - elapsed`
			`if lefttowait > 0:`
			`time.sleep(lefttowait)`
			`ret = func(args, *kargs)`
			`lasttimecalled[0] = time.clock()`
			`return ret`
fix ratelimiting the new ratelimiting: use ratelimit(ratelimit_class) to push and verify ratelimit buckets defined in common (can be extended during runtime). 2015-11-28 01:08:26 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`return ratelimitedfunction`
fix ratelimiting the new ratelimiting: use ratelimit(ratelimit_class) to push and verify ratelimit buckets defined in common (can be extended during runtime). 2015-11-28 01:08:26 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`return decorate`
fix ratelimiting the new ratelimiting: use ratelimit(ratelimit_class) to push and verify ratelimit buckets defined in common (can be extended during runtime). 2015-11-28 01:08:26 +01:00

moved VERSION stuff to common.py 2014-09-27 09:41:29 +02:00			`def get_version_git():`
all death to the tab character 2015-11-30 19:17:40 +01:00			`import subprocess`
moved VERSION stuff to common.py 2014-09-27 09:41:29 +02:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`cmd = ['git', 'log', '--oneline', '--abbrev-commit']`
moved VERSION stuff to common.py 2014-09-27 09:41:29 +02:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`try:`
			`p = subprocess.Popen(cmd, bufsize=1, stdout=subprocess.PIPE)`
			`first_line = p.stdout.readline()`
			`line_count = len(p.stdout.readlines()) + 1`
moved VERSION stuff to common.py 2014-09-27 09:41:29 +02:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`if 0 == p.wait():`
			`# skip this 1st, 2nd, 3rd stuff and use always [0-9]th`
			`return "version (Git, %dth rev) '%s'" % (`
			`line_count, str(first_line.strip(), encoding='utf8')`
			`)`
			`else:`
			`return "(unknown version)"`
			`except:`
			`return "cannot determine version"`
moved VERSION stuff to common.py 2014-09-27 09:41:29 +02:00
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00
moved VERSION stuff to common.py 2014-09-27 09:41:29 +02:00			`VERSION = get_version_git()`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00

			`def fetch_page(url):`
all death to the tab character 2015-11-30 19:17:40 +01:00			`log = logging.getLogger(__name__)`
			`log.info('fetching page ' + url)`
			`try:`
			`request = urllib.request.Request(url)`
			`request.add_header('User-Agent', USER_AGENT)`
			`response = urllib.request.urlopen(request)`
			`html_text = response.read(BUFSIZ) # ignore more than BUFSIZ`
handle some pages sending gzip without the accept header 2015-12-19 22:53:28 +01:00			`if html_text[0] == 0x1f and html_text[1] == 0x8b:`
			`import zlib`
			`try:`
			`gzip_data = zlib.decompress(html_text, zlib.MAX_WBITS \| 16)`
			`except:`
			`pass`
			`else:`
			`html_text = gzip_data`
all death to the tab character 2015-11-30 19:17:40 +01:00			`response.close()`
			`return 0, html_text, response.headers`
			`except Exception as e:`
			`log.warn('failed: %s' % e)`
			`return 1, str(e), 'dummy'`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00

			`def extract_title(url):`
all death to the tab character 2015-11-30 19:17:40 +01:00			`log = logging.getLogger(__name__)`
			`global parser`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`if 'repo/urlbot-native.git' in url:`
			`log.info('repo URL found: ' + url)`
			`return 3, 'wee, that looks like my home repo!'`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`log.info('extracting title from ' + url)`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`(code, html_text, headers) = fetch_page(url)`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`if 1 == code:`
			`return 3, 'failed: %s for %s' % (html_text, url)`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`if not html_text:`
			`return -1, 'error'`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`charset = ''`
			`if 'content-type' in headers:`
			`log.debug('content-type: ' + headers['content-type'])`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`if 'text/' != headers['content-type'][:len('text/')]:`
			`return 1, headers['content-type']`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`charset = re.sub(`
			`r'.charset=(?P<charset>\S+).',`
			`r'\g<charset>', headers['content-type'], re.IGNORECASE`
			`)`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`if '' != charset:`
			`try:`
			`html_text = html_text.decode(charset)`
			`except LookupError:`
			`log.warn("invalid charset in '%s': '%s'" % (headers['content-type'], charset))`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`if str != type(html_text):`
			`html_text = str(html_text)`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`result = re.match(r'.?<title.?>(.?)</title>.?', html_text, re.S \| re.M \| re.IGNORECASE)`
			`if result:`
			`match = result.groups()[0]`
refactor urlbot plugin structure and code style 2015-11-20 21:07:48 +01:00
all death to the tab character 2015-11-30 19:17:40 +01:00			`parser = html.parser.HTMLParser()`
			`try:`
			`expanded_html = parser.unescape(match)`
			`except UnicodeDecodeError as e: # idk why this can happen, but it does`
			`log.warn('parser.unescape() expoded here: ' + str(e))`
			`expanded_html = match`
			`return 0, expanded_html`
			`else:`
			`return 2, 'no title'`