def levenshtein(a, b, return_table=False):
    '''Return the Levenshtein (edit) distance between the sequences a and b.

    a, b:         indexable sequences with a length (typically str)
    return_table: if True, return the tuple (table, distance) instead of
                  only the distance; table[i][j] is the distance between
                  a[:i] and b[:j]
    '''
    rows = len(a) + 1
    cols = len(b) + 1

    # initialise a table with 0, but the 0-row/0-column with their index
    # (turning a prefix into the empty sequence costs one edit per item)
    d = [[(i if 0 == j else j if 0 == i else 0) for j in range(cols)] for i in range(rows)]

    for i in range(1, rows):
        for j in range(1, cols):
            if a[i-1] == b[j-1]:
                d[i][j] = d[i-1][j-1]
            else:
                d[i][j] = min(
                    d[i-1][j] + 1,    # deletion
                    d[i][j-1] + 1,    # insertion
                    d[i-1][j-1] + 1,  # substitution
                )

    # Read the result from the last cell explicitly.  Relying on the loop
    # variables is wrong when a is empty: the loops never run, so the lookup
    # would hit d[0][0] (== 0) instead of d[0][len(b)].
    distance = d[rows - 1][cols - 1]

    if return_table:
        return (d, distance)
    return distance
def get_reply_data(data, field=0):
    '''Return one whitespace-separated field of a chat line, or None.

    data:  the raw chat line
    field: zero-based index of the wanted field; field 0 is returned with
           surrounding '<' and '>' characters stripped

    Returns None if the line has no such field.  Negative indexes are
    treated as "no such field" as well instead of counting from the end
    (or raising IndexError).
    '''
    # FIXME: we can't determine if a user named 'foo> ' just wrote ' > bar'
    # or a user 'foo' just wrote '> > bar'
    words = data.split()

    if not 0 <= field < len(words):
        return None

    if 0 == field:
        return words[0].strip('<>')

    return words[field]
def parse_cve(args):
    '''Parse plugin: answer a CVE handle with its Debian tracker link.

    Called with the string 'register' it returns the plugin description
    (name, required args, rate limit class).  Otherwise args is a dict
    carrying the chat line in 'data'; the line is upper-cased and the
    first CVE handle found in it is turned into a tracker URL.  Returns
    None if the line holds no CVE handle.
    '''
    if args == 'register':
        registration = {'name': 'parse a CVE handle'}
        registration['args'] = ('data',)
        registration['ratelimit_class'] = RATE_NO_SILENCE | RATE_GLOBAL
        return registration

    hit = re.search(r'(CVE-\d\d\d\d-\d+)', args['data'].upper())
    if hit is None:
        return None

    logger('plugin', 'detected CVE handle')
    tracker_url = 'https://security-tracker.debian.org/tracker/%s' % hit.group(1)
    return {'msg': tracker_url}
def command_help(args):
    '''Command plugin: print the description of another command.

    Called with the string 'register' it returns the plugin description.
    Otherwise args is a dict with 'data' (the chat line) and 'reply_user';
    the word following the first 'help' word of the line is taken as the
    command name.  Returns None if the line has no 'help' word, else a
    dict whose 'msg' is the answer for the requesting user.
    '''
    if args == 'register':
        return {
            'name': 'help',
            'desc': 'print help for a command',
            'args': ('data', 'reply_user', 'cmd_list'),
            'ratelimit_class': RATE_GLOBAL
        }

    words = args['data'].split()
    if 'help' not in words:  # no match on 'help'
        return None

    # everything behind the first 'help'; only the first word matters
    following = words[words.index('help') + 1:]
    if not following:
        logger('plugin', 'empty help request')
        return {'msg': args['reply_user'] + ': no command given'}

    cmd = following[0]
    names = [p['name'] for p in plugins['command']]

    if cmd not in names:
        logger('plugin', 'no help found for %s' % cmd)
        return {'msg': args['reply_user'] + (': no such command: %s' % cmd)}

    # the first plugin registered under that name wins
    plugin = plugins['command'][names.index(cmd)]
    logger('plugin', 'sent help for %s' % cmd)
    return {
        'msg': args['reply_user'] + (': help for %s: %s' % (cmd, plugin['desc']))
    }
def command_unicode(args):
    '''Command plugin: print a small box drawn with unicode characters.

    Called with the string 'register' it returns the plugin description.
    Otherwise args is a dict with 'data' (the chat line) and 'reply_user';
    if the line contains 'unikot', a dict is returned whose 'msg' is a
    tuple of three lines, each prefixed with the requesting user.
    Returns None for any other line.
    '''
    if args == 'register':
        return {
            'name': 'unikot',
            'desc': 'prints an unicode string',
            'args': ('data', 'reply_user'),
            'ratelimit_class': RATE_GLOBAL
        }

    if 'unikot' not in args['data']:
        return None

    logger('plugin', 'sent some unicode')

    box_rows = (
        ': ┌────────┐',
        ': │Unicode!│',
        ': └────────┘',
    )
    return {'msg': tuple(args['reply_user'] + row for row in box_rows)}
def command_uptime(args):
    '''Command plugin: report uptime and the number of requests served.

    Called with the string 'register' it returns the plugin description.
    Otherwise args is a dict with 'data' (the chat line) and 'reply_user';
    if the line contains 'uptime', a dict with the statistics sentence in
    'msg' is returned, else None.
    '''
    if args == 'register':
        return {
            'name': 'uptime',
            'desc': 'prints uptime',
            'args': ('data', 'reply_user'),
            'ratelimit_class': RATE_GLOBAL
        }

    if 'uptime' not in args['data']:
        return None

    # the 'uptime' config value is added to the current time to get the
    # number of seconds served
    seconds = int(conf('uptime') + time.time())
    requests = conf('request_counter')

    # only a count of exactly one is singular
    second_suffix = '' if seconds == 1 else 's'
    request_suffix = '' if requests == 1 else 's'

    logger('plugin', 'sent statistics')
    sentence = ''': happily serving for %d second%s, %d request%s so far.''' % (
        seconds, second_suffix, requests, request_suffix
    )
    return {'msg': args['reply_user'] + sentence}
def command_teatimer(args):
    '''Command plugin: schedule a "your tea is ready" reminder.

    Called with the string 'register' it returns the plugin description.
    Otherwise args is a dict with 'reply_user', 'argv0' (the command word)
    and 'argv1' (optional steep time in seconds, None if not given).

    Returns None if argv0 is not 'teatimer', else a dict whose 'msg'
    either confirms the timer or explains why it could not be set.
    '''
    if 'register' == args:
        return {
            'name': 'teatimer',
            'desc': 'sets a tea timer to $1 or currently %d seconds' % conf('tea_steep_time'),
            'args': ('reply_user', 'argv0', 'argv1'),
            'ratelimit_class': RATE_GLOBAL
        }

    if 'teatimer' != args['argv0']:
        return None

    steep = conf('tea_steep_time')

    if args['argv1'] is not None:
        try:
            steep = int(args['argv1'])
        except (TypeError, ValueError) as e:
            return {
                'msg': args['reply_user'] + ': error when parsing int(%s): %s' % (
                    args['argv1'], str(e)
                )
            }

    ready = time.time() + steep

    # Format the timestamp once, before anything is scheduled.  Besides
    # ValueError, time.localtime() raises OverflowError or OSError for
    # timestamps outside the platform's range, which a user can trigger
    # with a huge steep time.
    try:
        ready_str = time.strftime('%F.%T', time.localtime(ready))
    except (ValueError, OverflowError, OSError) as e:
        return {
            'msg': args['reply_user'] + ': time format error: ' + str(e)
        }

    logger('plugin', 'tea timer set to %s' % ready_str)
    register_event(ready, chat_write, args['reply_user'] + ': Your tea is ready!')

    return {
        'msg': args['reply_user'] + ': Tea timer set to %s' % ready_str
    }
def command_show_blacklist(args):
    '''Command plugin: list the configured URL blacklist patterns.

    Called with the string 'register' it returns the plugin description.
    Otherwise args is a dict with 'data' (the chat line) and 'reply_user';
    if the line contains 'show-blacklist', a dict is returned whose 'msg'
    is a list with one line per entry of the 'url_blacklist' config value.
    Returns None for any other line.
    '''
    if args == 'register':
        return {
            'name': 'show-blacklist',
            'desc': 'show the current URL blacklist',
            'args': ('data', 'reply_user'),
            'ratelimit_class': RATE_GLOBAL
        }

    if 'show-blacklist' not in args['data']:
        return None

    logger('plugin', 'sent URL blacklist')

    lines = []
    for pattern in conf('url_blacklist'):
        lines.append(args['reply_user'] + ': URL blacklist: ' + pattern)

    return {'msg': lines}
required arg for %s: %s' %(p['name'], a)) + + ret = p['func'](args) + + if None != ret: + if 'msg' in list(ret.keys()): + if str == type(ret['msg']): # FIXME 2to3 + ratelimit_touch(RATE_CHAT) + if ratelimit_exceeded(RATE_CHAT): + return False + + chat_write(ret['msg']) + else: + for line in ret['msg']: + ratelimit_touch(RATE_CHAT) + if ratelimit_exceeded(RATE_CHAT): + return False + + chat_write(line) + + return None + + ret = command_else({'reply_user': reply_user}) + if None != ret: + if ratelimit_exceeded(RATE_GLOBAL): + return False + + if 'msg' in list(ret.keys()): + if list is type(ret['msg']): + for m in ret['msg']: + chat_write(m) + else: + chat_write(ret['msg']) + +funcs = {} +funcs['parse'] = (parse_mental_ill, parse_skynet, parse_debbug, parse_cve) +funcs['command'] = ( + command_command, command_help, command_version, command_unicode, + command_klammer, command_source, command_dice, command_uptime, command_ping, + command_info, command_teatimer, command_decode, command_show_blacklist +) + +_dir = dir() + +if debug_enabled(): + def _chat_write(a): logger('chat_write', a) + def _conf(a): return 'bot' + def _ratelimit_exceeded(ignored=None): return False + def _ratelimit_touch(ignored=None): return True + + try: chat_write + except NameError: chat_write = _chat_write + try: conf + except NameError: conf = _conf + try: ratelimit_exceeded + except NameError: ratelimit_exceeded = _ratelimit_exceeded + try: ratelimit_touch + except NameError: ratelimit_touch = _ratelimit_touch + + logger('info', 'debugging enabled') + +def register(func_type, auto=False): + plugins[func_type] = [] + + if auto: + # FIXME: this is broken. dir() returns str, but not + # the addr of the functions which we'd need here. 
def event_trigger(jobs=None, now=None):
    '''Run and remove every scheduled job whose time has passed.

    jobs: list of (time, callback, args) tuples as built by
          register_event(); defaults to the module-level joblist.  The
          list is modified in place.
    now:  timestamp the job times are compared against; defaults to
          time.time().

    A job is due if its time is strictly less than now.  Due jobs are
    taken out of the list before their callbacks run, in list order, so
    no job is skipped and a job is never run twice.
    '''
    if jobs is None:
        jobs = joblist

    if not jobs:
        return

    if now is None:
        now = time.time()

    due = [job for job in jobs if job[0] < now]
    if not due:
        return

    # Slice-assign instead of deleting while iterating: deleting inside
    # the loop made the iterator skip the job behind every removed one.
    # The list object stays the same, so callers holding it see the change.
    jobs[:] = [job for job in jobs if not job[0] < now]

    for (t, callback, args) in due:
        callback(args)
class TestEventlooper(unittest.TestCase):
    '''Tests for the page fetching helper of urlbot.'''

    def test_broken_url(self):
        """
        Test that broken socket calls are not breaking: fetch_page() must
        not raise, but report the failure in its return value
        """
        broken_url = 'http://foo'
        (status, text, headers) = fetch_page(url=broken_url)

        # fetch_page() answers a failed request with status 1, the
        # stringified exception and the placeholder 'dummy' as headers
        self.assertEqual(status, 1)
        self.assertIsInstance(text, str)
        self.assertEqual(headers, 'dummy')
def extract_title(url):
    '''Fetch url and return a (status, text) tuple describing its title.

    status  text
         0  the page title with HTML entities expanded
         1  the content type, for pages which are not 'text/...'
         2  'no title', the page has no <title> element
         3  a ready-made message (home repo URL, or the fetch failed)
        -1  'error', the page came back empty
    '''
    if 'repo/urlbot.git' in url:
        logger('info', 'repo URL found: ' + url)
        return (3, 'wee, that looks like my home repo!')

    logger('info', 'extracting title from ' + url)

    (code, html_text, headers) = fetch_page(url)

    if 1 == code:
        return (3, 'failed: %s for %s' %(html_text, url))

    if not html_text:
        return (-1, 'error')

    content_type = ''
    charset = None
    if 'content-type' in headers:
        content_type = headers['content-type']
        logger('debug', 'content-type: ' + content_type)

        if not content_type.startswith('text/'):
            return (1, content_type)

        # search instead of re.sub(): a header without charset yields no
        # charset rather than the whole header, and IGNORECASE is really
        # passed as flags (re.sub's 4th positional parameter is count)
        found = re.search(r'charset=["\']?([^\s;"\']+)', content_type, re.IGNORECASE)
        if found:
            charset = found.group(1)

    if isinstance(html_text, bytes):
        # errors='replace': only the first BUFSIZ bytes are fetched, so a
        # multi-byte character may be cut in half at the end
        try:
            html_text = html_text.decode(charset or 'utf-8', errors='replace')
        except LookupError:
            logger('warn', 'invalid charset in ' + content_type)
            html_text = html_text.decode('utf-8', errors='replace')

    result = re.search(r'<title[^>]*>(.*?)</title>', html_text, re.S | re.IGNORECASE)
    if not result:
        return (2, 'no title')

    # html.unescape() replaces HTMLParser.unescape(), which is gone
    # since Python 3.9
    return (0, html.unescape(result.group(1)))
def ratelimit_exceeded(ignored=None): # FIXME: separate counters
    '''Tell whether the bot is rate limited right now.

    ignored: the rate limit class; not evaluated yet (see FIXME)

    Returns True if hist_ts holds more than 'hist_max_count' timestamps
    and the oldest one is younger than 'hist_max_time' seconds, else
    False.  The oldest timestamp is dropped as part of the check.  The
    chat is told about the limit only once per limited period, which is
    tracked in hist_flag.
    '''
    global hist_flag

    # NOTE(review): ratelimit_touch() trims hist_ts to 'hist_max_count'
    # entries, so with that as the only writer this condition looks
    # unreachable -- confirm the intended interplay.
    if conf('hist_max_count') < len(hist_ts):
        first = hist_ts.pop(0)
        if (time.time() - first) < conf('hist_max_time'):
            if hist_flag:
                hist_flag = False
                # the history may be empty after the pop above
                oldest = hist_ts[0] if hist_ts else first
                chat_write('(rate limited to %d messages in %d seconds, try again at %s)' %(conf('hist_max_count'), conf('hist_max_time'), time.strftime('%T %Z', time.localtime(oldest + conf('hist_max_time')))))

            # repr() instead of pickle.dumps(): the latter returns bytes,
            # and str + bytes raises TypeError on Python 3
            logger('warn', 'rate limiting exceeded: ' + repr(hist_ts))
            return True

    hist_flag = True
    return False
def parse_delete(filepath):
    '''Read one event file, delete it and act on its content.

    filepath: path of the event file

    At most BUFSIZ characters are read.  Lines written by the bot itself,
    subject changes, private messages, lines containing 'nospoiler' and
    the bot's own log lines are dropped; everything else is searched for
    URLs and, if extract_url() did not return True, handed to the command
    and parse plugins.

    Returns False if the file could not be opened, else None.
    '''
    try:
        # errors='replace': an undecodable byte must not raise before
        # the file is removed, or the same file would fail again on
        # every following pass
        with open(filepath, 'r', errors='replace') as fd:
            content = fd.read(BUFSIZ) # ignore more than BUFSIZ
    except IOError as e:
        logger('err', 'file has vanished: %s: %s' % (filepath, e))
        return False

    os.remove(filepath) # probably better crash here

    bot_user = conf('bot_user')
    if content[1:1+len(bot_user)] == bot_user:
        return

    if 'has set the subject to:' in content:
        return

    if content.startswith('PRIV#'):
        parse_pn(content)
        return

    if 'nospoiler' in content:
#       logger('info', "no spoiler for: " + content)
        return

    if sys.argv[0] in content:
        logger('info', 'silenced, this is my own log')
        return

    if extract_url(content) is not True:
        plugins.data_parse_commands(content)
        plugins.data_parse_other(content)
    return