# mirror of http://aero2k.de/t/repos/urlbot-native.git synced 2017-09-06 15:25:38 +02:00
# urlbot-native-trex/urlbot.py
#!/usr/bin/python
# 2014-08-10 22:10:00 +02:00
# -*- coding: utf-8 -*-
# 2014-07-20 23:39:51 +02:00
import sys, os, re, time, urllib, pickle, HTMLParser, stat
from local_config import conf, set_conf
from common import *
2014-07-20 23:39:51 +02:00
2014-07-21 04:54:50 +02:00
# rate limiting to 5 messages per 10 minutes
hist_ts = []
hist_flag = True
2014-07-21 04:54:50 +02:00
parser = None
class urllib_user_agent_wrapper(urllib.FancyURLopener):
version = '''Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0 Iceweasel/31.0'''
2014-07-21 00:53:26 +02:00
def fetch_page(url):
logger('info', 'fetching page ' + url)
try:
urllib._urlopener = urllib_user_agent_wrapper()
response = urllib.urlopen(url)
2014-07-21 08:28:46 +02:00
html = response.read(BUFSIZ) # ignore more than BUFSIZ
response.close()
2014-07-27 12:21:32 +02:00
return (html, response.headers)
except IOError as e:
logger('warn', 'failed: ' + e.errno)
2014-08-09 23:39:00 +02:00
2014-08-09 22:39:19 +02:00
return (None, None)
2014-07-21 00:53:26 +02:00
def extract_title(url):
global parser
if 'repo/urlbot.git' in url:
logger('info', 'repo URL found: ' + url)
return (3, 'wee, that looks like my home repo!')
2014-07-21 00:53:26 +02:00
logger('info', 'extracting title from ' + url)
2014-07-27 12:21:32 +02:00
(html, headers) = fetch_page(url)
if html:
charset = ''
2014-07-27 12:21:32 +02:00
if 'content-type' in headers:
logger('debug', 'content-type: ' + headers['content-type'])
2014-07-27 12:21:32 +02:00
if 'text/' != headers['content-type'][:len('text/')]:
return (1, headers['content-type'])
charset = re.sub('.*charset=(?P<charset>\S+).*',
'\g<charset>', headers['content-type'], re.IGNORECASE)
2014-08-09 23:39:00 +02:00
result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html, re.S | re.M | re.IGNORECASE)
if result:
match = result.groups()[0]
# if 'charset=UTF-8' in headers['content-type']:
# match = unicode(match)
if None == parser:
parser = HTMLParser.HTMLParser()
if '' != charset:
try:
match = match.decode(charset)
except LookupError:
2014-09-21 19:52:49 +02:00
logger('warn', 'invalid charset in ' + headers['content-type'])
try:
expanded_html = parser.unescape(match)
except UnicodeDecodeError as e: # idk why this can happen, but it does
logger('warn', 'parser.unescape() expoded here: ' + str(e))
expanded_html = match
return (0, expanded_html)
2014-08-01 20:49:07 +02:00
else:
return (2, 'no title')
2014-08-09 23:39:00 +02:00
2014-08-01 20:49:07 +02:00
return (-1, 'error')
2014-07-20 23:39:51 +02:00
def chat_write(message, prefix='/say '):
set_conf('request_counter', conf('request_counter') + 1)
2014-08-02 09:20:52 +02:00
2014-07-21 08:28:46 +02:00
if debug_enabled():
print message
else:
try:
fd = open(fifo_path, 'wb')
# FIXME: somehow, unicode chars can end up inside a <str> message,
# which seems to make both unicode() and ''.encode('utf8') fail.
try:
msg = unicode(prefix) + unicode(message) + '\n'
msg = msg.encode('utf8')
except UnicodeDecodeError:
msg = prefix + message + '\n'
fd.write(msg)
2014-07-21 08:28:46 +02:00
fd.close()
except IOError:
logger('err', "couldn't print to fifo " + fifo_path)
2014-09-27 05:51:18 +02:00
def ratelimit_touch(ignored=None): # FIXME: separate counters
2014-09-27 08:43:33 +02:00
hist_ts.append(time.time())
2014-07-21 04:54:50 +02:00
2014-09-27 05:56:39 +02:00
if conf('hist_max_count') < len(hist_ts):
2014-09-27 05:51:18 +02:00
hist_ts.pop(0)
def ratelimit_exceeded(ignored=None): # FIXME: separate counters
global hist_flag
2014-09-27 05:56:39 +02:00
if conf('hist_max_count') < len(hist_ts):
2014-07-21 04:54:50 +02:00
first = hist_ts.pop(0)
2014-09-27 08:43:33 +02:00
if (time.time() - first) < conf('hist_max_time'):
if hist_flag:
hist_flag = False
2014-09-27 05:56:39 +02:00
chat_write('(rate limited to %d messages in %d seconds, try again at %s)' %(conf('hist_max_count'), conf('hist_max_time'), time.strftime('%T %Z', time.localtime(hist_ts[0] + conf('hist_max_time')))))
2014-07-21 04:54:50 +02:00
logger('warn', 'rate limiting exceeded: ' + pickle.dumps(hist_ts))
return True
hist_flag = True
2014-07-21 04:54:50 +02:00
return False
2014-07-20 23:39:51 +02:00
def extract_url(data):
2014-08-02 20:48:06 +02:00
ret = None
2014-07-21 09:59:09 +02:00
result = re.findall("(https?://[^\s>]+)", data)
2014-07-20 23:39:51 +02:00
if result:
for r in result:
2014-09-27 05:51:18 +02:00
ratelimit_touch()
2014-07-21 04:54:50 +02:00
if ratelimit_exceeded():
return False
2014-07-27 12:21:32 +02:00
(status, title) = extract_title(r)
2014-07-21 00:53:26 +02:00
2014-07-27 12:21:32 +02:00
if 0 == status:
2014-08-01 20:49:07 +02:00
message = 'Title: %s: %s' % (title.strip(), e(r))
elif 1 == status:
logger('info', 'no message sent for non-text %s (%s)' %(r, title))
continue
2014-08-01 20:49:07 +02:00
elif 2 == status:
message = 'No title: %s' % (e(r))
elif 3 == status:
message = title
2014-08-01 20:49:07 +02:00
else:
message = 'some error occurred when fetching %s' % e(r)
2014-07-22 22:23:10 +02:00
message = message.replace('\n', '\\n')
2014-07-20 23:39:51 +02:00
logger('info', 'printing ' + message)
2014-07-21 08:28:46 +02:00
chat_write(message)
2014-08-02 20:48:06 +02:00
ret = True
return ret
def parse_pn(data):
## reply_user = data.split(' ')[0].strip('<>')
# since we can't determine if a user named 'foo> ' just wrote ' > bar'
# or a user 'foo' just wrote '> > bar', we can't safely answer here
logger('warn', 'received PN: ' + data)
return False
2014-07-20 23:39:51 +02:00
def parse_delete(filepath):
try:
fd = open(filepath, 'rb')
2014-07-21 08:28:46 +02:00
except IOError:
2014-07-20 23:39:51 +02:00
logger('err', 'file has vanished: ' + filepath)
2014-07-21 08:28:46 +02:00
return False
2014-07-20 23:39:51 +02:00
content = fd.read(BUFSIZ) # ignore more than BUFSIZ
fd.close()
os.remove(filepath) # probably better crash here
2014-07-20 23:39:51 +02:00
2014-09-27 05:51:18 +02:00
if content[1:1+len(conf('bot_user'))] == conf('bot_user'):
return
2014-07-20 23:39:51 +02:00
if 'has set the subject to:' in content:
return
if content.startswith('PRIV#'):
parse_pn(content)
return
2014-09-17 15:49:52 +02:00
if 'nospoiler' in content:
logger('info', "no spoiler for: " + content)
return
2014-07-20 23:39:51 +02:00
if True != extract_url(content):
plugins.data_parse_commands(content)
plugins.data_parse_other(content)
return
2014-07-20 23:39:51 +02:00
def get_version_git():
2014-08-09 23:39:00 +02:00
import subprocess
cmd = ['git', 'log', '-n', '1', '--oneline', '--abbrev-commit']
p = subprocess.Popen(cmd, bufsize=1, stdout=subprocess.PIPE)
first_line = p.stdout.readline()
if 0 == p.wait():
return "version (Git) '%s'" % e(first_line.strip())
else:
return "(unknown version)"
import plugins
2014-09-27 06:03:04 +02:00
plugins.chat_write = chat_write
plugins.ratelimit_exceeded = ratelimit_exceeded
2014-09-27 05:51:18 +02:00
plugins.ratelimit_touch = ratelimit_touch
2014-09-27 06:03:04 +02:00
plugins.register_all()
2014-08-09 23:50:40 +02:00
if '__main__' == __name__:
2014-09-27 06:12:34 +02:00
set_conf('version', get_version_git())
print sys.argv[0] + ' ' + conf('version')
2014-09-21 20:10:37 +02:00
if not os.path.exists(fifo_path):
logger('error', 'fifo_path "%s" does not exist, exiting' % fifo_path)
exit(1)
if not stat.S_ISFIFO(os.stat(fifo_path).st_mode):
logger('error', 'fifo_path "%s" is not a FIFO, exiting' % fifo_path)
exit(1)
while 1:
try:
for f in os.listdir(event_files_dir):
if 'mcabber-' == f[:8]:
parse_delete(os.path.join(event_files_dir, f))
2014-07-20 23:39:51 +02:00
time.sleep(delay)
except KeyboardInterrupt:
print ""
exit(130)