# mirror of http://aero2k.de/t/repos/urlbot-native.git synced 2017-09-06 15:25:38 +02:00
# urlbot-native-trex/urlbot.py
#!/usr/bin/python
# 2014-08-10 22:10:00 +02:00
# -*- coding: utf-8 -*-
# 2014-07-20 23:39:51 +02:00
import sys, os, re, time, urllib, pickle, HTMLParser, stat
from local_config import conf, set_conf
from common import *
2014-07-20 23:39:51 +02:00
2014-07-21 04:54:50 +02:00
# rate limiting to 5 messages per 10 minutes
hist_ts = []
hist_flag = True
2014-07-21 04:54:50 +02:00
parser = None
class urllib_user_agent_wrapper(urllib.FancyURLopener):
version = '''Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0 Iceweasel/31.0'''
2014-07-21 00:53:26 +02:00
def fetch_page(url):
logger('info', 'fetching page ' + url)
try:
urllib._urlopener = urllib_user_agent_wrapper()
response = urllib.urlopen(url)
2014-07-21 08:28:46 +02:00
html = response.read(BUFSIZ) # ignore more than BUFSIZ
response.close()
2014-07-27 12:21:32 +02:00
return (html, response.headers)
except IOError as e:
logger('warn', 'failed: ' + e.errno)
2014-08-09 23:39:00 +02:00
2014-08-09 22:39:19 +02:00
return (None, None)
2014-07-21 00:53:26 +02:00
def extract_title(url):
global parser
if 'repo/urlbot.git' in url:
logger('info', 'repo URL found: ' + url)
return (3, 'wee, that looks like my home repo!')
2014-07-21 00:53:26 +02:00
logger('info', 'extracting title from ' + url)
2014-07-27 12:21:32 +02:00
(html, headers) = fetch_page(url)
if html:
charset = ''
2014-07-27 12:21:32 +02:00
if 'content-type' in headers:
logger('debug', 'content-type: ' + headers['content-type'])
2014-07-27 12:21:32 +02:00
if 'text/' != headers['content-type'][:len('text/')]:
return (1, headers['content-type'])
charset = re.sub('.*charset=(?P<charset>\S+).*',
'\g<charset>', headers['content-type'], re.IGNORECASE)
2014-08-09 23:39:00 +02:00
result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html, re.S | re.M | re.IGNORECASE)
if result:
match = result.groups()[0]
# if 'charset=UTF-8' in headers['content-type']:
# match = unicode(match)
if None == parser:
parser = HTMLParser.HTMLParser()
if '' != charset:
try:
match = match.decode(charset)
except LookupError:
2014-09-21 19:52:49 +02:00
logger('warn', 'invalid charset in ' + headers['content-type'])
try:
expanded_html = parser.unescape(match)
except UnicodeDecodeError as e: # idk why this can happen, but it does
logger('warn', 'parser.unescape() expoded here: ' + str(e))
expanded_html = match
return (0, expanded_html)
2014-08-01 20:49:07 +02:00
else:
return (2, 'no title')
2014-08-09 23:39:00 +02:00
2014-08-01 20:49:07 +02:00
return (-1, 'error')
2014-07-20 23:39:51 +02:00
def chat_write(message, prefix='/say '):
set_conf('request_counter', conf('request_counter') + 1)
2014-08-02 09:20:52 +02:00
2014-07-21 08:28:46 +02:00
if debug_enabled():
print message
else:
try:
fd = open(fifo_path, 'wb')
# FIXME: somehow, unicode chars can end up inside a <str> message,
# which seems to make both unicode() and ''.encode('utf8') fail.
try:
msg = unicode(prefix) + unicode(message) + '\n'
msg = msg.encode('utf8')
except UnicodeDecodeError:
msg = prefix + message + '\n'
fd.write(msg)
2014-07-21 08:28:46 +02:00
fd.close()
except IOError:
logger('err', "couldn't print to fifo " + fifo_path)
2014-09-27 05:51:18 +02:00
def ratelimit_touch(ignored=None): # FIXME: separate counters
2014-09-27 08:43:33 +02:00
hist_ts.append(time.time())
2014-07-21 04:54:50 +02:00
2014-09-27 05:56:39 +02:00
if conf('hist_max_count') < len(hist_ts):
2014-09-27 05:51:18 +02:00
hist_ts.pop(0)
def ratelimit_exceeded(ignored=None): # FIXME: separate counters
global hist_flag
2014-09-27 05:56:39 +02:00
if conf('hist_max_count') < len(hist_ts):
2014-07-21 04:54:50 +02:00
first = hist_ts.pop(0)
2014-09-27 08:43:33 +02:00
if (time.time() - first) < conf('hist_max_time'):
if hist_flag:
hist_flag = False
2014-09-27 05:56:39 +02:00
chat_write('(rate limited to %d messages in %d seconds, try again at %s)' %(conf('hist_max_count'), conf('hist_max_time'), time.strftime('%T %Z', time.localtime(hist_ts[0] + conf('hist_max_time')))))
2014-07-21 04:54:50 +02:00
logger('warn', 'rate limiting exceeded: ' + pickle.dumps(hist_ts))
return True
hist_flag = True
2014-07-21 04:54:50 +02:00
return False
2014-07-20 23:39:51 +02:00
def extract_url(data):
2014-08-02 20:48:06 +02:00
ret = None
2014-07-21 09:59:09 +02:00
result = re.findall("(https?://[^\s>]+)", data)
2014-07-20 23:39:51 +02:00
if result:
for r in result:
2014-09-27 05:51:18 +02:00
ratelimit_touch()
2014-07-21 04:54:50 +02:00
if ratelimit_exceeded():
return False
2014-07-27 12:21:32 +02:00
(status, title) = extract_title(r)
2014-07-21 00:53:26 +02:00
2014-07-27 12:21:32 +02:00
if 0 == status:
2014-08-01 20:49:07 +02:00
message = 'Title: %s: %s' % (title.strip(), e(r))
elif 1 == status:
logger('info', 'no message sent for non-text %s (%s)' %(r, title))
continue
2014-08-01 20:49:07 +02:00
elif 2 == status:
message = 'No title: %s' % (e(r))
elif 3 == status:
message = title
2014-08-01 20:49:07 +02:00
else:
message = 'some error occurred when fetching %s' % e(r)
2014-07-22 22:23:10 +02:00
message = message.replace('\n', '\\n')
2014-07-20 23:39:51 +02:00
logger('info', 'printing ' + message)
2014-07-21 08:28:46 +02:00
chat_write(message)
2014-08-02 20:48:06 +02:00
ret = True
return ret
def parse_pn(data):
## reply_user = data.split(' ')[0].strip('<>')
# since we can't determine if a user named 'foo> ' just wrote ' > bar'
# or a user 'foo' just wrote '> > bar', we can't safely answer here
logger('warn', 'received PN: ' + data)
return False
2014-07-20 23:39:51 +02:00
def parse_delete(filepath):
try:
fd = open(filepath, 'rb')
2014-07-21 08:28:46 +02:00
except IOError:
2014-07-20 23:39:51 +02:00
logger('err', 'file has vanished: ' + filepath)
2014-07-21 08:28:46 +02:00
return False
2014-07-20 23:39:51 +02:00
content = fd.read(BUFSIZ) # ignore more than BUFSIZ
fd.close()
os.remove(filepath) # probably better crash here
2014-07-20 23:39:51 +02:00
2014-09-27 05:51:18 +02:00
if content[1:1+len(conf('bot_user'))] == conf('bot_user'):
return
2014-07-20 23:39:51 +02:00
if 'has set the subject to:' in content:
return
if content.startswith('PRIV#'):
parse_pn(content)
return
2014-09-17 15:49:52 +02:00
if 'nospoiler' in content:
logger('info', "no spoiler for: " + content)
return
2014-07-20 23:39:51 +02:00
if True != extract_url(content):
plugins.data_parse_commands(content)
plugins.data_parse_other(content)
return
2014-07-20 23:39:51 +02:00
def get_version_git():
2014-08-09 23:39:00 +02:00
import subprocess
cmd = ['git', 'log', '-n', '1', '--oneline', '--abbrev-commit']
p = subprocess.Popen(cmd, bufsize=1, stdout=subprocess.PIPE)
first_line = p.stdout.readline()
if 0 == p.wait():
return "version (Git) '%s'" % e(first_line.strip())
else:
return "(unknown version)"
import plugins
2014-09-27 06:03:04 +02:00
plugins.chat_write = chat_write
plugins.ratelimit_exceeded = ratelimit_exceeded
2014-09-27 05:51:18 +02:00
plugins.ratelimit_touch = ratelimit_touch
2014-09-27 06:03:04 +02:00
plugins.register_all()
2014-08-09 23:50:40 +02:00
if '__main__' == __name__:
2014-09-27 06:12:34 +02:00
set_conf('version', get_version_git())
print sys.argv[0] + ' ' + conf('version')
2014-09-21 20:10:37 +02:00
if not os.path.exists(fifo_path):
logger('error', 'fifo_path "%s" does not exist, exiting' % fifo_path)
exit(1)
if not stat.S_ISFIFO(os.stat(fifo_path).st_mode):
logger('error', 'fifo_path "%s" is not a FIFO, exiting' % fifo_path)
exit(1)
while 1:
try:
for f in os.listdir(event_files_dir):
if 'mcabber-' == f[:8]:
parse_delete(os.path.join(event_files_dir, f))
2014-07-20 23:39:51 +02:00
time.sleep(delay)
except KeyboardInterrupt:
print ""
exit(130)