From d94d62335fa9e4c2030e84e6b1b1a472f092d043 Mon Sep 17 00:00:00 2001
From: Thorsten
Date: Sat, 28 Nov 2015 18:41:40 +0100
Subject: [PATCH] move url extraction to plugins

---
 common.py  |  6 ++--
 plugins.py | 69 ++++++++++++++++++++++++++++++++++++++++++++--
 urlbot.py  | 85 ------------------------------------------------------
 3 files changed, 70 insertions(+), 90 deletions(-)

diff --git a/common.py b/common.py
index 6a886b7..48b392b 100644
--- a/common.py
+++ b/common.py
@@ -50,9 +50,9 @@ def conf_set(key, value):
     conf_save(blob)


-def conf_get(key):
+def conf_get(key, default=None):
     blob = conf_load()
-    return blob.get(key)
+    return blob.get(key, default)


 Bucket = namedtuple("BucketConfig", ["history", "period", "max_hist_len"])
@@ -170,7 +170,7 @@ def extract_title(url):
     log = logging.getLogger(__name__)
     global parser

-    if 'repo/urlbot.git' in url:
+    if 'repo/urlbot-native.git' in url:
         log.info('repo URL found: ' + url)
         return 3, 'wee, that looks like my home repo!'

diff --git a/plugins.py b/plugins.py
index cb1f3ca..e2d3046 100644
--- a/plugins.py
+++ b/plugins.py
@@ -9,10 +9,9 @@ import types
 import unicodedata
 import urllib.parse
 import urllib.request

-# from common import *
 from common import conf_load, conf_save, RATE_GLOBAL, RATE_NO_SILENCE, VERSION, RATE_INTERACTIVE, BUFSIZ, \
-    USER_AGENT, extract_title, RATE_FUN, RATE_NO_LIMIT
+    USER_AGENT, extract_title, RATE_FUN, RATE_NO_LIMIT, conf_get, RATE_URL
 from local_config import set_conf, conf
 from string_constants import excuses, moin_strings_hi, moin_strings_bye, cakes

@@ -1043,6 +1042,72 @@ def reset_jobs(argv, **args):
     return {'msg': 'done.'}


+@pluginfunction('resolve-url-title', 'extract titles from urls', ptypes_PARSE, ratelimit_class=RATE_URL)
+def resolve_url_title(**args):
+    user = args['reply_user']
+    user_pref_nospoiler = conf_get('user_pref', {}).get(user, {}).get('spoiler', False)
+    if user_pref_nospoiler:
+        log.info('nospoiler in userconf')
+        return
+
+    result = re.findall(r'(https?://[^\s>]+)', args['data'])
+    if not result:
+        return
+
+    out = []
+    for url in result:
+        if any([re.match(b, url) for b in conf('url_blacklist')]):
+            log.info('url blacklist match for ' + url)
+            break
+
+        # urllib.request is broken:
+        # >>> '.'.encode('idna')
+        # ....
+        # UnicodeError: label empty or too long
+        # >>> '.a.'.encode('idna')
+        # ....
+        # UnicodeError: label empty or too long
+        # >>> 'a.a.'.encode('idna')
+        # b'a.a.'
+
+        try:
+            (status, title) = extract_title(url)
+        except UnicodeError as e:
+            (status, title) = (4, str(e))
+
+        if 0 == status:
+            title = title.strip()
+            message = 'Title: %s' % title
+        elif 1 == status:
+            if conf('image_preview'):
+                # of course it's fake, but it looks interesting at least
+                char = r""",._-+=\|/*`~"'"""
+                message = 'No text but %s, 1-bit ASCII art preview: [%c]' % (
+                    title, random.choice(char)
+                )
+            else:
+                log.info('no message sent for non-text %s (%s)' % (url, title))
+                continue
+        elif 2 == status:
+            message = '(No title)'
+        elif 3 == status:
+            message = title
+        elif 4 == status:
+            message = 'Bug triggered (%s), invalid URL/domain part: %s' % (title, url)
+            log.warn(message)
+        else:
+            message = 'some error occurred when fetching %s' % url
+
+        message = message.replace('\n', '\\n')
+
+        log.info('adding to out buf: ' + message)
+        out.append(message)
+
+    return {
+        'msg': out
+    }
+
+
 def else_command(args):
     log.info('sent short info')
     return {
diff --git a/urlbot.py b/urlbot.py
index 141cbc5..0b983a5 100755
--- a/urlbot.py
+++ b/urlbot.py
@@ -155,77 +155,6 @@ class UrlBot(IdleBot):
             mtype='groupchat'
         )

-    # TODO: plugin?
-    def extract_url(self, data, msg_obj):
-        result = re.findall(r'(https?://[^\s>]+)', data)
-        if not result:
-            return
-
-        ret = None
-        out = []
-        for url in result:
-            # if rate_limit(RATE_NO_SILENCE | RATE_GLOBAL):
-            #     return False
-
-            flag = False
-            for b in conf('url_blacklist'):
-                if re.match(b, url):
-                    flag = True
-                    self.logger.info('url blacklist match for ' + url)
-                    break
-
-            if flag:
-                # an URL has matched the blacklist, continue to the next URL
-                continue
-
-            # urllib.request is broken:
-            # >>> '.'.encode('idna')
-            # ....
-            # UnicodeError: label empty or too long
-            # >>> '.a.'.encode('idna')
-            # ....
-            # UnicodeError: label empty or too long
-            # >>> 'a.a.'.encode('idna')
-            # b'a.a.'
-
-            try:
-                (status, title) = extract_title(url)
-            except UnicodeError as e:
-                (status, title) = (4, str(e))
-
-            if 0 == status:
-                title = title.strip()
-                message = 'Title: %s' % title
-            elif 1 == status:
-                if conf('image_preview'):
-                    # of course it's fake, but it looks interesting at least
-                    char = r""",._-+=\|/*`~"'"""
-                    message = 'No text but %s, 1-bit ASCII art preview: [%c]' % (
-                        title, random.choice(char)
-                    )
-                else:
-                    self.logger.info('no message sent for non-text %s (%s)' % (url, title))
-                    continue
-            elif 2 == status:
-                message = '(No title)'
-            elif 3 == status:
-                message = title
-            elif 4 == status:
-                message = 'Bug triggered (%s), invalid URL/domain part: %s' % (title, url)
-                self.logger.warn(message)
-            else:
-                message = 'some error occurred when fetching %s' % url
-
-            message = message.replace('\n', '\\n')
-
-            self.logger.info('adding to out buf: ' + message)
-            out.append(message)
-            ret = True
-
-        if ret and rate_limit(RATE_URL | RATE_GLOBAL):
-            self.send_reply(out, msg_obj)
-        return ret
-
     def handle_msg(self, msg_obj):
         """
         called for incoming messages
@@ -245,20 +174,6 @@ class UrlBot(IdleBot):
             self.logger.info('no spoiler for: ' + content)
             return

-        arg_user = msg_obj['mucnick']
-        blob_userpref = conf_load().get('user_pref', [])
-        nospoiler = False
-
-        if arg_user in blob_userpref:
-            if 'spoiler' in blob_userpref[arg_user]:
-                if not blob_userpref[arg_user]['spoiler']:
-                    self.logger.info('nospoiler from conf')
-                    nospoiler = True
-
-        if not nospoiler:
-            # TODO: why not make this a plugin?
-            self.extract_url(content, msg_obj)
-
         self.data_parse_commands(msg_obj)
         self.data_parse_other(msg_obj)
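
Note (not part of the patch): the `default` parameter added to `conf_get()` is what makes the chained lookup in `resolve_url_title()` safe when no user preferences have been stored yet. A minimal standalone sketch of that pattern, with an in-memory dict standing in for the config blob that the real `conf_get()` reads via `conf_load()`:

    # Standalone sketch only: `_blob` stands in for the persisted config
    # that the real conf_get() obtains from conf_load().
    _blob = {'user_pref': {'alice': {'spoiler': True}}}


    def conf_get(key, default=None):
        """Return a stored value, or `default` if the key is missing."""
        return _blob.get(key, default)


    # Without the default, conf_get('user_pref') returns None on a fresh
    # config and the chained .get() calls raise AttributeError. With {},
    # the chain falls through to False for unknown users.
    user = 'bob'
    user_pref_nospoiler = conf_get('user_pref', {}).get(user, {}).get('spoiler', False)
    print(user_pref_nospoiler)  # False: no preference stored for 'bob'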