diff --git a/common.py b/common.py index 0bc02df..2010b60 100644 --- a/common.py +++ b/common.py @@ -6,6 +6,7 @@ import re import time import urllib.request from collections import namedtuple +from urllib.error import URLError RATE_NO_LIMIT = 0x00 RATE_GLOBAL = 0x01 @@ -124,24 +125,20 @@ VERSION = get_version_git() def fetch_page(url): log = logging.getLogger(__name__) log.info('fetching page ' + url) - try: - request = urllib.request.Request(url) - request.add_header('User-Agent', USER_AGENT) - response = urllib.request.urlopen(request) - html_text = response.read(BUFSIZ) # ignore more than BUFSIZ - if html_text[0] == 0x1f and html_text[1] == 0x8b: - import zlib - try: - gzip_data = zlib.decompress(html_text, zlib.MAX_WBITS | 16) - except: - pass - else: - html_text = gzip_data - response.close() - return 0, html_text, response.headers - except Exception as e: - log.warn('failed: %s' % e) - return 1, str(e), 'dummy' + request = urllib.request.Request(url) + request.add_header('User-Agent', USER_AGENT) + response = urllib.request.urlopen(request) + html_text = response.read(BUFSIZ) # ignore more than BUFSIZ + if html_text[0] == 0x1f and html_text[1] == 0x8b: + import zlib + try: + gzip_data = zlib.decompress(html_text, zlib.MAX_WBITS | 16) + except: + pass + else: + html_text = gzip_data + response.close() + return html_text, response.headers def extract_title(url): @@ -150,19 +147,18 @@ def extract_title(url): if 'repo/urlbot-native.git' in url: log.info('repo URL found: ' + url) - return 3, 'wee, that looks like my home repo!' + return 'wee, that looks like my home repo!', [] log.info('extracting title from ' + url) - (code, html_text, headers) = fetch_page(url) + try: + (html_text, headers) = fetch_page(url) + except URLError as e: + return None + except Exception as e: + return 'failed: %s for %s' % (str(e), url) - if 1 == code: - return 3, 'failed: %s for %s' % (html_text, url) - - if not html_text: - return -1, 'error' - - charset = '' + charset = None if 'content-type' in headers: log.debug('content-type: ' + headers['content-type']) @@ -174,7 +170,7 @@ def extract_title(url): r'\g', headers['content-type'], re.IGNORECASE ) - if '' != charset: + if charset: try: html_text = html_text.decode(charset) except LookupError: @@ -193,6 +189,6 @@ def extract_title(url): except UnicodeDecodeError as e: # idk why this can happen, but it does log.warn('parser.unescape() expoded here: ' + str(e)) expanded_html = match - return 0, expanded_html + return expanded_html else: - return 2, 'no title' + return None diff --git a/plugins.py b/plugins.py index de15733..09c9e26 100644 --- a/plugins.py +++ b/plugins.py @@ -1055,6 +1055,7 @@ def flausch(argv, **args): 'msg': '{}: *flausch*'.format(argv[1]) } + @pluginfunction('resolve-url-title', 'extract titles from urls', ptypes_PARSE, ratelimit_class=RATE_URL) def resolve_url_title(**args): user = args['reply_user'] @@ -1086,37 +1087,17 @@ def resolve_url_title(**args): # b'a.a.' try: - (status, title) = extract_title(url) + title = extract_title(url) except UnicodeError as e: - (status, title) = (4, str(e)) + message = 'Bug triggered (%s), invalid URL/domain part: %s' % (str(e), url) + log.warn(message) + return {'msg': message} - if 0 == status: + if title: title = title.strip() message = 'Title: %s' % title - elif 1 == status: - if config.conf_get('image_preview'): - # of course it's fake, but it looks interesting at least - char = r""",._-+=\|/*`~"'""" - message = 'No text but %s, 1-bit ASCII art preview: [%c]' % ( - title, random.choice(char) - ) - else: - log.info('no message sent for non-text %s (%s)' % (url, title)) - continue - elif 2 == status: - message = '(No title)' - elif 3 == status: - message = title - elif 4 == status: - message = 'Bug triggered (%s), invalid URL/domain part: %s' % (title, url) - log.warn(message) - else: - message = 'some error occurred when fetching %s' % url - - message = message.replace('\n', '\\n') - - log.info('adding to out buf: ' + message) - out.append(message) + message = message.replace('\n', '\\n') + out.append(message) return { 'msg': out