Remove error output for URL resolving if it is the URL itself that is broken.

2017-09-06 15:25:38 +02:00 · 2015-12-21 19:39:09 +01:00
parent 603791b7bb
commit cd27764e37
2 changed files with 34 additions and 57 deletions
--- a/common.py
+++ b/common.py
@@ -6,6 +6,7 @@ import re
 import time
 import urllib.request
 from collections import namedtuple
 from urllib.error import URLError
 RATE_NO_LIMIT = 0x00
 RATE_GLOBAL = 0x01
@@ -124,24 +125,20 @@ VERSION = get_version_git()
 def fetch_page(url):
    log = logging.getLogger(__name__)
    log.info('fetching page ' + url)
-    try:
+    request = urllib.request.Request(url)
-        request = urllib.request.Request(url)
+    request.add_header('User-Agent', USER_AGENT)
-        request.add_header('User-Agent', USER_AGENT)
+    response = urllib.request.urlopen(request)
-        response = urllib.request.urlopen(request)
+    html_text = response.read(BUFSIZ)  # ignore more than BUFSIZ
-        html_text = response.read(BUFSIZ)  # ignore more than BUFSIZ
+    if html_text[0] == 0x1f and html_text[1] == 0x8b:
-        if html_text[0] == 0x1f and html_text[1] == 0x8b:
+        import zlib
-            import zlib
+        try:
-            try:
+            gzip_data = zlib.decompress(html_text, zlib.MAX_WBITS | 16)
-                gzip_data = zlib.decompress(html_text, zlib.MAX_WBITS | 16)
+        except:
-            except:
+            pass
-                pass
+        else:
-            else:
+            html_text = gzip_data
-                html_text = gzip_data
+    response.close()
-        response.close()
+    return html_text, response.headers
        return 0, html_text, response.headers
    except Exception as e:
        log.warn('failed: %s' % e)
        return 1, str(e), 'dummy'
 def extract_title(url):
@@ -150,19 +147,18 @@ def extract_title(url):
    if 'repo/urlbot-native.git' in url:
        log.info('repo URL found: ' + url)
-        return 3, 'wee, that looks like my home repo!'
+        return 'wee, that looks like my home repo!', []
    log.info('extracting title from ' + url)
-    (code, html_text, headers) = fetch_page(url)
+    try:
        (html_text, headers) = fetch_page(url)
    except URLError as e:
        return None
    except Exception as e:
        return 'failed: %s for %s' % (str(e), url)
-    if 1 == code:
+    charset = None
        return 3, 'failed: %s for %s' % (html_text, url)
    if not html_text:
        return -1, 'error'
    charset = ''
    if 'content-type' in headers:
        log.debug('content-type: ' + headers['content-type'])
@@ -174,7 +170,7 @@ def extract_title(url):
            r'\g<charset>', headers['content-type'], re.IGNORECASE
        )
-    if '' != charset:
+    if charset:
        try:
            html_text = html_text.decode(charset)
        except LookupError:
@@ -193,6 +189,6 @@ def extract_title(url):
        except UnicodeDecodeError as e:  # idk why this can happen, but it does
            log.warn('parser.unescape() expoded here: ' + str(e))
            expanded_html = match
-        return 0, expanded_html
+        return expanded_html
    else:
-        return 2, 'no title'
+        return None
--- a/plugins.py
+++ b/plugins.py
@@ -1055,6 +1055,7 @@ def flausch(argv, **args):
        'msg': '{}: *flausch*'.format(argv[1])
    }
@pluginfunction('resolve-url-title', 'extract titles from urls', ptypes_PARSE, ratelimit_class=RATE_URL)
 def resolve_url_title(**args):
    user = args['reply_user']
@@ -1086,37 +1087,17 @@ def resolve_url_title(**args):
        # b'a.a.'
        try:
-            (status, title) = extract_title(url)
+            title = extract_title(url)
        except UnicodeError as e:
-            (status, title) = (4, str(e))
+            message = 'Bug triggered (%s), invalid URL/domain part: %s' % (str(e), url)
            log.warn(message)
            return {'msg': message}
-        if 0 == status:
+        if title:
            title = title.strip()
            message = 'Title: %s' % title
-        elif 1 == status:
+            message = message.replace('\n', '\\n')
-            if config.conf_get('image_preview'):
+            out.append(message)
                # of course it's fake, but it looks interesting at least
                char = r""",._-+=\|/*`~"'"""
                message = 'No text but %s, 1-bit ASCII art preview: [%c]' % (
                    title, random.choice(char)
                )
            else:
                log.info('no message sent for non-text %s (%s)' % (url, title))
                continue
        elif 2 == status:
            message = '(No title)'
        elif 3 == status:
            message = title
        elif 4 == status:
            message = 'Bug triggered (%s), invalid URL/domain part: %s' % (title, url)
            log.warn(message)
        else:
            message = 'some error occurred when fetching %s' % url
        message = message.replace('\n', '\\n')
        log.info('adding to out buf: ' + message)
        out.append(message)
    return {
        'msg': out