From 76fd4645c8bed2cc3b4ceaecde4cf64f7e00233a Mon Sep 17 00:00:00 2001 From: urlbot Date: Fri, 10 Oct 2014 00:01:22 +0200 Subject: [PATCH] http://a../ also triggers, remove fix, wrap exception around --- urlbot.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/urlbot.py b/urlbot.py index 5d383f6..928f3fd 100755 --- a/urlbot.py +++ b/urlbot.py @@ -130,6 +130,10 @@ def extract_url(data): result = re.findall("(https?://[^\s>]+)", data) if result: for url in result: + ratelimit_touch() + if ratelimit_exceeded(): + return False + # urllib.request is broken: # >>> '.'.encode('idna') # .... @@ -139,15 +143,11 @@ def extract_url(data): # UnicodeError: label empty or too long # >>> 'a.a.'.encode('idna') # b'a.a.' - if re.match(r'https?://\.', url): - logger('warn', 'bug tiggered, invalid url: %s' % url) - continue - ratelimit_touch() - if ratelimit_exceeded(): - return False - - (status, title) = extract_title(url) + try: + (status, title) = extract_title(url) + except UnicodeError: + (status, title) = (4, None) if 0 == status: title = title.strip() @@ -183,6 +183,9 @@ def extract_url(data): message = 'No title: %s' % url elif 3 == status: message = title + elif 4 == status: + message = 'Bug triggered, invalid URL/domain part: %s' % url + logger('warn', message) else: message = 'some error occurred when fetching %s' % url