eventlooper: create the HTML parser on first use, decode titles by charset

extract_title() now creates the shared module-level HTMLParser instance on
first use (it is no longer set up in the __main__ block) and decodes the
matched title with the charset parameter of the Content-Type header before
HTML entities are unescaped.

Corrections relative to the earlier revision of this patch:
  * the LookupError handler logged through the undefined name `header`;
    it now uses `headers`.
  * re.IGNORECASE was handed to re.sub() as its 4th positional argument,
    which is `count`, not `flags`; re.sub() also returns the whole header
    when nothing matches. re.search() with an empty-string fallback is used
    instead, and the captured value stops at ';'.
  * `None == parser` is written as `parser is None`.

NOTE(review): this patch was rebuilt from a copy that had lost its line
breaks, its indentation and every <...> fragment inside the regular
expressions. Tab indentation is assumed, and the <title> pattern on the
unchanged context line is reconstructed; confirm both against eventlooper.py
(or apply with whitespace checks relaxed). The post-image hash on the index
line predates the corrections above.

diff --git a/eventlooper.py b/eventlooper.py
index df3bcc8..d80361c 100755
--- a/eventlooper.py
+++ b/eventlooper.py
@@ -22,6 +22,8 @@ hist_flag = True
 uptime = -time.time()
 request_counter = 0
 
+parser = None
+
 def debug_enabled():
 #	return True
 	return False
@@ -60,6 +62,8 @@ def fetch_page(url):
 	return (None, None)
 
 def extract_title(url):
+	global parser
+
 	if 'repo/urlbot.git' in url:
 		logger('info', 'repo URL found: ' + url)
 		return (3, 'wee, that looks like my home repo!')
@@ -68,10 +72,18 @@ def extract_title(url):
 	(html, headers) = fetch_page(url)
 
 	if html:
+		charset = ''
 		if 'content-type' in headers:
+			logger('debug', 'content-type: ' + headers['content-type'])
+
 			if 'text/' != headers['content-type'][:len('text/')]:
 				return (1, headers['content-type'])
 
+			# search (not sub): a header without a charset parameter must
+			# leave charset empty instead of echoing the whole header back
+			found = re.search(r'charset=([^\s;]+)', headers['content-type'], re.IGNORECASE)
+			charset = found.group(1) if found else ''
+
 		result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html, re.S | re.M | re.IGNORECASE)
 		if result:
 			match = result.groups()[0]
@@ -79,6 +91,16 @@ def extract_title(url):
 #			if 'charset=UTF-8' in headers['content-type']:
 #				match = unicode(match)
 
+			# created on first use; the __main__ block no longer sets it up
+			if parser is None:
+				parser = HTMLParser.HTMLParser()
+
+			if '' != charset:
+				try:
+					match = match.decode(charset)
+				except LookupError:
+					logger('warn', 'invalid charset in ' + headers['content-type'])
+
 			try:
 				expanded_html = parser.unescape(match)
 			except UnicodeDecodeError as e: # idk why this can happen, but it does
@@ -300,8 +322,6 @@ if '__main__' == __name__:
 	VERSION = get_version_git()
 	print sys.argv[0] + ' ' + VERSION
 
-	parser = HTMLParser.HTMLParser()
-
 	while 1:
 		try:
 			for f in os.listdir(event_files_dir):