Work around error in HTMLParser with Unicode input

This commit is contained in:
urlbot
2014-08-20 02:44:11 +02:00
parent 1f3d2577f0
commit a70515ad6a

View File

@@ -73,7 +73,12 @@ def extract_title(url):
result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html, re.S | re.M | re.IGNORECASE) result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html, re.S | re.M | re.IGNORECASE)
if result: if result:
return (0, parser.unescape(result.groups()[0])) try:
expanded_html = parser.unescape(result.groups()[0])
except UnicodeDecodeError as e: # idk why this can happen, but it does
logger('warn', 'parser.unescape() expoded here: ' + str(e))
expanded_html = result.groups()[0]
return (0, expanded_html)
else: else:
return (2, 'no title') return (2, 'no title')