This commit is contained in:
urlbot
2015-07-19 20:47:57 +02:00
parent 52fb4d11d3
commit d1bbf13d6e

View File

@@ -58,45 +58,45 @@ def extract_title(url):
if 1 == code: if 1 == code:
return (3, 'failed: %s for %s' % (html_text, url)) return (3, 'failed: %s for %s' % (html_text, url))
if html_text: if not html_text:
charset = '' return (-1, 'error')
if 'content-type' in headers:
log.debug('content-type: ' + headers['content-type'])
if 'text/' != headers['content-type'][:len('text/')]: charset = ''
return (1, headers['content-type']) if 'content-type' in headers:
log.debug('content-type: ' + headers['content-type'])
charset = re.sub( if 'text/' != headers['content-type'][:len('text/')]:
'.*charset=(?P<charset>\S+).*', return (1, headers['content-type'])
'\g<charset>', headers['content-type'], re.IGNORECASE
)
if '' != charset: charset = re.sub(
try: '.*charset=(?P<charset>\S+).*',
html_text = html_text.decode(charset) '\g<charset>', headers['content-type'], re.IGNORECASE
except LookupError: )
log.warn("invalid charset in '%s': '%s'" % (headers['content-type'], charset))
if str != type(html_text): if '' != charset:
html_text = str(html_text) try:
html_text = html_text.decode(charset)
except LookupError:
log.warn("invalid charset in '%s': '%s'" % (headers['content-type'], charset))
result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html_text, re.S | re.M | re.IGNORECASE) if str != type(html_text):
if result: html_text = str(html_text)
match = result.groups()[0]
if None == parser: result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html_text, re.S | re.M | re.IGNORECASE)
parser = html.parser.HTMLParser() if result:
match = result.groups()[0]
try: if None == parser:
expanded_html = parser.unescape(match) parser = html.parser.HTMLParser()
except UnicodeDecodeError as e: # idk why this can happen, but it does
log.warn('parser.unescape() expoded here: ' + str(e))
expanded_html = match
return (0, expanded_html)
else:
return (2, 'no title')
return (-1, 'error') try:
expanded_html = parser.unescape(match)
except UnicodeDecodeError as e: # idk why this can happen, but it does
log.warn('parser.unescape() expoded here: ' + str(e))
expanded_html = match
return (0, expanded_html)
else:
return (2, 'no title')
def send_reply(message, msg_obj=None): def send_reply(message, msg_obj=None):
set_conf('request_counter', conf('request_counter') + 1) set_conf('request_counter', conf('request_counter') + 1)