cleanup
This commit is contained in:
62
urlbot.py
62
urlbot.py
@@ -58,45 +58,45 @@ def extract_title(url):
|
|||||||
if 1 == code:
|
if 1 == code:
|
||||||
return (3, 'failed: %s for %s' % (html_text, url))
|
return (3, 'failed: %s for %s' % (html_text, url))
|
||||||
|
|
||||||
if html_text:
|
if not html_text:
|
||||||
charset = ''
|
return (-1, 'error')
|
||||||
if 'content-type' in headers:
|
|
||||||
log.debug('content-type: ' + headers['content-type'])
|
|
||||||
|
|
||||||
if 'text/' != headers['content-type'][:len('text/')]:
|
charset = ''
|
||||||
return (1, headers['content-type'])
|
if 'content-type' in headers:
|
||||||
|
log.debug('content-type: ' + headers['content-type'])
|
||||||
|
|
||||||
charset = re.sub(
|
if 'text/' != headers['content-type'][:len('text/')]:
|
||||||
'.*charset=(?P<charset>\S+).*',
|
return (1, headers['content-type'])
|
||||||
'\g<charset>', headers['content-type'], re.IGNORECASE
|
|
||||||
)
|
|
||||||
|
|
||||||
if '' != charset:
|
charset = re.sub(
|
||||||
try:
|
'.*charset=(?P<charset>\S+).*',
|
||||||
html_text = html_text.decode(charset)
|
'\g<charset>', headers['content-type'], re.IGNORECASE
|
||||||
except LookupError:
|
)
|
||||||
log.warn("invalid charset in '%s': '%s'" % (headers['content-type'], charset))
|
|
||||||
|
|
||||||
if str != type(html_text):
|
if '' != charset:
|
||||||
html_text = str(html_text)
|
try:
|
||||||
|
html_text = html_text.decode(charset)
|
||||||
|
except LookupError:
|
||||||
|
log.warn("invalid charset in '%s': '%s'" % (headers['content-type'], charset))
|
||||||
|
|
||||||
result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html_text, re.S | re.M | re.IGNORECASE)
|
if str != type(html_text):
|
||||||
if result:
|
html_text = str(html_text)
|
||||||
match = result.groups()[0]
|
|
||||||
|
|
||||||
if None == parser:
|
result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html_text, re.S | re.M | re.IGNORECASE)
|
||||||
parser = html.parser.HTMLParser()
|
if result:
|
||||||
|
match = result.groups()[0]
|
||||||
|
|
||||||
try:
|
if None == parser:
|
||||||
expanded_html = parser.unescape(match)
|
parser = html.parser.HTMLParser()
|
||||||
except UnicodeDecodeError as e: # idk why this can happen, but it does
|
|
||||||
log.warn('parser.unescape() expoded here: ' + str(e))
|
|
||||||
expanded_html = match
|
|
||||||
return (0, expanded_html)
|
|
||||||
else:
|
|
||||||
return (2, 'no title')
|
|
||||||
|
|
||||||
return (-1, 'error')
|
try:
|
||||||
|
expanded_html = parser.unescape(match)
|
||||||
|
except UnicodeDecodeError as e: # idk why this can happen, but it does
|
||||||
|
log.warn('parser.unescape() expoded here: ' + str(e))
|
||||||
|
expanded_html = match
|
||||||
|
return (0, expanded_html)
|
||||||
|
else:
|
||||||
|
return (2, 'no title')
|
||||||
|
|
||||||
def send_reply(message, msg_obj=None):
|
def send_reply(message, msg_obj=None):
|
||||||
set_conf('request_counter', conf('request_counter') + 1)
|
set_conf('request_counter', conf('request_counter') + 1)
|
||||||
|
|||||||
Reference in New Issue
Block a user