1
0
mirror of http://aero2k.de/t/repos/urlbot-native.git synced 2017-09-06 15:25:38 +02:00

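Decode page titles with str.decode() using the charset from the Content-Type header; create the HTMLParser lazily and cache it in a module-level global so extract_title() also works on interactive import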

This commit is contained in:
urlbot
2014-09-21 17:39:06 +02:00
parent cbb602ad4f
commit ab5f211345

View File

@@ -22,6 +22,8 @@ hist_flag = True
uptime = -time.time() uptime = -time.time()
request_counter = 0 request_counter = 0
parser = None
def debug_enabled(): def debug_enabled():
# return True # return True
return False return False
@@ -60,6 +62,8 @@ def fetch_page(url):
return (None, None) return (None, None)
def extract_title(url): def extract_title(url):
global parser
if 'repo/urlbot.git' in url: if 'repo/urlbot.git' in url:
logger('info', 'repo URL found: ' + url) logger('info', 'repo URL found: ' + url)
return (3, 'wee, that looks like my home repo!') return (3, 'wee, that looks like my home repo!')
@@ -68,10 +72,16 @@ def extract_title(url):
(html, headers) = fetch_page(url) (html, headers) = fetch_page(url)
if html: if html:
charset = ''
if 'content-type' in headers: if 'content-type' in headers:
logger('debug', 'content-type: ' + headers['content-type'])
if 'text/' != headers['content-type'][:len('text/')]: if 'text/' != headers['content-type'][:len('text/')]:
return (1, headers['content-type']) return (1, headers['content-type'])
charset = re.sub('.*charset=(?P<charset>\S+).*',
'\g<charset>', headers['content-type'], re.IGNORECASE)
result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html, re.S | re.M | re.IGNORECASE) result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html, re.S | re.M | re.IGNORECASE)
if result: if result:
match = result.groups()[0] match = result.groups()[0]
@@ -79,6 +89,15 @@ def extract_title(url):
# if 'charset=UTF-8' in headers['content-type']: # if 'charset=UTF-8' in headers['content-type']:
# match = unicode(match) # match = unicode(match)
if None == parser:
parser = HTMLParser.HTMLParser()
if '' != charset:
try:
match = match.decode(charset)
except LookupError:
logger('warn', 'invalid charset in ' + header['content-type'])
try: try:
expanded_html = parser.unescape(match) expanded_html = parser.unescape(match)
except UnicodeDecodeError as e: # idk why this can happen, but it does except UnicodeDecodeError as e: # idk why this can happen, but it does
@@ -300,8 +319,6 @@ if '__main__' == __name__:
VERSION = get_version_git() VERSION = get_version_git()
print sys.argv[0] + ' ' + VERSION print sys.argv[0] + ' ' + VERSION
parser = HTMLParser.HTMLParser()
while 1: while 1:
try: try:
for f in os.listdir(event_files_dir): for f in os.listdir(event_files_dir):