mirror of
http://aero2k.de/t/repos/urlbot-native.git
synced 2017-09-06 15:25:38 +02:00
"".decode() from content-type charset; HTMLParser caching for interactive import
This commit is contained in:
@@ -22,6 +22,8 @@ hist_flag = True
|
|||||||
uptime = -time.time()
|
uptime = -time.time()
|
||||||
request_counter = 0
|
request_counter = 0
|
||||||
|
|
||||||
|
parser = None
|
||||||
|
|
||||||
def debug_enabled():
|
def debug_enabled():
|
||||||
# return True
|
# return True
|
||||||
return False
|
return False
|
||||||
@@ -60,6 +62,8 @@ def fetch_page(url):
|
|||||||
return (None, None)
|
return (None, None)
|
||||||
|
|
||||||
def extract_title(url):
|
def extract_title(url):
|
||||||
|
global parser
|
||||||
|
|
||||||
if 'repo/urlbot.git' in url:
|
if 'repo/urlbot.git' in url:
|
||||||
logger('info', 'repo URL found: ' + url)
|
logger('info', 'repo URL found: ' + url)
|
||||||
return (3, 'wee, that looks like my home repo!')
|
return (3, 'wee, that looks like my home repo!')
|
||||||
@@ -68,10 +72,16 @@ def extract_title(url):
|
|||||||
|
|
||||||
(html, headers) = fetch_page(url)
|
(html, headers) = fetch_page(url)
|
||||||
if html:
|
if html:
|
||||||
|
charset = ''
|
||||||
if 'content-type' in headers:
|
if 'content-type' in headers:
|
||||||
|
logger('debug', 'content-type: ' + headers['content-type'])
|
||||||
|
|
||||||
if 'text/' != headers['content-type'][:len('text/')]:
|
if 'text/' != headers['content-type'][:len('text/')]:
|
||||||
return (1, headers['content-type'])
|
return (1, headers['content-type'])
|
||||||
|
|
||||||
|
charset = re.sub('.*charset=(?P<charset>\S+).*',
|
||||||
|
'\g<charset>', headers['content-type'], flags=re.IGNORECASE)
|
||||||
|
|
||||||
result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html, re.S | re.M | re.IGNORECASE)
|
result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html, re.S | re.M | re.IGNORECASE)
|
||||||
if result:
|
if result:
|
||||||
match = result.groups()[0]
|
match = result.groups()[0]
|
||||||
@@ -79,6 +89,15 @@ def extract_title(url):
|
|||||||
# if 'charset=UTF-8' in headers['content-type']:
|
# if 'charset=UTF-8' in headers['content-type']:
|
||||||
# match = unicode(match)
|
# match = unicode(match)
|
||||||
|
|
||||||
|
if None == parser:
|
||||||
|
parser = HTMLParser.HTMLParser()
|
||||||
|
|
||||||
|
if '' != charset:
|
||||||
|
try:
|
||||||
|
match = match.decode(charset)
|
||||||
|
except LookupError:
|
||||||
|
logger('warn', 'invalid charset in ' + headers['content-type'])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
expanded_html = parser.unescape(match)
|
expanded_html = parser.unescape(match)
|
||||||
except UnicodeDecodeError as e: # idk why this can happen, but it does
|
except UnicodeDecodeError as e: # idk why this can happen, but it does
|
||||||
@@ -300,8 +319,6 @@ if '__main__' == __name__:
|
|||||||
VERSION = get_version_git()
|
VERSION = get_version_git()
|
||||||
print sys.argv[0] + ' ' + VERSION
|
print sys.argv[0] + ' ' + VERSION
|
||||||
|
|
||||||
parser = HTMLParser.HTMLParser()
|
|
||||||
|
|
||||||
while 1:
|
while 1:
|
||||||
try:
|
try:
|
||||||
for f in os.listdir(event_files_dir):
|
for f in os.listdir(event_files_dir):
|
||||||
|
|||||||
Reference in New Issue
Block a user