scheiß encoding - doing it right

This commit is contained in:
Thorsten
2016-01-04 21:58:29 +01:00
parent 1a2b81e083
commit 98d5c82d82

View File

@@ -127,7 +127,7 @@ def fetch_page(url):
log.info('fetching page ' + url) log.info('fetching page ' + url)
response = requests.get(url, headers={'User-Agent': USER_AGENT}, stream=True) response = requests.get(url, headers={'User-Agent': USER_AGENT}, stream=True)
content = response.raw.read(BUFSIZ, decode_content=True) content = response.raw.read(BUFSIZ, decode_content=True)
return content, response.headers return content.decode(response.encoding), response.headers
def extract_title(url): def extract_title(url):
@@ -152,12 +152,6 @@ def extract_title(url):
if 'text/' != headers['content-type'][:len('text/')]: if 'text/' != headers['content-type'][:len('text/')]:
return 1, headers['content-type'] return 1, headers['content-type']
try:
charset = headers['content-type'].split(';')[1]
charset = charset.split("=")[1]
html_text = html_text.decode(charset)
except KeyError:
html_text = str(html_text)
result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html_text, re.S | re.M | re.IGNORECASE) result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html_text, re.S | re.M | re.IGNORECASE)
if result: if result: