scheiß encoding - doing it right
This commit is contained in:
@@ -127,7 +127,7 @@ def fetch_page(url):
|
|||||||
log.info('fetching page ' + url)
|
log.info('fetching page ' + url)
|
||||||
response = requests.get(url, headers={'User-Agent': USER_AGENT}, stream=True)
|
response = requests.get(url, headers={'User-Agent': USER_AGENT}, stream=True)
|
||||||
content = response.raw.read(BUFSIZ, decode_content=True)
|
content = response.raw.read(BUFSIZ, decode_content=True)
|
||||||
return content, response.headers
|
return content.decode(response.encoding), response.headers
|
||||||
|
|
||||||
|
|
||||||
def extract_title(url):
|
def extract_title(url):
|
||||||
@@ -152,12 +152,6 @@ def extract_title(url):
|
|||||||
|
|
||||||
if 'text/' != headers['content-type'][:len('text/')]:
|
if 'text/' != headers['content-type'][:len('text/')]:
|
||||||
return 1, headers['content-type']
|
return 1, headers['content-type']
|
||||||
try:
|
|
||||||
charset = headers['content-type'].split(';')[1]
|
|
||||||
charset = charset.split("=")[1]
|
|
||||||
html_text = html_text.decode(charset)
|
|
||||||
except KeyError:
|
|
||||||
html_text = str(html_text)
|
|
||||||
|
|
||||||
result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html_text, re.S | re.M | re.IGNORECASE)
|
result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html_text, re.S | re.M | re.IGNORECASE)
|
||||||
if result:
|
if result:
|
||||||
|
|||||||
Reference in New Issue
Block a user