diff --git a/common.py b/common.py index cc9ac04..15a6845 100644 --- a/common.py +++ b/common.py @@ -127,7 +127,7 @@ def fetch_page(url): log.info('fetching page ' + url) response = requests.get(url, headers={'User-Agent': USER_AGENT}, stream=True) content = response.raw.read(BUFSIZ, decode_content=True) - return content, response.headers + return content.decode(response.encoding), response.headers def extract_title(url): @@ -152,12 +152,6 @@ def extract_title(url): if 'text/' != headers['content-type'][:len('text/')]: return 1, headers['content-type'] - try: - charset = headers['content-type'].split(';')[1] - charset = charset.split("=")[1] - html_text = html_text.decode(charset) - except KeyError: - html_text = str(html_text) result = re.match(r'.*?