urllib cleanup

common.py
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
 """ Common functions for urlbot """
 import html.parser
-import json
 import logging
 import re
 import time
@@ -126,22 +125,8 @@ VERSION = get_version_git()
 def fetch_page(url):
     log = logging.getLogger(__name__)
     log.info('fetching page ' + url)
-    # request = urllib.request.Request(url)
-    # request.add_header('User-Agent', USER_AGENT)
-    # response = urllib.request.urlopen(request)
-    # html_text = response.read(BUFSIZ)  # ignore more than BUFSIZ
-    # if html_text[0] == 0x1f and html_text[1] == 0x8b:
-    #     import zlib
-    #     try:
-    #         gzip_data = zlib.decompress(html_text, zlib.MAX_WBITS | 16)
-    #     except:
-    #         pass
-    #     else:
-    #         html_text = gzip_data
-    # response.close()
     response = requests.get(url, headers={'User-Agent': USER_AGENT}, stream=True)
     content = response.raw.read(BUFSIZ, decode_content=True)
-    # return html_text, response.headers
     return content, response.headers


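Note on the replacement above: the deleted block sniffed for the gzip magic bytes (0x1f 0x8b) and inflated the body by hand with zlib. Passing decode_content=True to the raw urllib3 read asks urllib3 to decompress according to the response's Content-Encoding header, so the manual check goes away. A minimal standalone sketch of the new pattern (the URL and constant values are placeholders, not from this commit):

    import requests

    BUFSIZ = 8192          # assumed cap, mirroring the BUFSIZ used above
    USER_AGENT = 'urlbot'  # placeholder; the real value is defined elsewhere in common.py

    # stream=True defers the body download, so only BUFSIZ bytes are pulled;
    # decode_content=True makes urllib3 gunzip/inflate per Content-Encoding,
    # replacing the removed manual 0x1f 0x8b magic-byte check.
    response = requests.get('https://example.com/',
                            headers={'User-Agent': USER_AGENT}, stream=True)
    content = response.raw.read(BUFSIZ, decode_content=True)
    print(len(content), response.headers.get('content-type'))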
@@ -162,24 +147,11 @@ def extract_title(url):
     except Exception as e:
         return 'failed: %s for %s' % (str(e), url)

-    charset = None
     if 'content-type' in headers:
         log.debug('content-type: ' + headers['content-type'])

         if 'text/' != headers['content-type'][:len('text/')]:
             return 1, headers['content-type']

-        charset = re.sub(
-            r'.*charset=(?P<charset>\S+).*',
-            r'\g<charset>', headers['content-type'], re.IGNORECASE
-        )
-
-    # if charset:
-    #     try:
-    #         html_text = html_text.decode(charset)
-    #     except LookupError:
-    #         log.warn("invalid charset in '%s': '%s'" % (headers['content-type'], charset))
-
     if str != type(html_text):
         html_text = str(html_text)
-
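Two things are worth noting about the deleted charset handling. First, the re.sub call passed re.IGNORECASE as the fourth positional argument of re.sub(pattern, repl, string, count=0, flags=0), i.e. as count rather than flags, so the intended case-insensitive match never actually applied. Second, requests already parses the charset= parameter of Content-Type itself. A hedged sketch of leaning on requests instead (illustration only; fetch_page above reads the raw stream and does not use this path):

    import requests

    response = requests.get('https://example.com/')
    # requests extracts charset= from the Content-Type header on its own:
    print(response.encoding)    # charset parsed from the header (or a default/None if absent)
    print(response.text[:200])  # body decoded using that charset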
@@ -203,17 +203,6 @@ def resolve_url_title(**args):
         if any([re.match(b, url) for b in url_blacklist]):
             log.info('url blacklist match for ' + url)
             break
-
-        # urllib.request is broken:
-        # >>> '.'.encode('idna')
-        # ....
-        # UnicodeError: label empty or too long
-        # >>> '.a.'.encode('idna')
-        # ....
-        # UnicodeError: label empty or too long
-        # >>> 'a.a.'.encode('idna')
-        # b'a.a.'
-
         try:
             title = extract_title(url)
         except UnicodeError as e:
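The deleted comment documented a real CPython quirk rather than dead code: the idna codec raises UnicodeError on empty labels, which is why the except UnicodeError guard around extract_title stays. A standalone demonstration, using the same expressions the removed comment quoted:

    # The idna codec rejects empty labels, as the removed comment showed:
    try:
        '.'.encode('idna')
    except UnicodeError as err:
        print('idna rejected:', err)   # 'label empty or too long'

    print('a.a.'.encode('idna'))       # b'a.a.' -- a trailing dot is accepted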