1
0
mirror of http://aero2k.de/t/repos/urlbot-native.git synced 2017-09-06 15:25:38 +02:00

urllib cleanup

This commit is contained in:
Thorsten
2015-12-31 21:03:11 +01:00
parent bd984d0aa1
commit 837e1e16ce
2 changed files with 0 additions and 39 deletions

View File

@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
 """ Common functions for urlbot """
 import html.parser
-import json
 import logging
 import re
 import time
@@ -126,22 +125,8 @@ VERSION = get_version_git()
 def fetch_page(url):
     log = logging.getLogger(__name__)
     log.info('fetching page ' + url)
-    # request = urllib.request.Request(url)
-    # request.add_header('User-Agent', USER_AGENT)
-    # response = urllib.request.urlopen(request)
-    # html_text = response.read(BUFSIZ) # ignore more than BUFSIZ
-    # if html_text[0] == 0x1f and html_text[1] == 0x8b:
-    #     import zlib
-    #     try:
-    #         gzip_data = zlib.decompress(html_text, zlib.MAX_WBITS | 16)
-    #     except:
-    #         pass
-    #     else:
-    #         html_text = gzip_data
-    # response.close()
     response = requests.get(url, headers={'User-Agent': USER_AGENT}, stream=True)
     content = response.raw.read(BUFSIZ, decode_content=True)
-    # return html_text, response.headers
     return content, response.headers
@@ -162,24 +147,11 @@ def extract_title(url):
     except Exception as e:
         return 'failed: %s for %s' % (str(e), url)
-    charset = None
     if 'content-type' in headers:
         log.debug('content-type: ' + headers['content-type'])
         if 'text/' != headers['content-type'][:len('text/')]:
             return 1, headers['content-type']
-        charset = re.sub(
-            r'.*charset=(?P<charset>\S+).*',
-            r'\g<charset>', headers['content-type'], re.IGNORECASE
-        )
-    # if charset:
-    #     try:
-    #         html_text = html_text.decode(charset)
-    #     except LookupError:
-    #         log.warn("invalid charset in '%s': '%s'" % (headers['content-type'], charset))
     if str != type(html_text):
         html_text = str(html_text)

View File

@@ -203,17 +203,6 @@ def resolve_url_title(**args):
         if any([re.match(b, url) for b in url_blacklist]):
             log.info('url blacklist match for ' + url)
             break
-        # urllib.request is broken:
-        # >>> '.'.encode('idna')
-        # ....
-        # UnicodeError: label empty or too long
-        # >>> '.a.'.encode('idna')
-        # ....
-        # UnicodeError: label empty or too long
-        # >>> 'a.a.'.encode('idna')
-        # b'a.a.'
         try:
             title = extract_title(url)
         except UnicodeError as e: