133 lines
3.1 KiB
Python
133 lines
3.1 KiB
Python
# -*- coding: utf-8 -*-
|
|
import html.parser
|
|
import logging
|
|
import os
|
|
import pickle
|
|
import re
|
|
import sys
|
|
import urllib.request
|
|
|
|
from local_config import conf
|
|
|
|
RATE_GLOBAL = 0x01
|
|
RATE_NO_SILENCE = 0x02
|
|
RATE_INTERACTIVE = 0x04
|
|
RATE_CHAT = 0x08
|
|
RATE_URL = 0x10
|
|
|
|
rate_limit_classes = (RATE_CHAT, RATE_GLOBAL, RATE_NO_SILENCE, RATE_INTERACTIVE, RATE_URL)
|
|
|
|
BUFSIZ = 8192
|
|
EVENTLOOP_DELAY = 0.100 # seconds
|
|
USER_AGENT = '''Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0 Iceweasel/31.0'''
|
|
|
|
basedir = '.'
|
|
if 2 == len(sys.argv):
|
|
basedir = sys.argv[1]
|
|
|
|
|
|
def conf_save(obj):
|
|
with open(conf('persistent_storage'), 'wb') as fd:
|
|
return pickle.dump(obj, fd)
|
|
|
|
|
|
def conf_load():
|
|
path = conf('persistent_storage')
|
|
if os.path.isfile(path):
|
|
with open(path, 'rb') as fd:
|
|
fd.seek(0)
|
|
return pickle.load(fd)
|
|
else:
|
|
return {}
|
|
|
|
|
|
def get_version_git():
|
|
import subprocess
|
|
|
|
cmd = ['git', 'log', '--oneline', '--abbrev-commit']
|
|
|
|
p = subprocess.Popen(cmd, bufsize=1, stdout=subprocess.PIPE)
|
|
first_line = p.stdout.readline()
|
|
line_count = len(p.stdout.readlines()) + 1
|
|
|
|
if 0 == p.wait():
|
|
# skip this 1st, 2nd, 3rd stuff and use always [0-9]th
|
|
return "version (Git, %dth rev) '%s'" % (
|
|
line_count, str(first_line.strip(), encoding='utf8')
|
|
)
|
|
else:
|
|
return "(unknown version)"
|
|
|
|
|
|
VERSION = get_version_git()
|
|
|
|
|
|
def fetch_page(url):
|
|
log = logging.getLogger(__name__)
|
|
log.info('fetching page ' + url)
|
|
try:
|
|
request = urllib.request.Request(url)
|
|
request.add_header('User-Agent', USER_AGENT)
|
|
response = urllib.request.urlopen(request)
|
|
html_text = response.read(BUFSIZ) # ignore more than BUFSIZ
|
|
response.close()
|
|
return 0, html_text, response.headers
|
|
except Exception as e:
|
|
log.warn('failed: %s' % e)
|
|
return 1, str(e), 'dummy'
|
|
|
|
|
|
def extract_title(url):
|
|
log = logging.getLogger(__name__)
|
|
global parser
|
|
|
|
if 'repo/urlbot.git' in url:
|
|
log.info('repo URL found: ' + url)
|
|
return 3, 'wee, that looks like my home repo!'
|
|
|
|
log.info('extracting title from ' + url)
|
|
|
|
(code, html_text, headers) = fetch_page(url)
|
|
|
|
if 1 == code:
|
|
return 3, 'failed: %s for %s' % (html_text, url)
|
|
|
|
if not html_text:
|
|
return -1, 'error'
|
|
|
|
charset = ''
|
|
if 'content-type' in headers:
|
|
log.debug('content-type: ' + headers['content-type'])
|
|
|
|
if 'text/' != headers['content-type'][:len('text/')]:
|
|
return 1, headers['content-type']
|
|
|
|
charset = re.sub(
|
|
r'.*charset=(?P<charset>\S+).*',
|
|
r'\g<charset>', headers['content-type'], re.IGNORECASE
|
|
)
|
|
|
|
if '' != charset:
|
|
try:
|
|
html_text = html_text.decode(charset)
|
|
except LookupError:
|
|
log.warn("invalid charset in '%s': '%s'" % (headers['content-type'], charset))
|
|
|
|
if str != type(html_text):
|
|
html_text = str(html_text)
|
|
|
|
result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html_text, re.S | re.M | re.IGNORECASE)
|
|
if result:
|
|
match = result.groups()[0]
|
|
|
|
if not parser:
|
|
parser = html.parser.HTMLParser()
|
|
try:
|
|
expanded_html = parser.unescape(match)
|
|
except UnicodeDecodeError as e: # idk why this can happen, but it does
|
|
log.warn('parser.unescape() expoded here: ' + str(e))
|
|
expanded_html = match
|
|
return 0, expanded_html
|
|
else:
|
|
return 2, 'no title'
|