2014-09-27 09:19:46 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
2015-11-30 19:50:11 +01:00
|
|
|
""" Common functions for urlbot """
|
2015-11-20 21:07:48 +01:00
|
|
|
import html.parser
|
|
|
|
|
import logging
|
|
|
|
|
import re
|
2015-12-31 15:31:34 +01:00
|
|
|
import requests
|
2015-12-21 19:39:09 +01:00
|
|
|
from urllib.error import URLError
|
2015-11-20 22:23:31 +01:00
|
|
|
|
2014-09-27 09:19:46 +02:00
|
|
|
# Maximum number of bytes read from a fetched page (see fetch_page()).
BUFSIZ = 8192
|
2015-11-30 19:50:11 +01:00
|
|
|
# Impersonate a desktop Firefox/Iceweasel build so sites serve regular pages.
USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64; rv:31.0) '
              'Gecko/20100101 Firefox/31.0 Iceweasel/31.0')
|
2014-09-27 09:19:46 +02:00
|
|
|
|
2016-04-05 18:40:31 +02:00
|
|
|
|
2014-09-27 09:41:29 +02:00
|
|
|
def get_version_git():
    """Return a human-readable version string derived from the Git history.

    Runs ``git log --oneline`` in the current working directory and reports
    the total commit count plus the newest commit's one-line summary.

    Returns:
        "version (Git, <N>th rev) '<summary>'" on success,
        "(unknown version)" if git exits non-zero or prints nothing,
        "cannot determine version" if git cannot be run at all.
    """
    import subprocess

    cmd = ['git', 'log', '--oneline', '--abbrev-commit']

    try:
        # check_output waits for the process and closes the pipe for us
        # (the old Popen-based code leaked the stdout pipe).
        output = subprocess.check_output(cmd)
    except subprocess.CalledProcessError:
        # git ran but failed (e.g. not a repository)
        return "(unknown version)"
    except Exception:
        # git missing or any other launch failure
        return "cannot determine version"

    lines = output.splitlines()
    if not lines:
        return "(unknown version)"

    # skip this 1st, 2nd, 3rd stuff and use always [0-9]th
    return "version (Git, %dth rev) '%s'" % (
        len(lines), str(lines[0].strip(), encoding='utf8')
    )
|
2014-09-27 09:41:29 +02:00
|
|
|
|
2015-11-20 21:07:48 +01:00
|
|
|
|
2014-09-27 09:41:29 +02:00
|
|
|
# Computed once at import time; depends on the working directory's Git state.
VERSION = get_version_git()
|
2015-11-20 21:07:48 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetch_page(url):
    """Fetch the beginning of the page at *url*.

    Downloads at most BUFSIZ bytes of the (decompressed) response body and
    decodes it using the encoding announced by the server, falling back to
    UTF-8.

    :param url: URL to fetch
    :return: tuple of (decoded text, response headers)
    :raises requests.RequestException: on connection failure or timeout
    :raises UnicodeDecodeError: if the body cannot be decoded
    """
    log = logging.getLogger(__name__)
    log.info('fetching page ' + url)
    # stream=True lets us read only the first BUFSIZ bytes; the timeout
    # prevents the bot from hanging forever on a stuck server.
    response = requests.get(url, headers={'User-Agent': USER_AGENT},
                            stream=True, timeout=10)
    try:
        content = response.raw.read(BUFSIZ, decode_content=True)
    finally:
        # always release the connection back to the pool
        response.close()
    return content.decode(response.encoding or 'utf-8'), response.headers
|
2015-11-20 21:07:48 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_title(url):
    """Extract the contents of the <title> tag of the page at *url*.

    Return contract (kept for existing callers):
        - ('wee, that looks like my home repo!', []) for the bot's own repo
        - (1, content_type) when the response is not a text/* document
        - the HTML-unescaped title string when a title is found
        - None on URL/decode errors or when no title is present
        - 'failed: ...' error string for any other fetch failure
    """
    log = logging.getLogger(__name__)

    if 'repo/urlbot-native.git' in url:
        log.info('repo URL found: ' + url)
        return 'wee, that looks like my home repo!', []

    log.info('extracting title from ' + url)

    try:
        (html_text, headers) = fetch_page(url)
    except URLError:
        return None
    except UnicodeDecodeError:
        return None
    except Exception as e:
        return 'failed: %s for %s' % (str(e), url)

    if 'content-type' in headers:
        log.debug('content-type: ' + headers['content-type'])

        # refuse to parse binary documents (images, archives, ...)
        if not headers['content-type'].startswith('text/'):
            return 1, headers['content-type']

    result = re.match(r'.*?<title.*?>(.*?)</title>.*?', html_text, re.S | re.M | re.IGNORECASE)

    if result:
        match = result.groups()[0]

        try:
            # html.unescape() replaces HTMLParser.unescape(), which was
            # deprecated and finally removed in Python 3.9.
            expanded_html = html.unescape(match)
        except UnicodeDecodeError as e:  # idk why this can happen, but it does
            log.warning('html.unescape() exploded here: ' + str(e))
            expanded_html = match

        return expanded_html
    else:
        return None
|
2015-12-22 13:42:44 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def giphy(subject, api_key):
    """Look up a random Giphy GIF for *subject*.

    :param subject: tag to search for
    :param api_key: Giphy API key
    :return: URL string of a matching GIF, or None on any failure
    """
    url = 'http://api.giphy.com/v1/gifs/random?tag={}&api_key={}&limit=1&offset=0'.format(subject, api_key)
    # timeout keeps a stuck API endpoint from blocking the bot
    response = requests.get(url, timeout=10)

    giphy_url = None
    try:
        data = response.json()
        giphy_url = data['data']['image_url']
    except (ValueError, KeyError, TypeError):
        # malformed JSON or unexpected payload shape -- best effort, keep None
        pass

    return giphy_url
|
2015-12-26 13:50:21 +01:00
|
|
|
|
|
|
|
|
|
2016-01-28 20:18:26 +01:00
|
|
|
def get_nick_from_object(message_obj):
    """
    not quite correct yet, also the private property access isn't nice.
    """
    muc_nick = message_obj['mucnick']
    if muc_nick:
        return muc_nick
    # no MUC nick -> fall back to the resource part of the sender's JID
    # (reaches into a private attribute of the JID object)
    return message_obj['from']._jid[2]
|
|
|
|
|
|
|
|
|
|
|
2016-04-05 14:18:22 +02:00
|
|
|
def else_command(args):
    """Fallback handler: reply with a short self-description of the bot.

    :param args: dict with a 'reply_user' key naming who to address
    :return: dict with the reply under 'msg'
    """
    logging.getLogger(__name__).info('sent short info')
    reply = args['reply_user'] + ''': I'm a bot (highlight me with 'info' for more information).'''
    return {'msg': reply}
|
|
|
|
|
|