simple parsing of forum postings with example action inlined

This commit is contained in:
Thorsten
2015-12-31 15:32:13 +01:00
parent 5c0846ea69
commit 3460e73b56

View File

@@ -3,8 +3,13 @@
""" """
The URLBot - ready to strive for desaster in YOUR jabber MUC The URLBot - ready to strive for desaster in YOUR jabber MUC
""" """
import re
import sys import sys
import time import time
from lxml import etree
import requests
from common import ( from common import (
rate_limit_classes, rate_limit_classes,
RATE_GLOBAL, RATE_GLOBAL,
@@ -218,6 +223,8 @@ class UrlBot(IdleBot):
try: try:
reacted_on_command = self.data_parse_commands(msg_obj) reacted_on_command = self.data_parse_commands(msg_obj)
reacted_on_parse = self.data_parse_other(msg_obj) reacted_on_parse = self.data_parse_other(msg_obj)
self.data_parse_forum_thread(msg_obj)
self.data_parse_forum_post(msg_obj)
if (msg_obj['body'].startswith(config.conf_get('bot_nickname')) and not any( if (msg_obj['body'].startswith(config.conf_get('bot_nickname')) and not any(
[reacted_on_command, reacted_on_parse]) and rate_limit(RATE_GLOBAL)): [reacted_on_command, reacted_on_parse]) and rate_limit(RATE_GLOBAL)):
@@ -296,6 +303,34 @@ class UrlBot(IdleBot):
reacted = True reacted = True
return reacted return reacted
def data_parse_forum_thread(self, msg_obj):
    """
    Placeholder for forum thread parsing; currently does nothing.

    :param msg_obj: the incoming message object (unused for now)
    :return: None
    """
    return None
def data_parse_forum_post(self, msg_obj):
    """
    Summarize debianforum.de postings that are linked in a message.

    Every debianforum.de forum link found in ``msg_obj['body']`` that
    carries a post id (``p=12345`` / ``p12345``) is fetched, the posting
    with that id is located in the page, and a summary action of the form
    "<user> posted <n> words" is run via ``self._run_action``.

    Links without a post id, pages that do not contain the post, and posts
    without a recognizable author are skipped; the remaining links are
    still processed.

    :param msg_obj: the incoming message; only its 'body' entry is read here
    :return: None
    :raises requests.RequestException: if fetching a page fails or times out
    """
    links = re.findall(
        r'(https?://(?:www\.)?debianforum\.de/forum/[^\s>]+)',
        msg_obj['body']
    )
    for link in links:
        # Determine the post id first: a link without one cannot be
        # resolved to a posting, so there is no point in fetching the page.
        id_match = re.search(r'p=?([0-9]{4,})', link)
        if id_match is None:
            continue
        postid = 'p{}'.format(id_match.group(1))

        # A timeout keeps a stalled forum server from blocking the bot.
        html = requests.get(link, timeout=10).text
        tree = etree.XML(html, etree.HTMLParser())

        matches = tree.xpath('//div[@id="{}"]'.format(postid))
        if not matches:
            self.logger.warning("No post with id {} found!".format(postid))
            continue
        postelement = matches[0]

        # All further lookups are relative to the post's own <div>.
        usernames = postelement.xpath(
            './/dl[@class="postprofile"]//*[contains(@href, "memberlist")]/text()'
        )
        if not usernames:
            self.logger.warning("No author found for post {}!".format(postid))
            continue
        user = usernames[0]

        # excludes any [code] and [quote] elements by only looking at
        # direct text child nodes of the content <div>
        posttext = '\n'.join(postelement.xpath('.//div[@class="content"]/text()'))
        self.logger.debug("forum post %s by %s: %s", postid, user, posttext)

        summary_action = {
            'msg': '{} posted {} words'.format(user, len(posttext.split()))
        }
        self._run_action(
            summary_action,
            plugin=plugin_storage[ptypes_COMMAND][0],
            msg_obj=msg_obj
        )
def _run_action(self, action, plugin, msg_obj): def _run_action(self, action, plugin, msg_obj):
""" """
Execute the plugin's execution plan Execute the plugin's execution plan