2014-09-28 18:03:08 +02:00
#!/usr/bin/python3
2014-08-10 22:10:00 +02:00
# -*- coding: utf-8 -*-
2014-07-20 23:39:51 +02:00
2014-12-14 16:26:48 +01:00
import sys , re , time , pickle , random
2014-09-28 18:03:08 +02:00
import urllib . request , urllib . parse , urllib . error , html . parser
2014-09-27 09:19:46 +02:00
from common import *
2014-07-20 23:39:51 +02:00
2014-12-02 15:04:53 +01:00
try:
    from local_config import conf, set_conf
except ImportError:
    # local_config.py is intentionally untracked.  Explain how to create it
    # and stop; the second and third line of the notice are padded to the
    # width of the program name printed on the first line.
    prog = sys.argv[0]
    pad = ' ' * len(prog)
    notice = '''
% s : E : local_config . py isn ' t tracked because of included secrets and
% s site specific configurations . Rename local_config . py . skel and
% s adjust to you needs .
''' [1:]
    sys.stderr.write(notice % (prog, pad, pad))
    sys.exit(-1)
from sleekxmpp import ClientXMPP
2014-07-21 04:54:50 +02:00
# rate limiting to 5 messages per 10 minutes
# History of request timestamps (time.time() values), oldest first.
# ratelimit_touch() appends to it; the limits actually enforced are read from
# conf('hist_max_count') / conf('hist_max_time') at call time.
hist_ts = []
2014-07-21 09:49:13 +02:00
# True while the next rate-limit hit should still produce a notice;
# ratelimit_exceeded() clears it on the first hit and sets it again as soon
# as a call finds the limit not exceeded.
hist_flag = True
2014-07-21 04:54:50 +02:00
2014-09-21 17:39:06 +02:00
# Module-level slot for a shared html.parser.HTMLParser instance; stays None
# until some code creates one.
parser = None
2014-07-21 00:53:26 +02:00
def fetch_page(url):
    """Download the beginning of *url*.

    At most BUFSIZ bytes (a name defined outside this function) are read;
    anything beyond that is ignored.

    Returns a 3-tuple ``(status, payload, headers)``:

    * ``(0, body, response_headers)`` on success, where ``body`` is whatever
      ``response.read()`` returned;
    * ``(1, error_text, ' dummy ')`` when building the request, opening the
      URL or reading from it raised an exception.
    """
    log.info(' fetching page ' + url)

    try:
        request = urllib.request.Request(url)
        request.add_header(' User-Agent ', ''' Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0 Iceweasel/31.0 ''')

        # The context manager closes the response even if read() raises,
        # which the previous explicit close() after read() did not.
        with urllib.request.urlopen(request) as response:
            html_text = response.read(BUFSIZ)  # ignore more than BUFSIZ
            headers = response.headers
    except Exception as e:
        # Deliberately broad: any failure is reported to the caller as
        # status 1 instead of taking the bot down.
        log.warn(' failed: %s ' % e)
        return (1, str(e), ' dummy ')

    return (0, html_text, headers)
2014-07-21 00:53:26 +02:00
def extract_title(url):
    """Fetch *url* and extract the text of its ``<title>`` element.

    Returns a 2-tuple ``(status, text)``:

    * ``0``  -- a title was found; ``text`` is the title with HTML character
      references expanded.
    * ``1``  -- the response's content type is not a text type; ``text`` is
      the content-type header value.
    * ``2``  -- the body contains no title.
    * ``3``  -- ``text`` is a ready-made message (the bot's own repository
      URL was recognised, or fetching failed).
    * ``-1`` -- the response body was empty.
    """
    if ' repo/urlbot.git ' in url:
        log.info(' repo URL found: ' + url)
        return (3, ' wee, that looks like my home repo! ')

    log.info(' extracting title from ' + url)

    (code, html_text, headers) = fetch_page(url)

    if 1 == code:
        # on failure fetch_page() puts the error text into the payload slot
        return (3, ' failed: %s for %s ' % (html_text, url))

    if not html_text:
        return (-1, ' error ')

    charset = None
    content_type = None
    if ' content-type ' in headers:
        content_type = headers[' content-type ']
        log.debug(' content-type: ' + content_type)

        if not content_type.startswith(' text/ '):
            return (1, content_type)

        # re.search() with an explicit flags= keyword: the former
        # re.sub(pattern, repl, string, re.IGNORECASE) handed the flag to the
        # positional `count` parameter, and on a non-match returned the whole
        # header value, which was then used as the codec name.
        found = re.search(
            r' .*charset=(?P<charset> \ S+).* ',
            content_type, flags=re.IGNORECASE
        )
        if found:
            charset = found.group('charset')

    if isinstance(html_text, bytes):
        # errors='replace': the body may be cut in the middle of a multi-byte
        # sequence by the read limit, and a UnicodeDecodeError raised here
        # would be reported by the caller as an invalid-URL bug.
        try:
            html_text = html_text.decode(charset or 'utf-8', errors='replace')
        except LookupError:
            log.warn(" invalid charset in ' %s ' : ' %s ' " % (content_type, charset))
            # fall back to real decoding; str(bytes) would only produce the
            # "b'...'" repr of the data
            html_text = html_text.decode('utf-8', errors='replace')

    result = re.match(r' .*?<title.*?>(.*?)</title>.*? ', html_text, re.S | re.M | re.IGNORECASE)
    if not result:
        return (2, ' no title ')

    # html.unescape() is the supported replacement for
    # HTMLParser.unescape(), which no longer exists in Python >= 3.9.
    return (0, html.unescape(result.group(1)))
2014-07-20 23:39:51 +02:00
2014-12-13 22:46:23 +01:00
def send_reply(message, msg_obj):
    """Deliver *message* as a reply to *msg_obj* and count the request.

    *message* is either a str or an iterable of strings; the latter is
    joined into a single body first.  When debug_enabled() is true the text
    is printed instead of being sent.
    """
    set_conf(' request_counter ', conf(' request_counter ') + 1)

    body = message if type(message) is str else ' \n '.join(message)

    if not debug_enabled():
        msg_obj.reply(body=body).send()
        return

    print(body)
2014-07-21 02:27:54 +02:00
2014-12-02 17:01:40 +01:00
def ratelimit_touch(ignored=None):  # FIXME: separate counters
    """Record "now" in the request history.

    The history is trimmed by one entry (the oldest) whenever it has grown
    beyond conf(' hist_max_count ').  The argument is accepted but unused.
    """
    now = time.time()
    hist_ts.append(now)

    limit = conf(' hist_max_count ')
    if len(hist_ts) > limit:
        del hist_ts[0]
2014-12-02 17:01:40 +01:00
def ratelimit_exceeded(ignored=None):  # FIXME: separate counters
    """Tell whether the request history exceeds the configured rate limit.

    :param ignored: optional message object of the request being checked.
        When given, the one-time "rate limited" notice is sent as a reply to
        it; when None the notice is skipped (there is nothing to reply to).
    :returns: True when more than conf(' hist_max_count ') requests are
        recorded and the oldest one is younger than conf(' hist_max_time ')
        seconds, False otherwise.

    Side effects: drops the oldest history entry whenever the history is
    longer than the configured count, and maintains ``hist_flag`` so the
    notice is sent only on the first hit of a limited period.

    NOTE(review): ratelimit_touch() already trims the history to
    conf(' hist_max_count ') entries, so when this is called right after it
    the length check below may never be true -- confirm the intended limit.
    """
    global hist_flag

    if conf(' hist_max_count ') < len(hist_ts):
        first = hist_ts.pop(0)
        if (time.time() - first) < conf(' hist_max_time '):
            if hist_flag:
                hist_flag = False
                # The oldest remaining entry decides when the window reopens;
                # fall back to the entry just removed if nothing is left.
                oldest = hist_ts[0] if hist_ts else first
                if ignored is not None:
                    # send_reply() needs the message object to reply to; it
                    # used to be called without it, raising TypeError.
                    send_reply(
                        ' (rate limited to %d messages in %d seconds, try again at %s ) ' % (
                            conf(' hist_max_count '),
                            conf(' hist_max_time '),
                            time.strftime(' % T % Z ', time.localtime(oldest + conf(' hist_max_time ')))
                        ),
                        ignored
                    )

            # repr() instead of pickle.dumps(): the latter returns bytes,
            # which cannot be concatenated to a str.
            log.warn(' rate limiting exceeded: ' + repr(hist_ts))
            return True

    hist_flag = True
    return False
2014-12-13 22:46:23 +01:00
def extract_url(data, msg_obj):
    """Look for URLs in *data* and reply with one line per URL.

    Every URL found is counted against the rate limit, checked against the
    patterns in conf(' url_blacklist ') and, unless blacklisted, handed to
    extract_title(); the resulting messages are sent as a single reply.

    Returns None when *data* holds no URL or no message was produced, False
    when the rate limit was hit (nothing is sent then), True when a reply
    was sent.
    """
    urls = re.findall(" (https?://[^ \ s>]+) ", data)
    if not urls:
        return

    replies = []
    for url in urls:
        ratelimit_touch()
        if ratelimit_exceeded(msg_obj):
            return False

        blacklisted = any(
            re.match(pattern, url) is not None
            for pattern in conf(' url_blacklist ')
        )
        if blacklisted:
            # an URL has matched the blacklist, continue to the next URL
            log.info(' url blacklist match for ' + url)
            continue

        # urllib.request is broken:
        # >>> '.'.encode('idna')
        # ....
        # UnicodeError: label empty or too long
        # >>> '.a.'.encode('idna')
        # ....
        # UnicodeError: label empty or too long
        # >>> 'a.a.'.encode('idna')
        # b'a.a.'
        try:
            (status, title) = extract_title(url)
        except UnicodeError as err:
            (status, title) = (4, str(err))

        # non-text content without the preview feature yields no message
        if 1 == status and not conf(' image_preview '):
            log.info(' no message sent for non-text %s ( %s ) ' % (url, title))
            continue

        if 0 == status:
            message = ' Title: %s ' % title.strip()
        elif 1 == status:
            # of course it's fake, but it looks interesting at least
            glyphs = """ ,._-+= \ |/*`~ " ' """
            message = ' No text but %s , 1-bit ASCII art preview: [ %c ] ' % (
                title, random.choice(glyphs)
            )
        elif 2 == status:
            message = ' (No title) '
        elif 3 == status:
            message = title
        elif 4 == status:
            message = ' Bug triggered ( %s ), invalid URL/domain part: %s ' % (title, url)
            log.warn(message)
        else:
            message = ' some error occurred when fetching %s ' % url

        message = message.replace(' \n ', ' \\ n ')

        log.info(' adding to out buf: ' + message)
        replies.append(message)

    if not replies:
        return None

    send_reply(replies, msg_obj)
    return True
2014-07-21 02:27:54 +02:00
2014-12-13 22:46:23 +01:00
def handle_msg(msg_obj):
    """Process one incoming message.

    Subject changes, the bot's own log output, messages containing the
    nospoiler marker and messages sent by the bot itself are ignored.
    Otherwise URLs are handled via extract_url() -- unless the sender
    disabled spoilers in the stored user preferences -- and, when that did
    not produce a reply, the message is passed on to the plugin parsers.
    """
    content = msg_obj[' body ']

    if ' has set the subject to: ' in content:
        return

    if sys.argv[0] in content:
        log.info(' silenced, this is my own log ')
        return

    if ' nospoiler ' in content:
        log.info(' no spoiler for: ' + content)
        return

    # don't react to itself
    sender = str(msg_obj[' from '])
    if sender.startswith(conf(' bot_user ')):
        return

    nick = msg_obj[' mucnick ']
    user_prefs = conf_load().get(' user_pref ', [])

    spoiler_disabled = (
        nick in user_prefs
        and ' spoiler ' in user_prefs[nick]
        and not user_prefs[nick][' spoiler ']
    )
    if spoiler_disabled:
        log.info(' nospoiler from conf ')

    handled = None if spoiler_disabled else extract_url(content, msg_obj)

    # print(' '.join(["%s->%s" % (x, msg_obj[x]) for x in msg_obj.keys()]))

    if True != handled:
        plugins.data_parse_commands(msg_obj)
        plugins.data_parse_other(msg_obj)
2014-07-20 23:39:51 +02:00
2014-12-02 15:04:53 +01:00
class bot(ClientXMPP):
    """XMPP client that joins a set of MUC rooms and reacts to their traffic."""

    def __init__(self, jid, password, rooms, nick):
        """Log in as *jid* / *password* and register the event handlers.

        :param rooms: iterable of room identifiers to join on session start.
        :param nick: nickname used inside the rooms.
        """
        ClientXMPP.__init__(self, jid, password)

        self.rooms = rooms
        self.nick = nick

        self.add_event_handler(' session_start ', self.session_start)
        self.add_event_handler(' groupchat_message ', self.muc_message)
        self.add_event_handler(' message ', self.message)

        # one "got online" handler per room
        for r in self.rooms:
            self.add_event_handler(' muc:: %s ::got_online ' % r, self.muc_online)

    def session_start(self, event):
        """Fetch the roster, announce presence and join every room."""
        self.get_roster()
        self.send_presence()

        for room in self.rooms:
            log.info(' joining %s ' % room)
            self.plugin[' xep_0045 '].joinMUC(
                room,
                self.nick,
                wait=True
            )

    def muc_message(self, msg_obj):
        """Hand a groupchat message to handle_msg(), skipping our own."""
        # don't talk to yourself
        if msg_obj[' mucnick '] == self.nick:
            return

        return handle_msg(msg_obj)

    def message(self, msg_obj):
        """Generic message handler; currently takes no action at all."""
        if ' groupchat ' == msg_obj[' type ']:
            return

    def muc_online(self, msg_obj):
        """Deliver stored records to a user who just came online in a room.

        The records are looked up under the lower-cased nick in the
        ' user_records ' entry of the loaded configuration, posted to the
        room and afterwards removed from the stored configuration.

        Returns False when the stored configuration is flagged as locked
        (the records were sent but not removed), None otherwise.
        """
        # don't react to yourself
        if msg_obj[' muc '][' nick '] == self.nick:
            return

        arg_user = msg_obj[' muc '][' nick ']
        arg_user_key = arg_user.lower()
        blob_userrecords = conf_load().get(' user_records ', {})

        if arg_user_key not in blob_userrecords:
            return

        records = blob_userrecords[arg_user_key]
        if not records:
            return

        self.send_message(
            mto=msg_obj[' from '].bare,
            mbody=' %s , there %s %d message %s for you: \n %s ' % (
                arg_user,
                ' is ' if 1 == len(records) else ' are ',
                len(records),
                ' ' if 1 == len(records) else ' s ',
                ' \n '.join(records)
            ),
            mtype=' groupchat '
        )
        log.info(' sent %d offline records to room %s ' % (
            len(records), msg_obj[' from '].bare
        ))

        # ' persistent_locked ' serves as a lock flag around the
        # load/modify/save cycle below.
        if conf(' persistent_locked '):
            log.warn(" couldn ' t get exclusive lock ")
            return False

        set_conf(' persistent_locked ', True)
        try:
            blob = conf_load()

            if ' user_records ' not in blob:
                blob[' user_records '] = {}

            if arg_user_key in blob[' user_records ']:
                blob[' user_records '].pop(arg_user_key)

            conf_save(blob)
        finally:
            # always release the flag; if loading or saving raised it would
            # otherwise stay set and block every later cleanup
            set_conf(' persistent_locked ', False)

        return
2014-12-13 23:29:51 +01:00
2015-06-20 15:13:12 +02:00
# def set_presence(self, msg):
# for room in self.rooms:
# self.send_presence(pto=room, pstatus=msg)
2014-11-09 16:52:22 +01:00
if ' __main__ ' == __name__:
    log.info(VERSION)

    import plugins

    # give the plugin module access to our helpers before it registers
    # its handlers
    plugins.send_reply = send_reply
    plugins.ratelimit_exceeded = ratelimit_exceeded
    plugins.ratelimit_touch = ratelimit_touch
    plugins.register_all()

    logging.basicConfig(
        level=logging.INFO,
        format=' %(levelname)-8s %(message)s '
    )

    xmpp = bot(
        conf(' jid '),
        conf(' password '),
        conf(' rooms '),
        conf(' bot_user ')
    )
    xmpp.connect()
    xmpp.register_plugin(' xep_0045 ')
    xmpp.process()

    # Main loop: poll the plugin event queue every `delay` seconds until it
    # asks for shutdown (returns False) or the user interrupts.
    while True:
        try:
            keep_running = plugins.event_trigger()
            if keep_running == False:
                xmpp.disconnect()
                sys.exit(1)

            time.sleep(delay)
        except KeyboardInterrupt:
            print(' ')
            exit(130)