From 6a916c701acf7fac9dd8f7a2eb9d0f72c04d1230 Mon Sep 17 00:00:00 2001
From: urlbot <urlbot@eagle.local.yeeer.net>
Date: Sun, 5 Oct 2014 23:39:51 +0200
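Subject: [PATCH] testing str_sim() for (title, lev_url)

strsim.py: str_sim() gains a do_print=False parameter and only prints
its match matrix when do_print is true; wrapper_print() passes
do_print=True. The sample pairs and demo calls move under an
"if '__main__' == __name__:" guard so that importing the module no
longer runs them. A utf-8 coding line is added.

urlbot.py: import str_sim and, in extract_url(), call
str_sim(title, lev_url) and insert "sim=<sum>/<len(sim)>:<len(sim[0])> "
between the existing lev= string and "Title:" in the message.
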
---
 strsim.py | 53 ++++++++++++++++++++++++++++-------------------------
 urlbot.py | 11 ++++++++++-
 2 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/strsim.py b/strsim.py
index fbcf8e2..9a98749 100755
--- a/strsim.py
+++ b/strsim.py
@@ -1,8 +1,9 @@
 #!/usr/bin/python3
+# -*- coding: utf-8 -*-
 
 import re
 
-def str_sim(a, b):
+def str_sim(a, b, do_print=False):
 	a = a.lower()
 	b = b.lower()
 
@@ -17,14 +18,15 @@ def str_sim(a, b):
 			if a_parts[i] == b_parts[j]:
 				out[i][j] += 1
 	
-	i = 0
-	for j in range(0, len(b_parts)):
-		print('  |'*i + ' '*2 + '.- ' + b_parts[j])
-		i += 1
-	print('  |'*i)
+	if do_print:
+		i = 0
+		for j in range(0, len(b_parts)):
+			print('  |'*i + ' '*2 + '.- ' + b_parts[j])
+			i += 1
+		print('  |'*i)
 
-	for i in range(0, len(a_parts)):
-		print(' ' + str(out[i]) + ' ' + a_parts[i])
+		for i in range(0, len(a_parts)):
+			print(' ' + str(out[i]) + ' ' + a_parts[i])
 
 	return out
 
@@ -38,26 +40,27 @@ def sum_array(array):
 	return _sum
 
 def wrapper_print(a, b, comment=''):
-	ret = str_sim(a, b)
+	ret = str_sim(a, b, do_print=True)
 	if '' != comment: comment = ' ^ ' + comment
 	print('[%2dx%2d::%2d]%s' %(len(ret), len(ret[0]), sum_array(ret), comment))
 
-pairs = (
-	(
-		'http://de.wikipedia.org/wiki/Monte_Kali_%28Heringen%29',
-		'Monte Kali (Heringen)'
-	),
-	(
-		'http://www.spiegel.de/politik/ausland/buddhisten-treffen-in-colombo-blitzender-moench-a-994447.html',
-		'Buddhisten-Treffen in Colombo: Blitzender Mönch - SPIEGEL ONLINE'
+if '__main__' == __name__:
+	pairs = (
+		(
+			'http://de.wikipedia.org/wiki/Monte_Kali_%28Heringen%29',
+			'Monte Kali (Heringen)'
+		),
+		(
+			'http://www.spiegel.de/politik/ausland/buddhisten-treffen-in-colombo-blitzender-moench-a-994447.html',
+			'Buddhisten-Treffen in Colombo: Blitzender Mönch - SPIEGEL ONLINE'
+		)
 	)
-)
 
-wrapper_print('foo bar baz', 'foo bar boom')
+	wrapper_print('foo bar baz', 'foo bar boom')
 
-for (url, title) in pairs:
-	wrapper_print(title, url, comment='raw')
-	url_no_proto = re.sub(r'https?://[^/]*/', '', url)
-	wrapper_print(title, url_no_proto, comment='no proto/domain')
-	url_no_proto_no_digits = re.sub(r'[0-9]*', '', url_no_proto)
-	wrapper_print(title, url_no_proto_no_digits, comment='no proto/domain/[0-9]')
+	for (url, title) in pairs:
+		wrapper_print(title, url, comment='raw')
+		url_no_proto = re.sub(r'https?://[^/]*/', '', url)
+		wrapper_print(title, url_no_proto, comment='no proto/domain')
+		url_no_proto_no_digits = re.sub(r'[0-9]*', '', url_no_proto)
+		wrapper_print(title, url_no_proto_no_digits, comment='no proto/domain/[0-9]')
diff --git a/urlbot.py b/urlbot.py
index 92f1eb8..2b93e24 100755
--- a/urlbot.py
+++ b/urlbot.py
@@ -5,6 +5,7 @@ import sys, os, stat, re, time, pickle, random
 import urllib.request, urllib.parse, urllib.error, html.parser
 from local_config import conf, set_conf
 from common import *
+from strsim import str_sim
 
 # rate limiting to 5 messages per 10 minutes
 hist_ts = []
@@ -145,7 +146,15 @@ def extract_url(data):
 				conf_save(obj)
 
 				lev_str = 'lev=%d/%d:%d ' %(lev_res, len(title), len(lev_url))
-				message = lev_str + 'Title: %s: %s' %(title, r)
+
+				sim = str_sim(title, lev_url)
+				sim_len_title = len(sim)
+				sim_len_url = len(sim[0])
+				sim_sum = sum([sum(a) for a in sim])
+
+				sim_str = 'sim=%d/%d:%d ' %(sim_sum, sim_len_title, sim_len_url)
+
+				message = lev_str + sim_str + 'Title: %s: %s' %(title, r)
 			elif 1 == status:
 				if conf('image_preview'):
 					# of course it's fake, but it looks interesting at least