From 6a916c701acf7fac9dd8f7a2eb9d0f72c04d1230 Mon Sep 17 00:00:00 2001 From: urlbot Date: Sun, 5 Oct 2014 23:39:51 +0200 Subject: [PATCH] testing str_sim() for (title, lev_url) --- strsim.py | 53 ++++++++++++++++++++++++++++------------------------- urlbot.py | 11 ++++++++++- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/strsim.py b/strsim.py index fbcf8e2..9a98749 100755 --- a/strsim.py +++ b/strsim.py @@ -1,8 +1,9 @@ #!/usr/bin/python3 +# -*- coding: utf-8 -*- import re -def str_sim(a, b): +def str_sim(a, b, do_print=False): a = a.lower() b = b.lower() @@ -17,14 +18,15 @@ def str_sim(a, b): if a_parts[i] == b_parts[j]: out[i][j] += 1 - i = 0 - for j in range(0, len(b_parts)): - print(' |'*i + ' '*2 + '.- ' + b_parts[j]) - i += 1 - print(' |'*i) + if do_print: + i = 0 + for j in range(0, len(b_parts)): + print(' |'*i + ' '*2 + '.- ' + b_parts[j]) + i += 1 + print(' |'*i) - for i in range(0, len(a_parts)): - print(' ' + str(out[i]) + ' ' + a_parts[i]) + for i in range(0, len(a_parts)): + print(' ' + str(out[i]) + ' ' + a_parts[i]) return out @@ -38,26 +40,27 @@ def sum_array(array): return _sum def wrapper_print(a, b, comment=''): - ret = str_sim(a, b) + ret = str_sim(a, b, do_print=True) if '' != comment: comment = ' ^ ' + comment print('[%2dx%2d::%2d]%s' %(len(ret), len(ret[0]), sum_array(ret), comment)) -pairs = ( - ( - 'http://de.wikipedia.org/wiki/Monte_Kali_%28Heringen%29', - 'Monte Kali (Heringen)' - ), - ( - 'http://www.spiegel.de/politik/ausland/buddhisten-treffen-in-colombo-blitzender-moench-a-994447.html', - 'Buddhisten-Treffen in Colombo: Blitzender Mönch - SPIEGEL ONLINE' +if '__main__' == __name__: + pairs = ( + ( + 'http://de.wikipedia.org/wiki/Monte_Kali_%28Heringen%29', + 'Monte Kali (Heringen)' + ), + ( + 'http://www.spiegel.de/politik/ausland/buddhisten-treffen-in-colombo-blitzender-moench-a-994447.html', + 'Buddhisten-Treffen in Colombo: Blitzender Mönch - SPIEGEL ONLINE' + ) ) -) -wrapper_print('foo bar baz', 'foo bar boom') + wrapper_print('foo bar baz', 'foo bar boom') -for (url, title) in pairs: - wrapper_print(title, url, comment='raw') - url_no_proto = re.sub(r'https?://[^/]*/', '', url) - wrapper_print(title, url_no_proto, comment='no proto/domain') - url_no_proto_no_digits = re.sub(r'[0-9]*', '', url_no_proto) - wrapper_print(title, url_no_proto_no_digits, comment='no proto/domain/[0-9]') + for (url, title) in pairs: + wrapper_print(title, url, comment='raw') + url_no_proto = re.sub(r'https?://[^/]*/', '', url) + wrapper_print(title, url_no_proto, comment='no proto/domain') + url_no_proto_no_digits = re.sub(r'[0-9]*', '', url_no_proto) + wrapper_print(title, url_no_proto_no_digits, comment='no proto/domain/[0-9]') diff --git a/urlbot.py b/urlbot.py index 92f1eb8..2b93e24 100755 --- a/urlbot.py +++ b/urlbot.py @@ -5,6 +5,7 @@ import sys, os, stat, re, time, pickle, random import urllib.request, urllib.parse, urllib.error, html.parser from local_config import conf, set_conf from common import * +from strsim import str_sim # rate limiting to 5 messages per 10 minutes hist_ts = [] @@ -145,7 +146,15 @@ def extract_url(data): conf_save(obj) lev_str = 'lev=%d/%d:%d ' %(lev_res, len(title), len(lev_url)) - message = lev_str + 'Title: %s: %s' %(title, r) + + sim = str_sim(title, lev_url) + sim_len_title = len(sim) + sim_len_url = len(sim[0]) + sim_sum = sum([sum(a) for a in sim]) + + sim_str = 'sim=%d/%d:%d ' %(sim_sum, sim_len_title, sim_len_url) + + message = lev_str + sim_str + 'Title: %s: %s' %(title, r) elif 1 == status: if conf('image_preview'): # of course it's fake, but it looks interesting at least