testing str_sim() for (title, lev_url)

This commit is contained in:
urlbot
2014-10-05 23:39:51 +02:00
parent 09f96101f0
commit 6a916c701a
2 changed files with 38 additions and 26 deletions

View File

@@ -1,8 +1,9 @@
#!/usr/bin/python3 #!/usr/bin/python3
# -*- coding: utf-8 -*-
import re import re
def str_sim(a, b): def str_sim(a, b, do_print=False):
a = a.lower() a = a.lower()
b = b.lower() b = b.lower()
@@ -17,14 +18,15 @@ def str_sim(a, b):
if a_parts[i] == b_parts[j]: if a_parts[i] == b_parts[j]:
out[i][j] += 1 out[i][j] += 1
i = 0 if do_print:
for j in range(0, len(b_parts)): i = 0
print(' |'*i + ' '*2 + '.- ' + b_parts[j]) for j in range(0, len(b_parts)):
i += 1 print(' |'*i + ' '*2 + '.- ' + b_parts[j])
print(' |'*i) i += 1
print(' |'*i)
for i in range(0, len(a_parts)): for i in range(0, len(a_parts)):
print(' ' + str(out[i]) + ' ' + a_parts[i]) print(' ' + str(out[i]) + ' ' + a_parts[i])
return out return out
@@ -38,26 +40,27 @@ def sum_array(array):
return _sum return _sum
def wrapper_print(a, b, comment=''): def wrapper_print(a, b, comment=''):
ret = str_sim(a, b) ret = str_sim(a, b, do_print=True)
if '' != comment: comment = ' ^ ' + comment if '' != comment: comment = ' ^ ' + comment
print('[%2dx%2d::%2d]%s' %(len(ret), len(ret[0]), sum_array(ret), comment)) print('[%2dx%2d::%2d]%s' %(len(ret), len(ret[0]), sum_array(ret), comment))
pairs = ( if '__main__' == __name__:
( pairs = (
'http://de.wikipedia.org/wiki/Monte_Kali_%28Heringen%29', (
'Monte Kali (Heringen)' 'http://de.wikipedia.org/wiki/Monte_Kali_%28Heringen%29',
), 'Monte Kali (Heringen)'
( ),
'http://www.spiegel.de/politik/ausland/buddhisten-treffen-in-colombo-blitzender-moench-a-994447.html', (
'Buddhisten-Treffen in Colombo: Blitzender Mönch - SPIEGEL ONLINE' 'http://www.spiegel.de/politik/ausland/buddhisten-treffen-in-colombo-blitzender-moench-a-994447.html',
'Buddhisten-Treffen in Colombo: Blitzender Mönch - SPIEGEL ONLINE'
)
) )
)
wrapper_print('foo bar baz', 'foo bar boom') wrapper_print('foo bar baz', 'foo bar boom')
for (url, title) in pairs: for (url, title) in pairs:
wrapper_print(title, url, comment='raw') wrapper_print(title, url, comment='raw')
url_no_proto = re.sub(r'https?://[^/]*/', '', url) url_no_proto = re.sub(r'https?://[^/]*/', '', url)
wrapper_print(title, url_no_proto, comment='no proto/domain') wrapper_print(title, url_no_proto, comment='no proto/domain')
url_no_proto_no_digits = re.sub(r'[0-9]*', '', url_no_proto) url_no_proto_no_digits = re.sub(r'[0-9]*', '', url_no_proto)
wrapper_print(title, url_no_proto_no_digits, comment='no proto/domain/[0-9]') wrapper_print(title, url_no_proto_no_digits, comment='no proto/domain/[0-9]')

View File

@@ -5,6 +5,7 @@ import sys, os, stat, re, time, pickle, random
import urllib.request, urllib.parse, urllib.error, html.parser import urllib.request, urllib.parse, urllib.error, html.parser
from local_config import conf, set_conf from local_config import conf, set_conf
from common import * from common import *
from strsim import str_sim
# rate limiting to 5 messages per 10 minutes # rate limiting to 5 messages per 10 minutes
hist_ts = [] hist_ts = []
@@ -145,7 +146,15 @@ def extract_url(data):
conf_save(obj) conf_save(obj)
lev_str = 'lev=%d/%d:%d ' %(lev_res, len(title), len(lev_url)) lev_str = 'lev=%d/%d:%d ' %(lev_res, len(title), len(lev_url))
message = lev_str + 'Title: %s: %s' %(title, r)
sim = str_sim(title, lev_url)
sim_len_title = len(sim)
sim_len_url = len(sim[0])
sim_sum = sum([sum(a) for a in sim])
sim_str = 'sim=%d/%d:%d ' %(sim_sum, sim_len_title, sim_len_url)
message = lev_str + sim_str + 'Title: %s: %s' %(title, r)
elif 1 == status: elif 1 == status:
if conf('image_preview'): if conf('image_preview'):
# of course it's fake, but it looks interesting at least # of course it's fake, but it looks interesting at least