1
0
mirror of http://aero2k.de/t/repos/urlbot-native.git synced 2017-09-06 15:25:38 +02:00

testing levenshtein distance for (url, title)

This commit is contained in:
urlbot
2014-09-29 00:11:40 +02:00
parent eba70a5ed0
commit 98dd94fc63
2 changed files with 24 additions and 1 deletions

View File

@@ -42,6 +42,27 @@ def conf_load():
fd.seek(0) fd.seek(0)
return pickle.load(fd) return pickle.load(fd)
def levenshtein(a, b, return_table=False):
    '''Return the Levenshtein (edit) distance between a and b.

    The distance is the minimum number of single-element deletions,
    insertions and substitutions needed to turn ``a`` into ``b``.
    ``a`` and ``b`` may be any indexable sequences with a length
    (typically strings); either or both may be empty.

    If ``return_table`` is true, a tuple ``(table, distance)`` is returned,
    where ``table[i][j]`` is the distance between ``a[:i]`` and ``b[:j]``.
    Otherwise only the distance is returned.
    '''
    rows = len(a) + 1
    cols = len(b) + 1

    # Initialize the table with 0, except row 0 and column 0, which hold
    # their own index: turning a prefix of length n into the empty
    # sequence (or vice versa) costs exactly n edits.
    d = [
        [(i if 0 == j else j if 0 == i else 0) for j in range(cols)]
        for i in range(rows)
    ]

    for i in range(1, rows):
        for j in range(1, cols):
            if a[i-1] == b[j-1]:
                # Equal elements: no additional edit needed.
                d[i][j] = d[i-1][j-1]
            else:
                d[i][j] = min(
                    d[i-1][j] + 1,    # deletion
                    d[i][j-1] + 1,    # insertion
                    d[i-1][j-1] + 1,  # substitution
                )

    # Index the result cell explicitly instead of relying on the loop
    # variables: they are unbound when a or b is empty, because the
    # corresponding loop body never runs.
    distance = d[rows - 1][cols - 1]

    if return_table:
        return (d, distance)
    return distance
def get_version_git(): def get_version_git():
import subprocess import subprocess

View File

@@ -133,7 +133,9 @@ def extract_url(data):
(status, title) = extract_title(r) (status, title) = extract_title(r)
if 0 == status: if 0 == status:
message = 'Title: %s: %s' % (title.strip(), r) message = 'lev=%d/%d:%d Title: %s: %s' %(
levenshtein(r, title.strip()), len(title.strip()), len(r), title.strip(), r
)
elif 1 == status: elif 1 == status:
logger('info', 'no message sent for non-text %s (%s)' %(r, title)) logger('info', 'no message sent for non-text %s (%s)' %(r, title))
continue continue