mirror of
http://aero2k.de/t/repos/urlbot-native.git
synced 2017-09-06 15:25:38 +02:00
testing levenshtein distance for (url, title)
This commit is contained in:
21
common.py
21
common.py
@@ -42,6 +42,27 @@ def conf_load():
|
|||||||
fd.seek(0)
|
fd.seek(0)
|
||||||
return pickle.load(fd)
|
return pickle.load(fd)
|
||||||
|
|
||||||
|
def levenshtein(a, b, return_table=False):
    '''Return the Levenshtein (edit) distance between a and b.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn a into b.

    a, b         -- indexable sequences supporting len() (e.g. strings);
                    either or both may be empty
    return_table -- if true, return a tuple (table, distance) where table
                    is the full (len(a)+1) x (len(b)+1) matrix of
                    prefix distances; otherwise return only the distance
    '''
    rows = len(a) + 1
    cols = len(b) + 1

    # initialise a table with 0, but the 0-row/0-column with their index:
    # turning a prefix of length n into the empty sequence (or vice versa)
    # always costs n edits
    d = [[(i if 0 == j else j if 0 == i else 0) for j in range(cols)] for i in range(rows)]

    for i in range(1, rows):
        for j in range(1, cols):
            if a[i-1] == b[j-1]:
                # matching elements cost nothing extra
                d[i][j] = d[i-1][j-1]
            else:
                d[i][j] = min(
                    d[i-1][j] + 1,    # deletion
                    d[i][j-1] + 1,    # insertion
                    d[i-1][j-1] + 1,  # substitution
                )

    # Read the result from the bottom-right cell directly instead of via the
    # loop variables: those stay unbound when a or b is empty, because the
    # corresponding loop body never executes.
    distance = d[-1][-1]

    if return_table:
        return (d, distance)
    return distance
|
||||||
|
|
||||||
def get_version_git():
|
def get_version_git():
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
|
|||||||
@@ -133,7 +133,9 @@ def extract_url(data):
|
|||||||
(status, title) = extract_title(r)
|
(status, title) = extract_title(r)
|
||||||
|
|
||||||
if 0 == status:
|
if 0 == status:
|
||||||
message = 'Title: %s: %s' % (title.strip(), r)
|
message = 'lev=%d/%d:%d Title: %s: %s' %(
|
||||||
|
levenshtein(r, title.strip()), len(title.strip()), len(r), title.strip(), r
|
||||||
|
)
|
||||||
elif 1 == status:
|
elif 1 == status:
|
||||||
logger('info', 'no message sent for non-text %s (%s)' %(r, title))
|
logger('info', 'no message sent for non-text %s (%s)' %(r, title))
|
||||||
continue
|
continue
|
||||||
|
|||||||
Reference in New Issue
Block a user