def levenshtein(a, b, return_table=False):
    '''Return the Levenshtein (edit) distance between a and b.

    The distance is the minimum number of single-element deletions,
    insertions and substitutions needed to turn a into b.

    a, b          sequences supporting len() and iteration (e.g. strings);
                  either or both may be empty
    return_table  if true, return a tuple (table, distance) where table is
                  the full (len(a)+1) x (len(b)+1) dynamic-programming
                  matrix as a list of lists; otherwise return only the
                  distance
    '''
    rows = len(a) + 1
    cols = len(b) + 1

    # initialize a table with 0, but the 0-row/0-col with their index:
    # d[i][0] == i (delete i elements), d[0][j] == j (insert j elements)
    d = [[0] * cols for _ in range(rows)]
    for i in range(rows):
        d[i][0] = i
    for j in range(cols):
        d[0][j] = j

    for i, item_a in enumerate(a, 1):
        for j, item_b in enumerate(b, 1):
            if item_a == item_b:
                # equal elements cost nothing, carry the diagonal over
                d[i][j] = d[i-1][j-1]
            else:
                d[i][j] = 1 + min(
                    d[i-1][j],    # deletion
                    d[i][j-1],    # insertion
                    d[i-1][j-1],  # substitution
                )

    # Read the result from the bottom-right cell instead of via the loop
    # variables: with an empty a or b the loops above never bind i/j to the
    # last index (or at all), which previously raised a NameError.
    distance = d[-1][-1]

    if return_table:
        return (d, distance)
    return distance