def levenshtein(a, b, return_table=False):
    '''Return the Levenshtein (edit) distance between a and b.

    The distance is the minimum number of single-element deletions,
    insertions and substitutions needed to turn a into b.

    a, b         -- indexable sequences (e.g. strings) supporting len()
    return_table -- if true, return a tuple (table, distance) where
                    table[i][j] is the distance between a[:i] and b[:j];
                    otherwise return only the distance
    '''
    rows = len(a) + 1
    cols = len(b) + 1

    # d[i][j] holds the distance between the prefixes a[:i] and b[:j].
    # Row 0 / column 0 are the distances to the empty prefix, i.e. the
    # prefix length itself; every other cell is filled in below.
    d = [[0] * cols for _ in range(rows)]
    for i in range(rows):
        d[i][0] = i
    for j in range(cols):
        d[0][j] = j

    for i in range(1, rows):
        for j in range(1, cols):
            if a[i-1] == b[j-1]:
                # equal elements: nothing to pay on top of the prefixes
                d[i][j] = d[i-1][j-1]
            else:
                d[i][j] = min(
                    d[i-1][j] + 1,    # deletion
                    d[i][j-1] + 1,    # insertion
                    d[i-1][j-1] + 1,  # substitution
                )

    # Index by the lengths instead of the leftover loop variables: when a
    # or b is empty the loops above never run and i/j would be unbound.
    distance = d[len(a)][len(b)]

    if return_table:
        return (d, distance)
    return distance