testing str_sim() for (title, lev_url)

This commit is contained in:
urlbot
2014-10-05 23:39:51 +02:00
parent 09f96101f0
commit 6a916c701a
2 changed files with 38 additions and 26 deletions

View File

@@ -1,8 +1,9 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import re
def str_sim(a, b):
def str_sim(a, b, do_print=False):
a = a.lower()
b = b.lower()
@@ -17,6 +18,7 @@ def str_sim(a, b):
if a_parts[i] == b_parts[j]:
out[i][j] += 1
if do_print:
i = 0
for j in range(0, len(b_parts)):
print(' |'*i + ' '*2 + '.- ' + b_parts[j])
@@ -38,10 +40,11 @@ def sum_array(array):
return _sum
def wrapper_print(a, b, comment=''):
ret = str_sim(a, b)
ret = str_sim(a, b, do_print=True)
if '' != comment: comment = ' ^ ' + comment
print('[%2dx%2d::%2d]%s' %(len(ret), len(ret[0]), sum_array(ret), comment))
if '__main__' == __name__:
pairs = (
(
'http://de.wikipedia.org/wiki/Monte_Kali_%28Heringen%29',

View File

@@ -5,6 +5,7 @@ import sys, os, stat, re, time, pickle, random
import urllib.request, urllib.parse, urllib.error, html.parser
from local_config import conf, set_conf
from common import *
from strsim import str_sim
# rate limiting to 5 messages per 10 minutes
hist_ts = []
@@ -145,7 +146,15 @@ def extract_url(data):
conf_save(obj)
lev_str = 'lev=%d/%d:%d ' %(lev_res, len(title), len(lev_url))
message = lev_str + 'Title: %s: %s' %(title, r)
sim = str_sim(title, lev_url)
sim_len_title = len(sim)
sim_len_url = len(sim[0])
sim_sum = sum([sum(a) for a in sim])
sim_str = 'sim=%d/%d:%d ' %(sim_sum, sim_len_title, sim_len_url)
message = lev_str + sim_str + 'Title: %s: %s' %(title, r)
elif 1 == status:
if conf('image_preview'):
# of course it's fake, but it looks interesting at least