testing str_sim() for (title, lev_url)
This commit is contained in:
53
strsim.py
53
strsim.py
@@ -1,8 +1,9 @@
|
|||||||
#!/usr/bin/python3
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
def str_sim(a, b):
|
def str_sim(a, b, do_print=False):
|
||||||
a = a.lower()
|
a = a.lower()
|
||||||
b = b.lower()
|
b = b.lower()
|
||||||
|
|
||||||
@@ -17,14 +18,15 @@ def str_sim(a, b):
|
|||||||
if a_parts[i] == b_parts[j]:
|
if a_parts[i] == b_parts[j]:
|
||||||
out[i][j] += 1
|
out[i][j] += 1
|
||||||
|
|
||||||
i = 0
|
if do_print:
|
||||||
for j in range(0, len(b_parts)):
|
i = 0
|
||||||
print(' |'*i + ' '*2 + '.- ' + b_parts[j])
|
for j in range(0, len(b_parts)):
|
||||||
i += 1
|
print(' |'*i + ' '*2 + '.- ' + b_parts[j])
|
||||||
print(' |'*i)
|
i += 1
|
||||||
|
print(' |'*i)
|
||||||
|
|
||||||
for i in range(0, len(a_parts)):
|
for i in range(0, len(a_parts)):
|
||||||
print(' ' + str(out[i]) + ' ' + a_parts[i])
|
print(' ' + str(out[i]) + ' ' + a_parts[i])
|
||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
@@ -38,26 +40,27 @@ def sum_array(array):
|
|||||||
return _sum
|
return _sum
|
||||||
|
|
||||||
def wrapper_print(a, b, comment=''):
|
def wrapper_print(a, b, comment=''):
|
||||||
ret = str_sim(a, b)
|
ret = str_sim(a, b, do_print=True)
|
||||||
if '' != comment: comment = ' ^ ' + comment
|
if '' != comment: comment = ' ^ ' + comment
|
||||||
print('[%2dx%2d::%2d]%s' %(len(ret), len(ret[0]), sum_array(ret), comment))
|
print('[%2dx%2d::%2d]%s' %(len(ret), len(ret[0]), sum_array(ret), comment))
|
||||||
|
|
||||||
pairs = (
|
if '__main__' == __name__:
|
||||||
(
|
pairs = (
|
||||||
'http://de.wikipedia.org/wiki/Monte_Kali_%28Heringen%29',
|
(
|
||||||
'Monte Kali (Heringen)'
|
'http://de.wikipedia.org/wiki/Monte_Kali_%28Heringen%29',
|
||||||
),
|
'Monte Kali (Heringen)'
|
||||||
(
|
),
|
||||||
'http://www.spiegel.de/politik/ausland/buddhisten-treffen-in-colombo-blitzender-moench-a-994447.html',
|
(
|
||||||
'Buddhisten-Treffen in Colombo: Blitzender Mönch - SPIEGEL ONLINE'
|
'http://www.spiegel.de/politik/ausland/buddhisten-treffen-in-colombo-blitzender-moench-a-994447.html',
|
||||||
|
'Buddhisten-Treffen in Colombo: Blitzender Mönch - SPIEGEL ONLINE'
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
wrapper_print('foo bar baz', 'foo bar boom')
|
wrapper_print('foo bar baz', 'foo bar boom')
|
||||||
|
|
||||||
for (url, title) in pairs:
|
for (url, title) in pairs:
|
||||||
wrapper_print(title, url, comment='raw')
|
wrapper_print(title, url, comment='raw')
|
||||||
url_no_proto = re.sub(r'https?://[^/]*/', '', url)
|
url_no_proto = re.sub(r'https?://[^/]*/', '', url)
|
||||||
wrapper_print(title, url_no_proto, comment='no proto/domain')
|
wrapper_print(title, url_no_proto, comment='no proto/domain')
|
||||||
url_no_proto_no_digits = re.sub(r'[0-9]*', '', url_no_proto)
|
url_no_proto_no_digits = re.sub(r'[0-9]*', '', url_no_proto)
|
||||||
wrapper_print(title, url_no_proto_no_digits, comment='no proto/domain/[0-9]')
|
wrapper_print(title, url_no_proto_no_digits, comment='no proto/domain/[0-9]')
|
||||||
|
|||||||
11
urlbot.py
11
urlbot.py
@@ -5,6 +5,7 @@ import sys, os, stat, re, time, pickle, random
|
|||||||
import urllib.request, urllib.parse, urllib.error, html.parser
|
import urllib.request, urllib.parse, urllib.error, html.parser
|
||||||
from local_config import conf, set_conf
|
from local_config import conf, set_conf
|
||||||
from common import *
|
from common import *
|
||||||
|
from strsim import str_sim
|
||||||
|
|
||||||
# rate limiting to 5 messages per 10 minutes
|
# rate limiting to 5 messages per 10 minutes
|
||||||
hist_ts = []
|
hist_ts = []
|
||||||
@@ -145,7 +146,15 @@ def extract_url(data):
|
|||||||
conf_save(obj)
|
conf_save(obj)
|
||||||
|
|
||||||
lev_str = 'lev=%d/%d:%d ' %(lev_res, len(title), len(lev_url))
|
lev_str = 'lev=%d/%d:%d ' %(lev_res, len(title), len(lev_url))
|
||||||
message = lev_str + 'Title: %s: %s' %(title, r)
|
|
||||||
|
sim = str_sim(title, lev_url)
|
||||||
|
sim_len_title = len(sim)
|
||||||
|
sim_len_url = len(sim[0])
|
||||||
|
sim_sum = sum([sum(a) for a in sim])
|
||||||
|
|
||||||
|
sim_str = 'sim=%d/%d:%d ' %(sim_sum, sim_len_title, sim_len_url)
|
||||||
|
|
||||||
|
message = lev_str + sim_str + 'Title: %s: %s' %(title, r)
|
||||||
elif 1 == status:
|
elif 1 == status:
|
||||||
if conf('image_preview'):
|
if conf('image_preview'):
|
||||||
# of course it's fake, but it looks interesting at least
|
# of course it's fake, but it looks interesting at least
|
||||||
|
|||||||
Reference in New Issue
Block a user