mirror of
http://aero2k.de/t/repos/urlbot-native.git
synced 2017-09-06 15:25:38 +02:00
testing str_sim() for (title, lev_url)
This commit is contained in:
53
strsim.py
53
strsim.py
@@ -1,8 +1,9 @@
|
||||
#!/usr/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
|
||||
def str_sim(a, b):
|
||||
def str_sim(a, b, do_print=False):
|
||||
a = a.lower()
|
||||
b = b.lower()
|
||||
|
||||
@@ -17,14 +18,15 @@ def str_sim(a, b):
|
||||
if a_parts[i] == b_parts[j]:
|
||||
out[i][j] += 1
|
||||
|
||||
i = 0
|
||||
for j in range(0, len(b_parts)):
|
||||
print(' |'*i + ' '*2 + '.- ' + b_parts[j])
|
||||
i += 1
|
||||
print(' |'*i)
|
||||
if do_print:
|
||||
i = 0
|
||||
for j in range(0, len(b_parts)):
|
||||
print(' |'*i + ' '*2 + '.- ' + b_parts[j])
|
||||
i += 1
|
||||
print(' |'*i)
|
||||
|
||||
for i in range(0, len(a_parts)):
|
||||
print(' ' + str(out[i]) + ' ' + a_parts[i])
|
||||
for i in range(0, len(a_parts)):
|
||||
print(' ' + str(out[i]) + ' ' + a_parts[i])
|
||||
|
||||
return out
|
||||
|
||||
@@ -38,26 +40,27 @@ def sum_array(array):
|
||||
return _sum
|
||||
|
||||
def wrapper_print(a, b, comment=''):
    """Compare *a* and *b* via str_sim() and print a one-line summary.

    str_sim() is called with ``do_print=True`` so that it prints its own
    table first.  The summary line printed afterwards has the form
    ``[ROWSxCOLS::SUM]``, where ROWS and COLS are the lengths of the
    returned matrix and of its first row, and SUM is sum_array() of the
    matrix.  A non-empty *comment* is appended after a ``' ^ '`` marker.
    """
    matrix = str_sim(a, b, do_print=True)

    # Only decorate the comment when there is one to show.
    suffix = ' ^ ' + comment if comment != '' else comment

    rows = len(matrix)
    cols = len(matrix[0])
    total = sum_array(matrix)
    print('[%2dx%2d::%2d]%s' % (rows, cols, total, suffix))
|
||||
|
||||
if __name__ == '__main__':
    # Sample (url, title) pairs used to exercise str_sim() by hand.
    pairs = (
        (
            'http://de.wikipedia.org/wiki/Monte_Kali_%28Heringen%29',
            'Monte Kali (Heringen)',
        ),
        (
            'http://www.spiegel.de/politik/ausland/buddhisten-treffen-in-colombo-blitzender-moench-a-994447.html',
            'Buddhisten-Treffen in Colombo: Blitzender Mönch - SPIEGEL ONLINE',
        ),
    )

    # Trivial sanity check before the real data.
    wrapper_print('foo bar baz', 'foo bar boom')

    for url, title in pairs:
        # Progressively reduce the URL: first drop protocol and domain,
        # then additionally drop all digits.
        path_only = re.sub(r'https?://[^/]*/', '', url)
        path_no_digits = re.sub(r'[0-9]*', '', path_only)

        variants = (
            (url, 'raw'),
            (path_only, 'no proto/domain'),
            (path_no_digits, 'no proto/domain/[0-9]'),
        )
        for candidate, label in variants:
            wrapper_print(title, candidate, comment=label)
|
||||
|
||||
Reference in New Issue
Block a user