понедельник, 27 апреля 2009
теги видим?))
import re
import codecs
#def mark(line, word, n):
# '''
# >>> mark("aa bbb ccc dd", 'bbb', 0)
# 'aa {bbb} ccc dd'
# >>> mark("aa aa aa", "aa", 1)
# 'aa {aa} aa'
# >>> mark("aa bbb aa aa", "aa", 1)
# 'aa bbb {aa} aa'
# >>> mark(" aa bbb aa aa", "aa", 1)
# ' aa bbb {aa} aa'
# '''
# res = ""
# for i, w in enumerate(re.findall('(\w+)', line, re.UNICODE)):
# if w != word:
# n += 1
# else:
# if i == n:
# w = '{' + word + '}'
# res += w
# res += ' '
# resstr = res[0:(len(res)-1)]
# return resstr
def mark(line, word, n):
    '''
    Return ``line`` with one whole-word occurrence of ``word`` wrapped in braces.

    The line is split into tokens made of word characters (Unicode-aware).
    ``n`` is zero-based and counts only the tokens equal to ``word``, so a
    longer token that merely contains ``word`` is never marked. Spacing and
    punctuation around the marked token are preserved.

    Raises ValueError when ``line`` has no occurrence number ``n`` of ``word``.

    >>> mark("aa bbb ccc dd bbb", 'bbb', 0)
    'aa {bbb} ccc dd bbb'
    >>> mark("aa aa aa", "aa", 1)
    'aa {aa} aa'
    >>> mark("aa bbb aa aa", "aa", 1)
    'aa bbb {aa} aa'
    >>> mark(" aa bbb aa", "aa", 1)
    ' aa bbb {aa}'
    >>> mark("aaa aa", "aa", 0)
    'aaa {aa}'
    '''
    seen = 0  # how many tokens equal to `word` have been passed so far
    for m in re.finditer(r'\w+', line, re.UNICODE):
        if m.group() != word:
            continue
        if seen == n:
            # Splice by match offsets so the rest of the line stays untouched.
            return line[:m.start()] + '{' + word + '}' + line[m.end():]
        seen += 1
    # Falling out of the loop means the requested occurrence does not exist.
    raise ValueError('no occurrence #%d of %r in %r' % (n, word, line))
def nevozm(text):
    '''
    Map every word of ``text`` to the lines it occurs in, with the occurrence
    wrapped in braces.

    ``text`` is an iterable of strings (lines). Words are runs of word
    characters (Unicode-aware). Each occurrence of a word contributes one
    marked copy of its line, in reading order; a marked line that is already
    recorded for that word (for example from a repeated input line) is not
    added a second time.

    Returns a plain dict ``{word: [marked_line, ...]}``.

    >>> sorted(nevozm(["Shakal shagal", "Shakal skakal", "Shakal ustal, no Shakal ne upal"]).items())
    [('Shakal', ['{Shakal} shagal', '{Shakal} skakal', '{Shakal} ustal, no Shakal ne upal', 'Shakal ustal, no {Shakal} ne upal']), ('ne', ['Shakal ustal, no Shakal {ne} upal']), ('no', ['Shakal ustal, {no} Shakal ne upal']), ('shagal', ['Shakal {shagal}']), ('skakal', ['Shakal {skakal}']), ('upal', ['Shakal ustal, no Shakal ne {upal}']), ('ustal', ['Shakal {ustal}, no Shakal ne upal'])]
    >>> nevozm(['Aadvark with lots of wings,', 'Yzarg with an interesting feature of formicating,']).get('') is None
    True
    >>> sorted(nevozm(["a b", "a b"]).items())
    [('a', ['{a} b']), ('b', ['a {b}'])]
    '''
    d = {}
    for line in text:
        for m in re.finditer(r'\w+', line, re.UNICODE):
            word = m.group()
            # Mark straight from the match span: every occurrence is handled
            # exactly once and no occurrence counter has to be carried between
            # words or lines.
            marked = line[:m.start()] + '{' + word + '}' + line[m.end():]
            contexts = d.setdefault(word, [])
            if marked not in contexts:
                contexts.append(marked)
    return d
if __name__ == '__main__':
    # Script entry point: run the module doctests, then read a.txt, print the
    # lines and the word -> marked-lines mapping, and write that mapping to
    # c:\result.txt.
    import doctest
    doctest.testmod()

    # Read the input as UTF-8; `with` closes the file even if reading fails.
    with codecs.open('a.txt', encoding='utf-8', mode='r') as source:
        text = [s.rstrip() for s in source.readlines()]

    # A parenthesised single argument prints the same under the Python 2
    # print statement and the Python 3 print function.
    print(text)

    # Compute the mapping once and reuse it for both the printout and the file.
    result = nevozm(text)
    print(result)

    # Output layout: each word on its own line, followed by its marked lines,
    # words in sorted order.
    with codecs.open('c:\\result.txt', encoding='utf-8', mode='w') as output:
        for word, contexts in sorted(result.items()):
            output.write(word + '\n')
            for marked in contexts:
                output.write(marked + '\n')
@темы:
ботанизм,
будни,
гордое до невозможности,
задушенный шакал
предлагаю флешмоб - перебери все 16 млн цветов)))