понедельник, 13 апреля 2009
читать дальше
import re
import codecs
def mark(line, word, n):
    """Return *line* with the n-th occurrence of *word* wrapped in braces.

    Occurrences are counted from zero and only whole ``\\w+`` tokens are
    considered, so ``'aa'`` never matches inside ``'aaa'``.  Everything
    else in the line (punctuation, leading and repeated whitespace) is
    kept exactly as it was.

    Args:
        line: The text to search.
        word: The token to highlight.
        n: Zero-based index of the occurrence of *word* to mark.

    Returns:
        A copy of *line* with ``{`` and ``}`` placed around the selected
        occurrence, or *line* unchanged when *word* occurs fewer than
        ``n + 1`` times.

    >>> mark("aa bbb ccc dd", 'bbb', 0)
    'aa {bbb} ccc dd'
    >>> mark("aa aa aa", "aa", 1)
    'aa {aa} aa'
    >>> mark("aa bbb aa aa", "aa", 1)
    'aa bbb {aa} aa'
    >>> mark(" aa bbb ccc dd", 'bbb', 0)
    ' aa {bbb} ccc dd'
    """
    seen = 0
    for match in re.finditer(r'\w+', line, re.UNICODE):
        if match.group() != word:
            continue
        if seen == n:
            # Splice the braces into the original text instead of rebuilding
            # the line from tokens, so separators survive untouched.
            return (line[:match.start()] + '{' + word + '}'
                    + line[match.end():])
        seen += 1
    return line
def nevozm(text):
    """Map every word in *text* to the lines it occurs in, with it marked.

    Each line is split into ``\\w+`` tokens.  For every token, the line is
    stored under that token with the corresponding occurrence wrapped in
    braces by ``mark``; a word that appears several times in one line
    yields one entry per occurrence.

    Args:
        text: An iterable of lines (strings).

    Returns:
        A dict mapping each word to a list of marked lines, in the order
        the occurrences were encountered.  An entry identical to one
        already stored for the word (e.g. from a repeated input line) is
        not added again.

    >>> sorted(nevozm(["Shakal shagal", "Shakal skakal", "Shakal ustal, no Shakal ne upal"]).items())
    [('Shakal', ['{Shakal} shagal', '{Shakal} skakal', '{Shakal} ustal, no Shakal ne upal', 'Shakal ustal, no {Shakal} ne upal']), ('ne', ['Shakal ustal, no Shakal {ne} upal']), ('no', ['Shakal ustal, {no} Shakal ne upal']), ('shagal', ['Shakal {shagal}']), ('skakal', ['Shakal {skakal}']), ('upal', ['Shakal ustal, no Shakal ne {upal}']), ('ustal', ['Shakal {ustal}, no Shakal ne upal'])]
    >>> nevozm(['Aadvark with lots of wings,', 'Yzarg with an interesting feature of formicating,']).get('') is None
    True
    """
    index = {}
    for line in text:
        # How many times each word has been seen so far in *this* line; this
        # is the occurrence number handed to mark().  Keeping it per line and
        # per word avoids reusing a stale counter left by another word.
        seen_in_line = {}
        for word in re.findall(r'\w+', line, re.UNICODE):
            occurrence = seen_in_line.get(word, 0)
            seen_in_line[word] = occurrence + 1
            marked = mark(line, word, occurrence)
            entries = index.setdefault(word, [])
            if marked not in entries:
                entries.append(marked)
    return index
if __name__ == '__main__':
    import doctest
    # Run the doctests embedded in this module before touching real input.
    doctest.testmod()

    # Read the input lines, stripping trailing whitespace (newlines included).
    # The with-block closes the file even if reading fails.
    with codecs.open('a.txt', encoding='utf-8', mode='r') as source:
        text = [raw.rstrip() for raw in source.readlines()]
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(text)

    # Build the index once and reuse it for both the console and the file.
    index = nevozm(text)
    print(index)

    # Write each word on its own line, followed by its marked lines.
    with codecs.open('c:\\result.txt', encoding='utf-8', mode='w') as output:
        for word, marked_lines in sorted(index.items()):
            output.write(word + '\n')
            for marked in marked_lines:
                output.write(marked + '\n')
@темы:
ботанизм,
будни,
задушенный шакал