понедельник, 16 февраля 2009
читать дальше
import codecs
from genshi.template import TemplateLoader
import os
class Entry(object):
    """One occurrence of a word together with the text surrounding it.

    The attribute spellings (including the capitalised ``LineNumber``) are
    part of the class's interface and are kept unchanged.
    """

    def __init__(self, word, leftContext, rightContext, Linenumber):
        # The matched slice of the line.
        self.word = word
        # Text of the line before and after the matched slice.
        self.leftContext = leftContext
        self.rightContext = rightContext
        # Note: stored as ``LineNumber`` although the parameter is
        # spelled ``Linenumber``.
        self.LineNumber = Linenumber

    def __repr__(self):
        # Readable representation for debugging and interactive use.
        return "Entry(word=%r, leftContext=%r, rightContext=%r, LineNumber=%r)" % (
            self.word,
            self.leftContext,
            self.rightContext,
            self.LineNumber,
        )
# Characters treated as word separators by razbivalka.
GADOSTI = "! ?.,-;\n\"\'\t\r"


def razbivalka(line, lnum):
    """Split *line* into lower-cased words and record where each one ends.

    Any character listed in GADOSTI separates words.  The result maps every
    word to a list of ``(lnum, position)`` pairs, one per occurrence, where
    ``position`` is one more than the index of the character that follows
    the word.  A word that runs up to the end of the line is treated as if
    a separator followed it, so ``position - len(word) - 1`` is always the
    index at which the word starts.

    >>> razbivalka("far away", 1) == {'far': [(1, 4)], 'away': [(1, 9)]}
    True
    >>> razbivalka("far;;!;;away", 1) == {'far': [(1, 4)], 'away': [(1, 13)]}
    True
    >>> razbivalka(" far away ", 1) == {'far': [(1, 5)], 'away': [(1, 10)]}
    True
    >>> razbivalka(" ", 1) == {}
    True
    >>> razbivalka("foobar,", 1) == {'foobar': [(1, 7)]}
    True
    >>> razbivalka(",,,,,", 1) == {}
    True
    """
    d = {}
    lowered = line.lower()
    # Index where the word currently being scanned began, or None while
    # scanning separators.  Slicing avoids rebuilding the word char by char.
    start = None
    for pos, ch in enumerate(lowered):
        if ch not in GADOSTI:
            if start is None:
                start = pos
        elif start is not None:
            d.setdefault(lowered[start:pos], []).append((lnum, pos + 1))
            start = None
    if start is not None:
        # The line ended in the middle of a word: use the index just past
        # the end of the line so the position follows the same convention
        # as for separator-terminated words.
        d.setdefault(lowered[start:], []).append((lnum, len(lowered) + 1))
    return d
def main():
    """Index ``file2.txt`` and render the result through final().

    Steps:
      1. Read ``file2.txt`` (UTF-8) and write its lines to ``file2.htm``.
      2. Run razbivalka over every line and merge the per-line results into
         one dict mapping each word to its sorted ``(line index, position)``
         pairs.
      3. Wrap every occurrence in an Entry and pass everything to final().
    """
    with codecs.open("file2.txt", encoding="utf-8") as f:
        linesaved = f.readlines()

    # NOTE(review): the literals written around each line look like they
    # originally held markup that was lost when this code was pasted (an
    # unused per-line counter sat next to them) -- confirm against the
    # original script.  As it stands each line is written followed by a
    # newline.
    with codecs.open("file2.htm", "w", encoding="utf-8") as fnum:
        for line in linesaved:
            fnum.write(line + '\n')

    final_dict = {}
    for lnum, line in enumerate(linesaved):
        for word, positions in razbivalka(line, lnum).items():
            final_dict.setdefault(word, []).extend(positions)
    for positions in final_dict.values():
        positions.sort()

    # "*+*" selects the "all words" report in final().  To look up a single
    # word instead, assign its lower-cased form here (the script used to
    # prompt for it with raw_input and decode the answer from cp1251).
    word_needed = "*+*"
    data = final_dict.get(word_needed)

    def vyvodilka(position, word):
        """Build the Entry for one ``(line index, position)`` occurrence."""
        lnum, sn = position
        # Clamp at 0 so a computed start below zero never wraps around to
        # the end of the line.
        word_beg = max(sn - len(word) - 1, 0)
        line = linesaved[lnum]
        return Entry(line[word_beg:sn], line[:word_beg], line[sn:], str(lnum))

    newdict = {}
    if data is not None or word_needed == '*+*':
        for word, positions in final_dict.items():
            newdict[word] = [vyvodilka(p, word) for p in positions]
    final(word_needed, newdict, data)
def final(word_needed, newdict, data):
    """Render the search result to ``example.html`` (UTF-16) with Genshi.

    The template is chosen as follows:
      * ``word_needed == "*+*"``: ``output-genshi-all.htm``, given the whole
        ``newdict``.
      * ``data is None``: ``output-genshi-failed.htm``, given the word as
        ``w``.
      * otherwise: ``output-genshi-ok.htm``, given the word as ``w``, its
        entries from ``newdict`` and their count as ``occurs``.

    Templates are loaded from the ``templates`` directory next to this file.
    """
    loader = TemplateLoader(
        os.path.join(os.path.dirname(__file__), 'templates'),
        auto_reload=True
    )
    if word_needed == "*+*":
        template_name = 'output-genshi-all.htm'
        context = {'newdict': newdict}
    elif data is None:
        template_name = 'output-genshi-failed.htm'
        context = {'w': word_needed}
    else:
        entries = list(newdict.get(word_needed, []))
        template_name = 'output-genshi-ok.htm'
        context = {'occurs': len(entries), 'w': word_needed, 'entries': entries}

    tmpl = loader.load(template_name)
    # encoding=None makes render() return text; the codecs writer below
    # does the UTF-16 encoding.
    markup = tmpl.generate(**context).render('html', doctype='html', encoding=None)
    # Render first, then open: the output file is only touched once the
    # markup exists, and the handle is always closed.
    with codecs.open('example.html', 'w', encoding='utf16') as f:
        f.write(markup)
if __name__ == "__main__":
    # Script entry point.  To run the doctests in razbivalka's docstring
    # instead of the program, replace the call below with:
    #     import doctest
    #     doctest.testmod()
    main()
@темы:
ботанизм,
задушенный шакал