వాడుకరి:Mpradeepbot/mpc.wkt.brown.py
స్వరూపం
ఈ ప్రోగ్రాముకు అనుబంధంగా ఈ ఫైలుని వాడండి. బ్రౌను పదకోశం డేటాబేసు ఈ విధంగా ఉంటుంది.
import wikipedia, time, config, codecs
# Overwrite the wiki page 'pageTitle' with the text 'pageData', using
# 'comment' as the edit summary.
# Edit conflicts and spam-blacklist rejections are reported through
# wikipedia.output and otherwise ignored. The function always pauses for
# one second before returning.
def writeData(pageTitle, pageData, comment):
    target = wikipedia.Page(wikipedia.getSite(), pageTitle)

    # The current text is requested but its value is never used: the page is
    # replaced wholesale. A page that does not exist yet is not an error.
    try:
        target.get()
    except wikipedia.NoPage:
        pass

    try:
        target.put(pageData, comment=comment)
    except wikipedia.EditConflict:
        wikipedia.output(u'Skipping %s because of edit conflict' % (target.title()))
    except wikipedia.SpamfilterError as url:
        wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (target.title(), url))

    # Fixed pause between consecutive edits.
    wikipedia.output(u'Waiting for 1 second(s)')
    time.sleep(1)
# Append a record of the edit (title, edit summary, page text) to 'logfile'
# and then write 'pageData' to the wiki page 'pageTitle' via writeData.
# 'logfile' only needs a write() method that accepts unicode text.
def writeLogData(pageTitle, pageData, comment, logfile):
    emit = logfile.write
    emit(u'Title: ' + pageTitle + u'\r\n')
    emit(u'comment: ' + comment + u'\r\n')
    # Three CRLF pairs separate one log record from the next.
    emit(pageData + u'\r\n\r\n\r\n')
    writeData(pageTitle, pageData, comment)
# Fetch the current text of the wiki page 'pageTitle'.
# Returns an empty unicode string when the page does not exist.
def getData(pageTitle):
    wikiPage = wikipedia.Page(wikipedia.getSite(), pageTitle)
    try:
        # NOTE(review): nofollow_redirects=True presumably returns the text of
        # a redirect page itself instead of raising -- confirm against the
        # pywikipedia version in use.
        return wikiPage.get(nofollow_redirects=True)
    except wikipedia.NoPage:
        return u''
# Return the 'position'-th (1-based) quoted field of 'line', without its
# surrounding single quotes and with leading/trailing whitespace stripped.
# Fields are delimited by a closing quote immediately followed by a comma.
def getElement(line, position):
    separator = '-|-|-'
    # Swap the comma after each closing quote for a marker so that the
    # closing quote stays attached to its field when splitting.
    fields = line.replace("',", "'" + separator).split(separator)
    field = fields[position - 1]
    start = field.find("'") + 1
    end = field.rfind("'")
    return field[start:end].strip()
# Translate a part-of-speech abbreviation into the matching entry of
# 'posWords'. 'posWords' is indexed in the order:
#   adjective, noun, pronoun, verb, 'p', adverb, preposition,
#   conjunction, interjection
# An abbreviation that is not recognised is returned unchanged.
def replacePos(posWords, pos):
    abbreviationSlots = (
        (('a', 'adj'), 0),
        (('n',), 1),
        (('pron',), 2),
        (('v',), 3),
        (('p',), 4),
        (('adv',), 5),
        (('prep',), 6),
        (('conj',), 7),
        (('interj',), 8),
    )
    for abbreviations, slot in abbreviationSlots:
        if pos in abbreviations:
            # posWords is only indexed for a recognised abbreviation.
            return posWords[slot]
    return pos
# ---------------------------------------------------------------------------
# Main script.
#
# Reads SQL "INSERT INTO `eng2te` VALUES (...)" rows from the data file, one
# per line, and for each row creates or updates the corresponding word page
# (and, where applicable, a redirect page) through writeLogData.
#
# The template file 'mpc.wkt.brown.musa.txt' supplies, in this order:
#   9 lines  - the part-of-speech words used by replacePos
#   1 line   - brownLine (text placed at the start of every word page)
#   2 lines  - refLine1 and refLine2
#   1 line   - catline
#   1 line   - revLine (appended when an existing page is extended)
# ---------------------------------------------------------------------------
dataFile = None
inputFile = None
logfile = None
try:
    dataFile = open('mpc.wkt.brown.part62.txt', 'rb')
    inputFile = open('mpc.wkt.brown.musa.txt', 'rb')
    logfile = codecs.open('mpc.wkt.brown.log', encoding='utf-8', mode='wb')

    # Skip the UTF-8 byte-order mark of the template file, but only when it
    # is really there; otherwise rewind so no data bytes are lost.
    if inputFile.read(len(codecs.BOM_UTF8)) != codecs.BOM_UTF8:
        inputFile.seek(0)

    engName = ''
    meaning = ''
    pos = ''
    posType = ''

    # initialize the parts of speech: each word is wrapped in ''' (wiki bold)
    posWords = []
    count = 0
    while count < 9:
        line = unicode(inputFile.readline(), 'utf8')
        line = line.replace(u'\n', u'')
        line = line.replace(u'\r', u'')
        posWords.append(u'\'\'\'' + line + u'\'\'\'')
        count = count + 1

    # The remaining template lines keep their line terminators.
    brownLine = unicode(inputFile.readline(), 'utf8')
    refLine1 = unicode(inputFile.readline(), 'utf8')
    refLine2 = unicode(inputFile.readline(), 'utf8')
    catline = unicode(inputFile.readline(), 'utf8')
    revLine = unicode(inputFile.readline(), 'utf8')

    count = 0
    site = wikipedia.getSite()
    for line in dataFile:
        line = unicode(line, 'utf8')

        # A blank line carries no record; getElement would fail on it.
        if line.strip() == u'':
            continue

        # Reduce the SQL statement to its comma-separated quoted values.
        line = line.replace('INSERT INTO `eng2te` VALUES (', '')
        line = line.replace('\');', '\'')

        engName = getElement(line, 1)
        pos = getElement(line, 2)
        posType = getElement(line, 3)
        meaning = getElement(line, 4)

        # update the parts of speech
        pos = replacePos(posWords, pos)
        posType = replacePos(posWords, posType)

        # Check if the current page becomes a redirect page
        redirectTo = u''
        if meaning[0:4] in (u'See ', u'see '):
            # The end index is the length of the meaning without any dots,
            # which drops a single trailing full stop.
            # NOTE(review): a meaning with dots elsewhere is cut short.
            redirectTo = meaning[4:len(meaning.replace(u'.', u''))]
            # Drop a leading "To " so the target matches the page names
            # produced by the 'to' case below.
            if redirectTo[0:3] in (u'To ', u'to '):
                redirectTo = redirectTo[3:]

        # Check if current page will have redirects from any page
        redirectFrom = u''
        # the 'to' case: "To abandon" is filed under "abandon"
        if engName[0:3] in (u'To ', u'to '):
            redirectFrom = engName
            engName = engName[3:]
        # the 'or' case: the second alternative redirects to the first
        if engName.find(u' or ') != -1:
            alternatives = engName.split(u' or ')
            redirectFrom = alternatives[1]
            engName = alternatives[0]
        # the ',' case: the second item redirects to the first
        if engName.find(u',') != -1:
            alternatives = engName.split(u',')
            redirectFrom = alternatives[1]
            engName = alternatives[0]

        # Undo SQL quote doubling and normalise all titles to lower case.
        engName = engName.replace(u'\'\'', u'\'')
        engName = engName.lower()
        redirectFrom = redirectFrom.lower()
        redirectTo = redirectTo.lower()

        # replace the * in meaning with engName (in wiki italics)
        meaning = meaning.replace(u'*', u'\'\'' + engName + u'\'\'')

        # divide the examples in the meaning: normalise to one space after
        # each full stop, then turn every sentence break except the last
        # into a new bullet line
        meaning = meaning.replace(u'. ', u'.')
        meaning = meaning.replace(u'.', u'. ')
        if meaning.count(u'. ') >= 2:
            meaning = meaning.replace(u'. ', u'.\n* ', meaning.count(u'. ') - 1)

        # build the text for the pages
        redirectFromData = u''
        mainPageData = u''
        if redirectFrom != u'':
            redirectFromData = u'#REDIRECT [[' + engName + u']]\n'
        if redirectTo != u'':
            mainPageData = u'#REDIRECT [[' + redirectTo + u']]\n'
            comment = u'Bot: creating redirect page'
        else:
            mainPageData = brownLine
            if pos != u'':
                mainPageData = mainPageData + pos + u', '
            if posType != u'':
                mainPageData = mainPageData + posType + u', '
            mainPageData = mainPageData + meaning + u'\n\n\n'
            mainPageData = mainPageData + refLine1 + refLine2 + u'\n'
            mainPageData = mainPageData + catline + u'\n'
            mainPageData = mainPageData + u'<!-- Interwiki Links -->\n[[en:' + engName + u']]'
            comment = u'Bot: creating page for a word'

        wikipedia.output(u'' + mainPageData)
        wikipedia.output(u'')
        wikipedia.output(u'')
        wikipedia.output(u'')

        # upload to wiktionary
        # upload the redirectFrom page
        if redirectFrom != u'':
            data = getData(redirectFrom)
            if (data + u'\n') == redirectFromData:
                wikipedia.output(u'no need to update any thing')
            elif data == u'':
                writeLogData(redirectFrom, redirectFromData, u'Bot: creating redirect page', logfile)
            else:
                # The page already holds other text: leave it alone and post
                # the proposed redirect on its talk page instead.
                writeLogData(u'Talk:' + redirectFrom, u'Add the following text to main page\n ' + redirectFromData + u'', u'Bot: creating redirect page', logfile)

        # upload the main page
        data = getData(engName)
        if (data + u'\n') == mainPageData:
            wikipedia.output(u'no need to update any thing')
        elif data == u'':
            writeLogData(engName, mainPageData, comment, logfile)
        elif redirectTo != u'':
            # Never replace existing text with a redirect; suggest it on the
            # talk page instead.
            writeLogData(u'Talk:' + engName, u'Add the following text to main page\n ' + mainPageData + u'', comment, logfile)
        elif data.find(mainPageData) != -1:
            wikipedia.output(u'no need to do any update')
        else:
            # Append the new entry, followed by revLine, to the existing text.
            writeLogData(engName, data + u'\n\n' + mainPageData + u'\n\n' + revLine, u'Bot: Updating word page with meaning from Brown dictionary', logfile)

        # Counts every processed record, including those needing no edit.
        count = count + 1

        ## uncomment the following lines while testing the BOT
        # if count >= 10:
        #     break

    print('Total records uploaded - ' + str(count))
finally:
    # Close whatever was opened, even when the run is aborted by an error.
    for handle in (dataFile, inputFile, logfile):
        if handle is not None:
            handle.close()