The algorithm is supposed to work on a reverse logic, considering which word affects the least number of verses and learning it last. It's too slow to handle the category four input file (I reckon it could take a week). I wrote a modified version which is much faster but does not perform as well.
The major time code is recalculating the word frequencies in each step. If this could be sped up, the algorithm would be tractable for larger inputs. The modified version runs in just a few minutes.
import sys
def remove_partial_verses(known_words, verseDict):
known_set = set(known_words)
for verseID, wordSet in verseDict.items():
if known_set.intersection(wordSet):
del verseDict[verseID]
def countWord(word, verseDict):
'''
This method counts the number of verses which contain the word
'''
count = 0
for verse in verseDict:
# count the number of verses
wordSet = verseDict[verse]
if word in wordSet: count += 1
return count
if __name__ == "__main__":
importFile = sys.argv[1]
inputFile = file(importFile, 'r')
wordDict = {}
verseDict = {}
for line in inputFile:
verse, word = line.split()
if verse in verseDict:
wordSet = verseDict[verse]
wordSet.add(word)
verseDict[verse] = wordSet
else:
verseDict[verse] = set([word])
if word in wordDict:
verseSet = wordDict[word]
verseSet.add(verse)
else:
wordDict[word] = set([verse])
totalWordSet = set(wordDict.keys())
known_words = []
learn_order = []
while(len(totalWordSet) > 0):
# print "Length: ", len(totalWordSet)
pairs = [(countWord(word, verseDict), word) for word in totalWordSet]
pairs.sort()
theWord = pairs[0][1]
learn_order.append(theWord)
known_words.append(theWord)
totalWordSet.discard(theWord)
remove_partial_verses(known_words, verseDict)
for word in learn_order[::-1]:
print "learn ", word