import nltk
import re
import random
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import BracketParseCorpusReader, PropbankCorpusReader

if False:
    # Use the full combined treebank data.
    treebank = LazyCorpusLoader(
        'treebank/combined', BracketParseCorpusReader,
        r'wsj/\d\d/wsj_.*\.mrg')
    # The frame-file pattern is a raw string so that `\.` reaches the regex
    # engine unchanged instead of being an invalid string escape.
    propbank = LazyCorpusLoader(
        'propbank', PropbankCorpusReader,
        'prop.txt', r'frames/.*\.xml', 'verbs.txt',
        None, treebank)
else:
    # Use the partial treebank corpus that ships with the NLTK.
    from nltk.corpus import treebank, propbank


def split_files(files, holdout):
    """Deterministically split FILES into (training, held-out) lists.

    FILES is copied and shuffled with a fixed seed (12345), so the same
    input always produces the same split.  HOLDOUT is the fraction
    (e.g. 0.05) of the files placed in the second list; the cut point is
    ``int((1.0 - holdout) * number_of_files)``.

    Returns a ``(first_part, second_part)`` tuple of lists.
    """
    shuffled_files = list(files)
    # A private generator yields the same permutation as seeding the
    # module-level one, without resetting the RNG state other code shares.
    random.Random(12345).shuffle(shuffled_files)
    split = int((1.0 - holdout) * len(shuffled_files))
    return (shuffled_files[:split], shuffled_files[split:])


def model(train=None, test=None, holdout=0.05):
    """Run TRAIN and TEST on propbank instances and their corresponding
    trees, holding out a HOLDOUT fraction of the treebank files for testing.

    TRAIN and TEST are optional callables invoked as
    ``callback(instance, tree)``.  TRAIN receives instances whose file is in
    the training split, TEST those whose file is in the held-out split; a
    callback left as None simply skips its instances.  HOLDOUT is a
    fraction between 0 and 1 (0.05 holds out 5% of the files), passed on to
    split_files().

    Iteration stops at the first instance whose file belongs to neither
    split.
    """
    # NOTE(review): `treebank.files()` and `instance.filename` look like the
    # names used by older NLTK releases (newer ones appear to use
    # `fileids()` / `fileid`) -- confirm against the installed version.
    training_list, test_list = split_files(treebank.files(), holdout)
    training_files = set(training_list)
    test_files = set(test_list)

    current_file = None
    current_sentnum = -1
    tree = None
    for instance in propbank.instances():
        # Fetch the tree only when the sentence (or file) changes, and reuse
        # it for consecutive instances of the same sentence.
        if (instance.sentnum != current_sentnum
                or instance.filename != current_file):
            current_sentnum = instance.sentnum
            current_file = instance.filename
            tree = instance.tree

        # Membership is tested separately from whether a callback was given,
        # so a missing callback skips the instance instead of falling
        # through to the `break` below and ending the run early.
        if current_file in training_files:
            if train:
                train(instance, tree)
        elif current_file in test_files:
            if test:
                test(instance, tree)
        else:
            # The file is in neither split: stop iterating.
            break