#!/usr/bin/env python3
"""
ENLP A1 Part I: Naive Bayes

(Adapted from Alan Ritter)
"""

import sys, os, glob

import numpy as np
from collections import Counter
from math import log

from nltk.stem.wordnet import WordNetLemmatizer

from evaluation import Eval


def load_docs(direc, lemmatize, labelMapFile='labels.csv'):
    """Return a list of word-token-lists, one per document.
    Words are optionally lemmatized with WordNet."""

    labelMap = {}   # docID => gold label, loaded from mapping file
    with open(os.path.join(direc, labelMapFile)) as inF:
        for ln in inF:
            docid, label = ln.strip().split(',')
            assert docid not in labelMap
            labelMap[docid] = label

    # create parallel lists of documents and labels
    docs, labels = [], []
    for file_path in glob.glob(os.path.join(direc, '*.txt')):
        filename = os.path.basename(file_path)
        # open the file at file_path, construct a list of its word tokens,
        # and append that list to 'docs'.
        # look up the document's label and append it to 'labels'.
        ...

    return docs, labels


class NaiveBayes:
    def __init__(self, train_docs, train_labels, ALPHA=1.0):
        # list of native language codes in the corpus
        self.CLASSES = ['ARA', 'DEU', 'FRA', 'HIN', 'ITA', 'JPN', 'KOR', 'SPA', 'TEL', 'TUR', 'ZHO']
        self.ALPHA = ALPHA
        self.priorProbs = {l: 0 for l in self.CLASSES}
        self.likelihoodProbs = {l: Counter() for l in self.CLASSES}
        self.trainVocab = set()
        self.learn(train_docs, train_labels, alpha=self.ALPHA)

    def learn(self, docs, labels, alpha=1.0):
        """Estimate parameters for a naive Bayes bag-of-words model with the
        given training data and amount of add-alpha smoothing."""
        assert len(docs)==len(labels)
        labelCounts = {l: 0 for l in self.CLASSES}
        wordCounts = {l: Counter() for l in self.CLASSES}
        totalWordCounts = {l: 0 for l in self.CLASSES}

        # iterate over documents in order to record
        # count(y) in labelCounts, count(y,word) in wordCounts,
        # count(y,w) for all words in totalWordCounts,
        # and to store the training vocabulary in self.trainVocab
        ...

        # compute and store prior distribution over classes
        # (unsmoothed) in self.priorProbs
        ...

        # compute and store p(w|y), with add-alpha smoothing,
        # in self.likelihoodProbs. Add '**OOV**' as a pseudo-word
        # for out-of-vocabulary items (but do not include it in self.trainVocab).
        ...

        # Sanity checks--do not modify
        assert len(self.priorProbs)==len(self.likelihoodProbs)==len(self.CLASSES)>2
        assert .999 < sum(self.priorProbs.values()) < 1.001
        for y in self.CLASSES:
            assert .999 < sum(self.likelihoodProbs[y].values()) < 1.001,sum(self.likelihoodProbs[y].values())
            assert 0 <= self.likelihoodProbs[y]['**OOV**'] < 1.0,self.likelihoodProbs[y]['**OOV**']

    def joint_prob(self, doc, y):
        # compute the log of the joint probability of the document and the class,
        # i.e., return p(y)*p(w1|y)*p(w2|y)*... (but in log domain)
        # should not make any changes to the model parameters
        ...
        return ...

    def predict(self, doc):
        # apply Bayes' rule: return the class that maximizes the
        # prior * likelihood probability of the test document
        # should not make any changes to the model parameters
        ...
        return ...
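
    # --- Hedged illustration (not part of the original skeleton) ---
    # The stub methods above are left as '...' for the assignment. Purely as a
    # non-authoritative sketch: assuming learn() fills self.priorProbs,
    # self.likelihoodProbs (with '**OOV**' absorbing unseen words), and
    # self.trainVocab, a common add-alpha estimate consistent with the sanity
    # checks is
    #
    #     p(w|y) = (count(y,w) + alpha) / (totalWordCount(y) + alpha*(|trainVocab| + 1))
    #
    # where the "+ 1" accounts for the '**OOV**' pseudo-word. Under those
    # assumptions, joint_prob() could accumulate log probabilities along these lines:
    #
    #     logprob = log(self.priorProbs[y])
    #     for word in doc:
    #         if word not in self.trainVocab:
    #             word = '**OOV**'
    #         logprob += log(self.likelihoodProbs[y][word])
    #     return logprob
    #
    # and predict() could then take the argmax over classes, e.g.
    # max(self.CLASSES, key=lambda y: self.joint_prob(doc, y)).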
    def eval(self, test_docs, test_labels):
        """Evaluates performance on the given evaluation data."""
        assert len(test_docs)==len(test_labels)
        preds = []  # predicted labels
        for doc, y_gold in zip(test_docs, test_labels):
            y_pred = self.predict(doc)
            preds.append(y_pred)
        ev = Eval(test_labels, preds)
        return ev.accuracy()


if __name__ == "__main__":
    args = sys.argv[1:]
    lemmatize = False
    if args[0]=='-l':
        lemmatize = True
        args = args[1:]
    alpha = float(args[0])

    train_docs, train_labels = load_docs('train', lemmatize)
    print(len(train_docs), 'training docs with',
          sum(len(d) for d in train_docs), 'tokens', file=sys.stderr)

    test_docs, test_labels = load_docs('dev', lemmatize)
    print(len(test_docs), 'eval docs with',
          sum(len(d) for d in test_docs), 'tokens', file=sys.stderr)

    nb = NaiveBayes(train_docs, train_labels, alpha)
    print(nb.eval(test_docs, test_labels))
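
# Example invocation (illustrative only; the script filename 'naivebayes.py' is an
# assumption, not given in the skeleton). The script expects 'train' and 'dev'
# directories, each containing a labels.csv mapping and the *.txt documents, plus
# an evaluation.py module providing the Eval class imported above:
#
#     python naivebayes.py 1.0       # add-1 smoothing
#     python naivebayes.py -l 0.1    # lemmatize tokens, add-0.1 smoothing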