After training a sklearn.BernoulliNB classifier on a corpus, I'm getting sporadic errors when trying to predict labels for feature sets with the stored (pickled) classifier:
feats = {'and': True, (',', 'clean'): True, ('clean', 'and'): True, 'good': True, ('friendly', 'staff'): True, ',': True, '.': True, 'gyros': True, 'clean': True, ('gyros', ','): True, ('good', 'gyros'): True, ('and', 'friendly'): True, 'friendly': True, ('staff', '.'): True, 'staff': True}
clf = pickle.load(open('saved_classifier.pickle'))
p = clf.prob_classify(feats)
The above works. However, with a different feature set such as:
feats = {'and': True, 'fresh': True, ('fresh', 'and'): True, 'inexpensive': True, ('and', 'inexpensive'): True}
clf.prob_classify(feats) raises a TypeError. Here's the full traceback:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-184-86c30997b740> in <module>()
----> 1 p = clf.prob_classify(feats)
2 p.prob('pos')
/Library/Python/2.7/site-packages/nltk/classify/api.pyc in prob_classify(self, featureset)
63 """
64 if overridden(self.batch_prob_classify):
---> 65 return self.batch_prob_classify([featureset])[0]
66 else:
67 raise NotImplementedError()
/Library/Python/2.7/site-packages/nltk/classify/scikitlearn.pyc in batch_prob_classify(self, featuresets)
71 def batch_prob_classify(self, featuresets):
72 X = self._convert(featuresets)
---> 73 y_proba = self._clf.predict_proba(X)
74 return [self._make_probdist(y_proba[i]) for i in xrange(len(y_proba))]
75
/Library/Python/2.7/site-packages/scikit_learn-0.14_git-py2.7-macosx-10.8-intel.egg/sklearn/pipeline.pyc in predict_proba(self, X)
154 for name, transform in self.steps[:-1]:
155 Xt = transform.transform(Xt)
--> 156 return self.steps[-1][-1].predict_proba(Xt)
157
158 def decision_function(self, X):
/Library/Python/2.7/site-packages/scikit_learn-0.14_git-py2.7-macosx-10.8-intel.egg/sklearn/naive_bayes.pyc in predict_proba(self, X)
96 the model, where classes are ordered arithmetically.
97 """
---> 98 return np.exp(self.predict_log_proba(X))
99
100
/Library/Python/2.7/site-packages/scikit_learn-0.14_git-py2.7-macosx-10.8-intel.egg/sklearn/naive_bayes.pyc in predict_log_proba(self, X)
77 in the model, where classes are ordered arithmetically.
78 """
---> 79 jll = self._joint_log_likelihood(X)
80 # normalize by P(x) = P(f_1, ..., f_n)
81 log_prob_x = logsumexp(jll, axis=1)
/Library/Python/2.7/site-packages/scikit_learn-0.14_git-py2.7-macosx-10.8-intel.egg/sklearn/naive_bayes.pyc in _joint_log_likelihood(self, X)
433
434 if self.binarize is not None:
--> 435 X = binarize(X, threshold=self.binarize)
436
437 n_classes, n_features = self.feature_log_prob_.shape
/Library/Python/2.7/site-packages/scikit_learn-0.14_git-py2.7-macosx-10.8-intel.egg/sklearn/preprocessing.pyc in binarize(X, threshold, copy)
537 X.data[cond] = 1
538 X.data[not_cond] = 0
--> 539 X.eliminate_zeros()
540 else:
541 cond = X > threshold
/Library/Python/2.7/site-packages/scipy-0.13.0.dev_c31f167_20130307-py2.7-macosx-10.8-intel.egg/scipy/sparse/compressed.pyc in eliminate_zeros(self)
572 fn = sparsetools.csr_eliminate_zeros
573 M,N = self._swap(self.shape)
--> 574 fn( M, N, self.indptr, self.indices, self.data)
575
576 self.prune() #nnz may have changed
/Library/Python/2.7/site-packages/scipy-0.13.0.dev_c31f167_20130307-py2.7-macosx-10.8-intel.egg/scipy/sparse/sparsetools/csr.pyc in csr_eliminate_zeros(*args)
565 csr_eliminate_zeros(int n_row, int n_col, int Ap, int Aj, npy_clongdouble_wrapper Ax)
566 """
--> 567 return _csr.csr_eliminate_zeros(*args)
568
569 def csr_sum_duplicates(*args):
TypeError: Array of type 'byte' required. Array of type 'bool' given