Hi, I'm new to scikit-learn and Python (though not to programming) and am working my way through the examples. Aim: train a model on textual data and use the trained model to classify individual text files. Issue: I end up with Unicode errors: UnicodeDecodeError: 'utf8' codec can't decode byte 0xaa in position 680: invalid start byte. What am I doing wrong? I've loaded up the training data, but there's obviously something wrong I'm doing while attempting to vectorize it.
Thanks Vinay Output And Error: .... Number of files 2034 Categories trained: ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc'] Traceback (most recent call last): <============================================================FAIL File "/home/vinayb/python/HelloPython/abcd/TestPython.py", line 76, in <module> X = vectorizer.fit_transform(twenty_train.data) File "/usr/local/lib/python2.7/dist-packages/sklearn/feature_extraction/text.py", line 1156, in fit_transform X = super(TfidfVectorizer, self).fit_transform(raw_documents) File "/usr/local/lib/python2.7/dist-packages/sklearn/feature_extraction/text.py", line 691, in fit_transform term_count_current = Counter(analyze(doc)) File "/usr/local/lib/python2.7/dist-packages/sklearn/feature_extraction/text.py", line 224, in <lambda> tokenize(preprocess(self.decode(doc))), stop_words) File "/usr/local/lib/python2.7/dist-packages/sklearn/feature_extraction/text.py", line 106, in decode doc = doc.decode(self.charset, self.charset_error) File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode return codecs.utf_8_decode(input, errors, True) UnicodeDecodeError: 'utf8' codec can't decode byte 0xaa in position 680: invalid start byte ################################## import logging import numpy as np from optparse import OptionParser import sys from time import time import pylab as pl from sklearn.datasets import fetch_20newsgroups from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import HashingVectorizer #from sklearn.feature_extraction.text import RomanPreprocessor #from sklearn.feature_extraction.text import WordNGramAnalyzer from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_selection import SelectKBest, chi2 from sklearn.linear_model import RidgeClassifier from sklearn.svm import LinearSVC from sklearn.linear_model import SGDClassifier from sklearn.linear_model import Perceptron from sklearn.linear_model import 
PassiveAggressiveClassifier from sklearn.naive_bayes import BernoulliNB, MultinomialNB from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors import NearestCentroid from sklearn.utils.extmath import density from sklearn import metrics from sklearn.datasets import load_files #options op = OptionParser() op.add_option("--no-minibatch", action="store_false", dest="minibatch", default=True, help="Use ordinary k-means algorithm (in batch mode).") op.add_option("--no-idf", action="store_false", dest="use_idf", default=True, help="Disable Inverse Document Frequency feature weighting.") op.add_option("--use-hashing", action="store_true", default=False, help="Use a hashing feature vectorizer") op.add_option("--n-features", type=int, default=10000, help="Maximum number of features (dimensions)" "to extract from text.") print __doc__ op.print_help() (opts, args) = op.parse_args() if len(args) > 0: op.error("this script takes no arguments.") sys.exit(1) categories = [ 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', ] twenty_train = load_files('/home/vinayb/data/20news/20news-bydate-train', categories=categories, shuffle=True,random_state=42) print "Number of files " + str(len(twenty_train.filenames)) print "Categories trained: " + str(twenty_train.target_names) #why doesnt this work #count_vect = CountVectorizer() #X_train_counts = count_vect.fit_transform(twenty_train.data) #<===================================== FAIL #print "Shape " + str(X_train_counts.shape) #also, why doesnt this work vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features, stop_words='english', use_idf=opts.use_idf) X = vectorizer.fit_transform(twenty_train.data) # <=========================== FAIL ------------------------------------------------------------------------------ Everyone hates slow websites. So do we. 
Make your web apps faster with AppDynamics Download AppDynamics Lite for free today: http://p.sf.net/sfu/appdyn_d2d_jan _______________________________________________ Scikit-learn-general mailing list Scikit-learn-general@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/scikit-learn-general