From d8181339aaa4f268d05108a348d7dd18c2fc29c2 Mon Sep 17 00:00:00 2001 From: re4lfl0w Date: Tue, 27 Jan 2015 10:05:21 +0900 Subject: [PATCH 1/2] Fix UnicodeDecodeError by decoding text as UTF-8 --- ch05/classify.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ch05/classify.py b/ch05/classify.py index ef5af419..26f3731a 100644 --- a/ch05/classify.py +++ b/ch05/classify.py @@ -54,6 +54,7 @@ def prepare_sent_features(): if not text: meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0 else: + text = text.decode('utf-8') sent_lens = [len(nltk.word_tokenize( sent)) for sent in nltk.sent_tokenize(text)] meta[pid]['AvgSentLen'] = np.mean(sent_lens) From 4de7593567e92c28f4e07c8fcf54ccf4135bab8e Mon Sep 17 00:00:00 2001 From: re4lfl0w Date: Wed, 28 Jan 2015 06:21:37 +0900 Subject: [PATCH 2/2] Decode text as UTF-8 only on Python 2.x --- ch05/classify.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ch05/classify.py b/ch05/classify.py index 26f3731a..76699819 100644 --- a/ch05/classify.py +++ b/ch05/classify.py @@ -54,7 +54,9 @@ def prepare_sent_features(): if not text: meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0 else: - text = text.decode('utf-8') + from platform import python_version + if python_version().startswith('2'): + text = text.decode('utf-8') sent_lens = [len(nltk.word_tokenize( sent)) for sent in nltk.sent_tokenize(text)] meta[pid]['AvgSentLen'] = np.mean(sent_lens)