from sklearn.feature_extraction.text import CountVectorizer
# Unigram bag-of-words walkthrough using CountVectorizer's default settings.
vectorizer = CountVectorizer()
vectorizer  # bare expression: shows the estimator repr in a REPL/notebook

# Tiny example corpus: one string per document.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# Learn the vocabulary from the corpus and build the document-term matrix.
X = vectorizer.fit_transform(corpus)
X

# The analyzer is the callable that turns a raw string into tokens.
# Note the expected token list has no 'a': it is not emitted as a token here.
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.") == (
    ['this', 'is', 'text', 'document', 'to', 'analyze'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new method and fall back to the old
# one so the example keeps working on older releases.  The result is wrapped
# in list() because the newer method returns an array, and comparing an
# array with a list would be element-wise instead of a single bool.
_get_feature_names = (getattr(vectorizer, 'get_feature_names_out', None)
                      or vectorizer.get_feature_names)
list(_get_feature_names()) == (
    ['and', 'document', 'first', 'is', 'one',
     'second', 'the', 'third', 'this'])

# Dense view of the counts: one row per document, one column per feature.
X.toarray()

# Column index assigned to the term 'document' in the fitted vocabulary.
vectorizer.vocabulary_.get('document')

# Transform a document using the already-fitted vocabulary.
# (Fixed typo: the method is toarray(), not toaaray().)
vectorizer.transform(['Something completely new.']).toarray()
# Bigram walkthrough: extract both 1-grams and 2-grams of words.
# The token pattern must end in the word-boundary escape \b; the previous
# r'\b\w+b' only matched tokens ending in a literal letter "b", which leaves
# the vocabulary empty for this corpus.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                    token_pattern=r'\b\w+\b', min_df=1)

# Build the analyzer first, then call it; the previous code called an
# undefined name `bigram` and rebound `analyze` to a comparison result.
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!') == (
    ['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool'])

# Fit on the same corpus and materialise the counts as a dense array.
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2

# Look up the column of the bigram 'is this' and show its count per document.
feature_index = bigram_vectorizer.vocabulary_.get('is this')
X_2[:, feature_index]