# We'll use a widely used stemmer based on:
# Porter, M. “An algorithm for suffix stripping.” Program 14.3 (1980): 130-137.
# The stemmer is implemented in the most widely used natural language
# processing package in Python, the "Natural Language Toolkit" (NLTK):
import importlib_resources
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt")
# As before, load the corpus and preprocess:
with open(filename_docs, "r") as f:
    raw_file = f.read()
# Assuming (as in the toolbox data file) one document per line:
corpus = raw_file.split("\n")
corpus = list(filter(None, corpus))  # drop empty lines
# Load and process the stop words in a similar manner:
with open(filename_stop, "r") as f:
    raw_file = f.read()
# Assuming the stop words are stored as a comma-separated list:
stopwords = raw_file.split(",")
# To enable stemming when using the sklearn module, we need to pass an
# "analyzer" to the vectorizer we've been using.
# First, we make an object based on the PorterStemmer class, and we also make
# an analyzer object:
stemmer = PorterStemmer()
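# Quick sanity check (an illustrative aside, not part of the original
# exercise): the Porter stemmer maps inflected forms to a common stem, e.g.:
print(stemmer.stem("running"))   # -> "run"
print(stemmer.stem("stemming"))  # -> "stem"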
analyzer = CountVectorizer(
token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords
).build_analyzer()
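# A brief hedged illustration of what the analyzer alone does: it lowercases,
# tokenizes on the word pattern above, and drops stop words (the exact output
# depends on the loaded stop word list):
print(analyzer("Documents containing stemmed words"))
# e.g. ['documents', 'containing', 'stemmed', 'words'] -- no stemming yet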
# Using these we'll make a function that can stem words:
def stemmed_words(doc):
    # Tokenize/filter with the analyzer, then stem each remaining token:
    return (stemmer.stem(w) for w in analyzer(doc))
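# Combining the two steps (again an illustrative aside; the exact output
# depends on the loaded stop word list), the generator yields stemmed,
# stop-word-filtered tokens:
print(list(stemmed_words("Documents containing stemmed words")))
# e.g. ['document', 'contain', 'stem', 'word']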
# ... and finally, we make a vectorizer just like we've done before,
# now using the stemming analyzer defined above:
vectorizer = CountVectorizer(analyzer=stemmed_words)
vectorizer.fit(corpus)
# ... and count the occurrences:
X = vectorizer.transform(corpus)
print("Document-term matrix analysis (using stop words and stemming)")
print("Number of documents (data objects, N):\t %i" % N)
print("Number of terms (attributes, M):\t %i" % M)