    # exercise 3.1.4
    
    import importlib_resources
    
    
    # We'll use a widely used stemmer based on:
    # Porter, M. “An algorithm for suffix stripping.” Program 14.3 (1980): 130-137.
    # The stemmer is implemented in the most widely used natural language
    # processing package in Python, the "Natural Language Toolkit" (NLTK):
    from nltk.stem import PorterStemmer
    
    from sklearn.feature_extraction.text import CountVectorizer
    
    filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
    
    filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt")
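    # (importlib_resources.files(...) resolves these paths to data files that
    # ship inside the dtuimldmtools package.)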
    
    # As before, load the corpus and preprocess:
    with open(filename_docs, "r") as f:
        raw_file = f.read()
    
    corpus = raw_file.split("\n")
    
    corpus = list(filter(None, corpus))
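    # (filter(None, ...) removes the empty strings that split("\n") leaves
    # behind, e.g. after a trailing newline.)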
    
    
    # Load and process the stop words in a similar manner:
    with open(filename_stop, "r") as f:
        raw_file = f.read()
    
    stopwords = raw_file.split("\n")
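    # (Unlike the corpus, the stop-word list is not filtered for empty
    # entries; a trailing "" is harmless here, since the token pattern below
    # never produces an empty token.)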
    
    
    # To enable stemming when using the sklearn module, we need to pass an
    # "analyzer" to the vectorizer we've been using.
    
    # First, we make an object based on the PorterStemmer class, and we also make
    # an analyzer object:
    stemmer = PorterStemmer()
    
    analyzer = CountVectorizer(
        token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords
    ).build_analyzer()
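    # build_analyzer() returns a callable that applies this vectorizer's
    # preprocessing, tokenization, and stop-word filtering; the function below
    # only has to add stemming on top of it.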
    
    
    
    # Using these we'll make a function that can stem words:
    def stemmed_words(doc):
        return (stemmer.stem(w) for w in analyzer(doc))
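
    # For intuition (an illustrative aside, not part of the original exercise):
    # the Porter stemmer collapses inflected forms onto a common stem, e.g.
    # stemmer.stem("connection") and stemmer.stem("connected") both return
    # "connect".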
    
    # ... and finally, we make a vectorizer just like we've done before:
    
    vectorizer = CountVectorizer(analyzer=stemmed_words)
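    # When a callable is passed as the analyzer, CountVectorizer hands each raw
    # document to it and uses the tokens it returns; that is why the stop words
    # and token pattern were baked into the analyzer object above rather than
    # passed here.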
    
    
    # Determine the terms:
    vectorizer.fit(corpus)
    
    attributeNames = vectorizer.get_feature_names_out()
    
    
    # ... and count the occurrences:
    X = vectorizer.transform(corpus)
    
    N, M = X.shape
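    # (X is a SciPy sparse matrix at this point; it is converted to a dense
    # NumPy array below so the document-term matrix prints in full.)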
    
    X = X.toarray()
    
    # Display the result
    
    print("Document-term matrix analysis (using stop words and stemming)")
    
    print()
    
    print("Number of documents (data objects, N):\t %i" % N)
    print("Number of terms (attributes, M):\t %i" % M)
    
    print()
    
    print("Found terms (no stop words, stemmed):")
    
    print(attributeNames)
    print()
    
    print("Document-term matrix:")
    
    print(X)
    print()
    
    print("Ran Exercise 3.1.4")
    print()