Skip to content
Snippets Groups Projects
ex3_1_3.py 1.3 KiB
Newer Older
  • Learn to ignore specific revisions
  • bjje's avatar
    bjje committed
    # exercise 3.1.3
    
    Stas Syrota's avatar
    Stas Syrota committed
    import importlib_resources
    
    bjje's avatar
    bjje committed
    from sklearn.feature_extraction.text import CountVectorizer
    
    
    Stas Syrota's avatar
    Stas Syrota committed
    # Locate the data files shipped inside the dtuimldmtools package.
    filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
    filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt")

    # As before, load the corpus and preprocess:
    # Open via the resource object itself rather than the builtin open(), so
    # the read does not require the resource to be a plain file on disk, and
    # pin the encoding so decoding does not depend on the machine's locale.
    with filename_docs.open("r", encoding="utf-8") as f:
        raw_file = f.read()
    # Split on newlines and drop the empty strings left by blank lines.
    corpus = [line for line in raw_file.split("\n") if line]
    
    # Load and process the stop words in a similar manner:
    # Open via the resource object with an explicit encoding so the read is
    # independent of the platform's default text encoding.
    with filename_stop.open("r", encoding="utf-8") as f:
        raw_file = f.read()
    # Split on newlines and, as for the corpus, discard empty entries so that
    # a trailing newline in the file does not add "" to the stop-word list.
    stopwords = [word for word in raw_file.split("\n") if word]
    
    bjje's avatar
    bjje committed
    
    # When making the CountVectorizer, we now input the stop words:
    # The token pattern matches runs of word characters that are not digits.
    vectorizer = CountVectorizer(
        stop_words=stopwords,
        token_pattern=r"\b[^\d\W]+\b",
    )
    # Determine the terms in the corpus, then count the frequency of each
    # term within a document using the fitted vectorizer:
    X = vectorizer.fit(corpus).transform(corpus)
    attributeNames = vectorizer.get_feature_names_out()
    # N is reported below as the number of documents, M as the number of terms.
    N = X.shape[0]
    M = X.shape[1]
    
    bjje's avatar
    bjje committed
    
    # Display the result; each section ends with one blank line.
    print("Document-term matrix analysis (using stop words)", end="\n\n")

    # Matrix dimensions.
    print("Number of documents (data objects, N):\t %i" % N)
    print("Number of terms (attributes, M):\t %i" % M, end="\n\n")

    # Terms found by the vectorizer.
    print("Found terms (no stop words):")
    print(attributeNames, end="\n\n")

    # Dense view of the count matrix.
    print("Document-term matrix:")
    print(X.toarray(), end="\n\n")

    print("Ran Exercise 3.1.3")