Skip to content
Snippets Groups Projects
Perform_pca.py 2.53 KiB
Newer Older
  • Learn to ignore specific revisions
  • mcoth's avatar
    mcoth committed
    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    """
    Created on Sun Sep  1 15:27:56 2024
    
    @author: Maya Coulson Theodorsen (mcoth@dtu.dk)
    
    
    This script performs Principal Component Analysis (PCA) on standardized data.
    
    It includes:
    - Bartlett's test for sphericity to check PCA suitability.
    - Kaiser-Meyer-Olkin (KMO) measure for sampling adequacy.
    - PCA scree plot visualization and explained variance analysis.
    - Calculation of PCA loadings and transformed data.
    
    Functions:
    - perform_pca: Performs PCA and diagnostic tests.
    
    Returns:
    - pca (PCA fitted model)
    - loadings (PCA loadings for each feature and component)
    - principleComponents 
    
    """
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA
    from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
    
    def perform_pca(std_data, PCAcolumns, columnNames):
    
        # Bartlett sphericity to check if suitable for PCA    
        chi_square_value, p_value = calculate_bartlett_sphericity(std_data)
        print("Bartlett's sphericity chi-square:", chi_square_value)
        print(f"p_value: {p_value:.30f}")
    
    
        # Kaiser-Meyer-Olkin(KMO) Test for sampling accuracy (0.8-1.0 is excellent)
        # Test if data is appropriate for FA/PCA
        kmo_all, kmo_model = calculate_kmo(std_data)
        print("KMO model:", kmo_model)
    
    
        # PCA (note: variance in eigenvalues, not percentage)
        pca = PCA()
        principleComponents = pd.DataFrame(pca.fit_transform(std_data))
        
        # Bar plot of the variances of PCA features
        features = range(pca.n_components_)
        plt.subplots(figsize=(20,15)) 
        plt.bar(features, pca.explained_variance_)
        plt.xticks(features)
        plt.ylabel('Eigenvalue', fontsize=16)
        plt.xlabel('PCA feature', fontsize=16)
        plt.axhline(y=1, linewidth=2, color='r')
        plt.title('Scree plot', fontsize=30)
        plt.show()
    
        # Display the actual amount of explained variance per component 
        print('PCA explained variance:')
        print(pca.explained_variance_)
        
        # Print the ratios of percentage explained by each feature
        print('PCA explained variance ratio:')
        print(pca.explained_variance_ratio_)
    
        # Cumulative summation of the ratio explained variance of each feature
        print('PCA explained variance ratio cumulative summation:')
        print(pca.explained_variance_ratio_.cumsum())
        
        # Loadings for each PC
        loadings = pd.DataFrame(pca.components_.T * np.sqrt(pca.explained_variance_))
        loadings.columns = PCAcolumns
        loadings.index = [columnNames]
        
        return pca, loadings, principleComponents