#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Sun Sep 1 15:27:56 2024 @author: Maya Coulson Theodorsen (mcoth@dtu.dk) This script performs Principal Component Analysis (PCA) on standardized data. It includes: - Bartlett's test for sphericity to check PCA suitability. - Kaiser-Meyer-Olkin (KMO) measure for sampling adequacy. - PCA scree plot visualization and explained variance analysis. - Calculation of PCA loadings and transformed data. Functions: - perform_pca: Performs PCA and diagnostic tests. Returns: - pca (PCA fitted model) - loadings (PCA loadings for each feature and component) - principleComponents """ import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.decomposition import PCA from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo def perform_pca(std_data, PCAcolumns, columnNames): # Bartlett sphericity to check if suitable for PCA chi_square_value, p_value = calculate_bartlett_sphericity(std_data) print("Bartlett's sphericity chi-square:", chi_square_value) print(f"p_value: {p_value:.30f}") # Kaiser-Meyer-Olkin(KMO) Test for sampling accuracy (0.8-1.0 is excellent) # Test if data is appropriate for FA/PCA kmo_all, kmo_model = calculate_kmo(std_data) print("KMO model:", kmo_model) # PCA (note: variance in eigenvalues, not percentage) pca = PCA() principleComponents = pd.DataFrame(pca.fit_transform(std_data)) # Bar plot of the variances of PCA features features = range(pca.n_components_) plt.subplots(figsize=(20,15)) plt.bar(features, pca.explained_variance_) plt.xticks(features) plt.ylabel('Eigenvalue', fontsize=16) plt.xlabel('PCA feature', fontsize=16) plt.axhline(y=1, linewidth=2, color='r') plt.title('Scree plot', fontsize=30) plt.show() # Display the actual amount of explained variance per component print('PCA explained variance:') print(pca.explained_variance_) # Print the ratios of percentage explained by each feature print('PCA explained variance ratio:') print(pca.explained_variance_ratio_) # Cumulative summation of the ratio explained variance of each feature print('PCA explained variance ratio cumulative summation:') print(pca.explained_variance_ratio_.cumsum()) # Loadings for each PC loadings = pd.DataFrame(pca.components_.T * np.sqrt(pca.explained_variance_)) loadings.columns = PCAcolumns loadings.index = [columnNames] return pca, loadings, principleComponents