From 6f8ffe4d843c794bd940c4e08aa8b162efe6abc9 Mon Sep 17 00:00:00 2001 From: mcoth <mcoth@dtu.dk> Date: Sun, 17 Nov 2024 16:47:41 +0100 Subject: [PATCH] Upload New File --- Perform_pca.py | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 Perform_pca.py diff --git a/Perform_pca.py b/Perform_pca.py new file mode 100644 index 0000000..8b5ccc0 --- /dev/null +++ b/Perform_pca.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sun Sep 1 15:27:56 2024 + +@author: Maya Coulson Theodorsen (mcoth@dtu.dk) + + +This script performs Principal Component Analysis (PCA) on standardized data. + +It includes: +- Bartlett's test for sphericity to check PCA suitability. +- Kaiser-Meyer-Olkin (KMO) measure for sampling adequacy. +- PCA scree plot visualization and explained variance analysis. +- Calculation of PCA loadings and transformed data. + +Functions: +- perform_pca: Performs PCA and diagnostic tests. + +Returns: +- pca (PCA fitted model) +- loadings (PCA loadings for each feature and component) +- principleComponents + +""" +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from sklearn.decomposition import PCA +from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo + +def perform_pca(std_data, PCAcolumns, columnNames): + + # Bartlett sphericity to check if suitable for PCA + chi_square_value, p_value = calculate_bartlett_sphericity(std_data) + print("Bartlett's sphericity chi-square:", chi_square_value) + print(f"p_value: {p_value:.30f}") + + + # Kaiser-Meyer-Olkin(KMO) Test for sampling accuracy (0.8-1.0 is excellent) + # Test if data is appropriate for FA/PCA + kmo_all, kmo_model = calculate_kmo(std_data) + print("KMO model:", kmo_model) + + + # PCA (note: variance in eigenvalues, not percentage) + pca = PCA() + principleComponents = pd.DataFrame(pca.fit_transform(std_data)) + + # Bar plot of the variances of PCA features + features = range(pca.n_components_) + plt.subplots(figsize=(20,15)) + plt.bar(features, pca.explained_variance_) + plt.xticks(features) + plt.ylabel('Eigenvalue', fontsize=16) + plt.xlabel('PCA feature', fontsize=16) + plt.axhline(y=1, linewidth=2, color='r') + plt.title('Scree plot', fontsize=30) + plt.show() + + # Display the actual amount of explained variance per component + print('PCA explained variance:') + print(pca.explained_variance_) + + # Print the ratios of percentage explained by each feature + print('PCA explained variance ratio:') + print(pca.explained_variance_ratio_) + + # Cumulative summation of the ratio explained variance of each feature + print('PCA explained variance ratio cumulative summation:') + print(pca.explained_variance_ratio_.cumsum()) + + # Loadings for each PC + loadings = pd.DataFrame(pca.components_.T * np.sqrt(pca.explained_variance_)) + loadings.columns = PCAcolumns + loadings.index = [columnNames] + + return pca, loadings, principleComponents \ No newline at end of file -- GitLab