Skip to content
Snippets Groups Projects
Commit 6f8ffe4d authored by mcoth's avatar mcoth
Browse files

Upload New File

parent ee497bbf
No related branches found
No related tags found
No related merge requests found
Pipeline #38564 passed with warnings
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 1 15:27:56 2024
@author: Maya Coulson Theodorsen (mcoth@dtu.dk)
This script performs Principal Component Analysis (PCA) on standardized data.
It includes:
- Bartlett's test for sphericity to check PCA suitability.
- Kaiser-Meyer-Olkin (KMO) measure for sampling adequacy.
- PCA scree plot visualization and explained variance analysis.
- Calculation of PCA loadings and transformed data.
Functions:
- perform_pca: Performs PCA and diagnostic tests.
Returns:
- pca (PCA fitted model)
- loadings (PCA loadings for each feature and component)
- principleComponents
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
def perform_pca(std_data, PCAcolumns, columnNames):
# Bartlett sphericity to check if suitable for PCA
chi_square_value, p_value = calculate_bartlett_sphericity(std_data)
print("Bartlett's sphericity chi-square:", chi_square_value)
print(f"p_value: {p_value:.30f}")
# Kaiser-Meyer-Olkin(KMO) Test for sampling accuracy (0.8-1.0 is excellent)
# Test if data is appropriate for FA/PCA
kmo_all, kmo_model = calculate_kmo(std_data)
print("KMO model:", kmo_model)
# PCA (note: variance in eigenvalues, not percentage)
pca = PCA()
principleComponents = pd.DataFrame(pca.fit_transform(std_data))
# Bar plot of the variances of PCA features
features = range(pca.n_components_)
plt.subplots(figsize=(20,15))
plt.bar(features, pca.explained_variance_)
plt.xticks(features)
plt.ylabel('Eigenvalue', fontsize=16)
plt.xlabel('PCA feature', fontsize=16)
plt.axhline(y=1, linewidth=2, color='r')
plt.title('Scree plot', fontsize=30)
plt.show()
# Display the actual amount of explained variance per component
print('PCA explained variance:')
print(pca.explained_variance_)
# Print the ratios of percentage explained by each feature
print('PCA explained variance ratio:')
print(pca.explained_variance_ratio_)
# Cumulative summation of the ratio explained variance of each feature
print('PCA explained variance ratio cumulative summation:')
print(pca.explained_variance_ratio_.cumsum())
# Loadings for each PC
loadings = pd.DataFrame(pca.components_.T * np.sqrt(pca.explained_variance_))
loadings.columns = PCAcolumns
loadings.index = [columnNames]
return pca, loadings, principleComponents
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment