import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Load the iris dataset shipped with scikit-learn and pull out the feature
# matrix and the class labels.
iris = load_iris()
X, y = iris.data, iris.target

# Standardize the features (StandardScaler's defaults: zero mean, unit
# variance per column) so no single feature dominates the PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit PCA with every component kept, so the full variance spectrum is
# available for inspection.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Per-component share of the variance, and its running total.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance_ratio.cumsum()
# Plot the cumulative explained variance against the number of components
# kept (1-based on the x axis).
component_counts = range(1, len(explained_variance_ratio) + 1)

plt.figure(figsize=(8, 6))
plt.plot(component_counts, cumulative_explained_variance, marker="o", linestyle="--")
plt.title("Explained Variance Ratio")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.grid(True)
plt.show()
# Fraction of the total variance that the retained components must explain.
# Kept in one place so the selection rule and the printed message cannot drift.
VARIANCE_THRESHOLD = 0.95

# Choose the smallest number of leading components whose cumulative explained
# variance reaches the threshold.
reaches_threshold = cumulative_explained_variance >= VARIANCE_THRESHOLD
if reaches_threshold.any():
    # argmax on a boolean array yields the index of the first True; +1 turns
    # the 0-based index into a component count. int() gives a plain Python int.
    n_components = int(np.argmax(reaches_threshold)) + 1
else:
    # argmax returns 0 when nothing is True, which would silently report a
    # single component; keep every component instead.
    n_components = len(cumulative_explained_variance)
print(
    f"Number of components to explain {VARIANCE_THRESHOLD:.0%} variance:",
    n_components,
)

# Refit PCA restricted to the selected number of components and project the
# standardized data onto them.
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

# Tabulate the projection with columns PC1..PCn plus the class label of each
# sample in a "Target" column.
df_pca = pd.DataFrame(
    data=X_pca,
    columns=[f"PC{i}" for i in range(1, n_components + 1)],
)
df_pca["Target"] = y
# Scatter plot of the projected samples on the first two principal components,
# drawn as one scatter series (and one legend entry) per distinct value in y.
plt.figure(figsize=(8, 6))

# When only one component was retained there is no "PC2" column; plot PC1
# against a zero baseline instead of failing with a KeyError.
has_second_component = "PC2" in df_pca.columns

for target in np.unique(y):
    # Boolean mask selecting the rows that belong to the current class.
    indices = df_pca["Target"] == target
    first_component = df_pca.loc[indices, "PC1"]
    if has_second_component:
        second_component = df_pca.loc[indices, "PC2"]
    else:
        second_component = np.zeros(len(first_component))
    plt.scatter(first_component, second_component, label=f"Target {target}")

plt.xlabel("Principal Component 1")
# Leave the y axis unlabelled when it is only the zero baseline.
plt.ylabel("Principal Component 2" if has_second_component else "")
plt.title("PCA of Iris Dataset")
plt.legend()
plt.grid(True)
plt.show()