This notebook compares three dimensionality reduction methods, PCA, Factor Analysis, and autoencoders, on the MNIST dataset, looking at their 2D embeddings and reconstruction quality.
Notebook Contents
This notebook covers:
Principal Component Analysis (PCA)
Factor Analysis (FA)
Autoencoders (AE)
Use the buttons above to download the notebook or open it in your preferred environment.
Notebook Preview
Comparing PCA, Factor Analysis, and Autoencoders
This notebook demonstrates three popular dimensionality reduction techniques:
PCA (Principal Component Analysis): Linear transformation that maximizes variance
FA (Factor Analysis): Identifies latent factors that explain correlations
AE (Autoencoder): Neural network that learns nonlinear representations
We'll use the MNIST dataset for comparison.
In [ ]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline
In [ ]:
# Load MNIST dataset (using a subset for faster computation)
print("Loading MNIST dataset...")
mnist = fetch_openml('mnist_784', version=1, parser='auto')
X, y = mnist.data[:5000], mnist.target[:5000]

# Convert to numpy arrays and normalize
X = np.array(X, dtype=np.float32) / 255.0
y = np.array(y, dtype=int)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
In [ ]:
# Visualize some examples
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for i, ax in enumerate(axes.flat):
    ax.imshow(X_train[i].reshape(28, 28), cmap='gray')
    ax.set_title(f"Label: {y_train[i]}")
    ax.axis('off')
plt.suptitle('Sample MNIST Digits', fontsize=14)
plt.tight_layout()
plt.show()
PCA finds orthogonal directions of maximum variance in the data.
In [ ]:
# Apply PCA with 2 components for visualization
n_components = 2
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.4f}")
In [ ]:
# Visualize PCA embeddings
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1],
                      c=y_train, cmap='tab10', alpha=0.6, s=10)
plt.colorbar(scatter, label='Digit')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('PCA: 2D Projection of MNIST')
plt.grid(True, alpha=0.3)
plt.show()
In [ ]:
# Reconstruction with PCA (using more components for better reconstruction)
pca_reconstruct = PCA(n_components=50)
X_train_pca_50 = pca_reconstruct.fit_transform(X_train)
X_reconstructed_pca = pca_reconstruct.inverse_transform(X_train_pca_50)

# Calculate reconstruction error
pca_mse = np.mean((X_train - X_reconstructed_pca) ** 2)
print(f"PCA Reconstruction MSE (50 components): {pca_mse:.6f}")
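Factor Analysis models each image as a linear combination of a small number of latent factors plus independent noise. The cell below is a minimal sketch of applying it for a 2D embedding, using the FactorAnalysis class imported above and mirroring the PCA cells; the variable names X_train_fa and X_test_fa are illustrative additions, not part of the original notebook.
In [ ]:
# Factor Analysis with 2 latent factors (illustrative sketch)
fa = FactorAnalysis(n_components=2, random_state=42)
X_train_fa = fa.fit_transform(X_train)
X_test_fa = fa.transform(X_test)

# Visualize FA embeddings, mirroring the PCA scatter above
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_train_fa[:, 0], X_train_fa[:, 1],
                      c=y_train, cmap='tab10', alpha=0.6, s=10)
plt.colorbar(scatter, label='Digit')
plt.xlabel('First Latent Factor')
plt.ylabel('Second Latent Factor')
plt.title('Factor Analysis: 2D Projection of MNIST')
plt.grid(True, alpha=0.3)
plt.show()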
An autoencoder is a neural network that learns a compressed representation through backpropagation.
In [ ]:
# Define the Autoencoder architecture
class Autoencoder(nn.Module):
    def __init__(self, input_dim=784, encoding_dim=2):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, encoding_dim)
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim),
            nn.Sigmoid()  # Output in [0, 1] range
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

    def encode(self, x):
        return self.encoder(x)

# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
autoencoder = Autoencoder(input_dim=784, encoding_dim=2).to(device)
print(autoencoder)
In [ ]:
# Prepare data loaders
X_train_tensor = torch.FloatTensor(X_train)
X_test_tensor = torch.FloatTensor(X_test)
train_dataset = TensorDataset(X_train_tensor, X_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

# Training setup
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

# Training loop
num_epochs = 20
train_losses = []

print("Training Autoencoder...")
for epoch in range(num_epochs):
    autoencoder.train()
    epoch_loss = 0
    for batch_x, _ in train_loader:
        batch_x = batch_x.to(device)

        # Forward pass
        outputs = autoencoder(batch_x)
        loss = criterion(outputs, batch_x)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)
    train_losses.append(avg_loss)

    if (epoch + 1) % 5 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.6f}")

print("Training complete!")
In [ ]:
# Plot training loss
plt.figure(figsize=(10, 5))
plt.plot(train_losses, linewidth=2)
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.title('Autoencoder Training Loss')
plt.grid(True, alpha=0.3)
plt.show()
In [ ]:
# Encode the data
autoencoder.eval()
with torch.no_grad():
    X_train_ae = autoencoder.encode(X_train_tensor.to(device)).cpu().numpy()
    X_test_ae = autoencoder.encode(X_test_tensor.to(device)).cpu().numpy()

# Visualize AE embeddings
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_train_ae[:, 0], X_train_ae[:, 1],
                      c=y_train, cmap='tab10', alpha=0.6, s=10)
plt.colorbar(scatter, label='Digit')
plt.xlabel('First Encoding Dimension')
plt.ylabel('Second Encoding Dimension')
plt.title('Autoencoder: 2D Projection of MNIST')
plt.grid(True, alpha=0.3)
plt.show()
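To set the autoencoder next to the PCA reconstruction error computed earlier, the cell below is a minimal illustrative sketch that reuses variables defined above (autoencoder, X_train_tensor, pca_mse); it is an addition, and the two numbers are not directly comparable since the autoencoder bottleneck here has 2 dimensions while the PCA reconstruction used 50 components.
In [ ]:
# Illustrative comparison cell (not part of the original notebook):
# reconstruct the training set with the trained autoencoder and compute its MSE
autoencoder.eval()
with torch.no_grad():
    X_reconstructed_ae = autoencoder(X_train_tensor.to(device)).cpu().numpy()

ae_mse = np.mean((X_train - X_reconstructed_ae) ** 2)
print(f"Autoencoder Reconstruction MSE (2-dim code): {ae_mse:.6f}")
print(f"PCA Reconstruction MSE (50 components):      {pca_mse:.6f}")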