Plot Principal Component Analysis (PCA) of 1kGP

Ahmed Moustafa 23 December, 2024 02:05 +0200

Loading Libraries

library(tidyverse)
library(tidylog)
library(printr)
library(Rtsne)
library(umap)

Data Loading

Metadata (populations information)

samples = read_tsv("data/populations.tsv")

## Rows: 4978 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): sample, sex, biosample, population, population_name, superpopulatio...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

head(samples)

sample	sex	biosample	population	population_name	superpopulation	superpopulation_name
HG00271	male	SAME123417	FIN	Finnish	EUR	European Ancestry
HG00276	female	SAME123424	FIN	Finnish	EUR	European Ancestry
HG00288	female	SAME1839246	FIN	Finnish	EUR	European Ancestry
HG00290	male	SAME1839057	FIN	Finnish	EUR	European Ancestry
HG00303	male	SAME1840115	FIN	Finnish	EUR	European Ancestry
HG00308	male	SAME124161	FIN	Finnish	EUR	European Ancestry

Eigenvalues

# Load the eigenvalues
eigenval = read_tsv("working/1kGP_pca.eigenval", col_names = c("value"))

## Rows: 10 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (1): value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

head(eigenval)

value
346.99400
151.24600
42.58400
31.44490
5.55535
5.18856

Eigenvectors

# Load the eigenvectors
eigenvec = read.table("working/1kGP_pca.eigenvec", sep = " ")
dim(eigenvec)

## [1] 3202   12

# Display the first few rows of the PCA data to inspect the structure
head(eigenvec)

V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12
HG00096	HG00096	-0.0097912	0.0246988	0.0030017	0.0175045	-0.0001772	-0.0215443	-0.0071070	0.0040074	-0.0118802	-0.0035593
HG00097	HG00097	-0.0086030	0.0246735	0.0021655	0.0173607	-0.0055367	0.0030765	-0.0026379	0.0000937	0.0077455	0.0066996
HG00099	HG00099	-0.0094013	0.0242093	0.0042851	0.0207030	-0.0072768	-0.0173237	-0.0015438	-0.0044063	-0.0047394	0.0044840
HG00100	HG00100	-0.0097434	0.0227575	-0.0006287	0.0176841	-0.0082152	-0.0113509	-0.0065315	0.0004087	-0.0045692	-0.0093612
HG00101	HG00101	-0.0094940	0.0236118	0.0040419	0.0183175	0.0007958	-0.0149010	-0.0088209	0.0077011	0.0024636	0.0284314
HG00102	HG00102	-0.0100465	0.0232639	0.0028574	0.0144682	-0.0013663	-0.0188499	0.0036854	-0.0134158	-0.0000633	-0.0032662

pcs = eigenvec[, 3:ncol(eigenvec)]
colnames(pcs) = paste0("PC", 1:ncol(pcs))
rownames(pcs) = eigenvec$V1
head(pcs)

	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10
HG00096	-0.0097912	0.0246988	0.0030017	0.0175045	-0.0001772	-0.0215443	-0.0071070	0.0040074	-0.0118802	-0.0035593
HG00097	-0.0086030	0.0246735	0.0021655	0.0173607	-0.0055367	0.0030765	-0.0026379	0.0000937	0.0077455	0.0066996
HG00099	-0.0094013	0.0242093	0.0042851	0.0207030	-0.0072768	-0.0173237	-0.0015438	-0.0044063	-0.0047394	0.0044840
HG00100	-0.0097434	0.0227575	-0.0006287	0.0176841	-0.0082152	-0.0113509	-0.0065315	0.0004087	-0.0045692	-0.0093612
HG00101	-0.0094940	0.0236118	0.0040419	0.0183175	0.0007958	-0.0149010	-0.0088209	0.0077011	0.0024636	0.0284314
HG00102	-0.0100465	0.0232639	0.0028574	0.0144682	-0.0013663	-0.0188499	0.0036854	-0.0134158	-0.0000633	-0.0032662

pca_df = data.frame(sample = row.names(pcs), pcs) %>% inner_join(samples)

## Joining with `by = join_by(sample)`
## inner_join: added 6 columns (sex, biosample, population, population_name,
## superpopulation, …)
## > rows only in x ( 0)
## > rows only in samples (1,776)
## > matched rows 3,202
## > =======
## > rows total 3,202

head(pca_df)

sample	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	sex	biosample	population	population_name	superpopulation	superpopulation_name
HG00096	-0.0097912	0.0246988	0.0030017	0.0175045	-0.0001772	-0.0215443	-0.0071070	0.0040074	-0.0118802	-0.0035593	male	SAME123268	GBR	British	EUR	European Ancestry
HG00097	-0.0086030	0.0246735	0.0021655	0.0173607	-0.0055367	0.0030765	-0.0026379	0.0000937	0.0077455	0.0066996	female	SAME123267	GBR	British	EUR	European Ancestry
HG00099	-0.0094013	0.0242093	0.0042851	0.0207030	-0.0072768	-0.0173237	-0.0015438	-0.0044063	-0.0047394	0.0044840	female	SAME123271	GBR	British	EUR	European Ancestry
HG00100	-0.0097434	0.0227575	-0.0006287	0.0176841	-0.0082152	-0.0113509	-0.0065315	0.0004087	-0.0045692	-0.0093612	female	SAME125154	GBR	British	EUR	European Ancestry
HG00101	-0.0094940	0.0236118	0.0040419	0.0183175	0.0007958	-0.0149010	-0.0088209	0.0077011	0.0024636	0.0284314	male	SAME125153	GBR	British	EUR	European Ancestry
HG00102	-0.0100465	0.0232639	0.0028574	0.0144682	-0.0013663	-0.0188499	0.0036854	-0.0134158	-0.0000633	-0.0032662	female	SAME123945	GBR	British	EUR	European Ancestry

Variance Explained

variance_explained = tibble(pc = paste0("PC", 1:nrow(eigenval)), var = eigenval$value / sum(eigenval))
head(variance_explained)

pc	var
PC1	0.5805956
PC2	0.2530671
PC3	0.0712522
PC4	0.0526141
PC5	0.0092953
PC6	0.0086816

Plotting the Variance Explained

ggplot(variance_explained %>% top_n(3)) + 
  geom_bar(aes(x = pc, y = var), stat = "identity") + 
  theme_light() +
  labs (x = "PC", y = "Variance Explained") +
  scale_y_continuous(labels = scales::percent)

## Selecting by var
## top_n: removed 7 rows (70%), 3 rows remaining

Visualizing PCA Clustering

By superpopulation

ggplot(pca_df) + 
  geom_jitter(aes(x = PC1, y = PC2, color = superpopulation), alpha = 0.7) + 
  theme_light()

By population

ggplot(pca_df) + 
  geom_jitter(aes(x = PC1, y = PC2, color = population), alpha = 0.7) + 
  theme_light()

Alternative Clustering using t-SNE

Perform t-SNE

# Perform t-SNE
set.seed(123)  # Set a seed for reproducibility
tsne_result = Rtsne(as.matrix(pcs[,1:3]), dims = 2, perplexity = 30, theta = 0.5, verbose = TRUE)

## Performing PCA
## Read the 3202 x 3 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.09 seconds (sparsity = 0.035335)!
## Learning embedding...
## Iteration 50: error is 79.204083 (50 iterations in 0.16 seconds)
## Iteration 100: error is 64.772105 (50 iterations in 0.14 seconds)
## Iteration 150: error is 62.385642 (50 iterations in 0.16 seconds)
## Iteration 200: error is 61.401533 (50 iterations in 0.16 seconds)
## Iteration 250: error is 60.846001 (50 iterations in 0.16 seconds)
## Iteration 300: error is 1.500679 (50 iterations in 0.14 seconds)
## Iteration 350: error is 1.140660 (50 iterations in 0.15 seconds)
## Iteration 400: error is 0.968229 (50 iterations in 0.14 seconds)
## Iteration 450: error is 0.873545 (50 iterations in 0.12 seconds)
## Iteration 500: error is 0.827076 (50 iterations in 0.12 seconds)
## Iteration 550: error is 0.806732 (50 iterations in 0.13 seconds)
## Iteration 600: error is 0.790258 (50 iterations in 0.13 seconds)
## Iteration 650: error is 0.777408 (50 iterations in 0.13 seconds)
## Iteration 700: error is 0.768881 (50 iterations in 0.12 seconds)
## Iteration 750: error is 0.760994 (50 iterations in 0.13 seconds)
## Iteration 800: error is 0.753550 (50 iterations in 0.12 seconds)
## Iteration 850: error is 0.746623 (50 iterations in 0.12 seconds)
## Iteration 900: error is 0.739908 (50 iterations in 0.12 seconds)
## Iteration 950: error is 0.734250 (50 iterations in 0.13 seconds)
## Iteration 1000: error is 0.729056 (50 iterations in 0.13 seconds)
## Fitting performed in 2.73 seconds.

# The results are stored in tsne_result$Y
head(tsne_result$Y)


13.45648	-6.278075
20.47906	-9.302024
11.36764	1.970537
34.88028	-7.055738
13.59423	1.797826
20.18693	-0.624645

Visualizing t-SNE Clustering

# Create a data frame for plotting
tsne_df = data.frame(TSNE1 = tsne_result$Y[, 1], 
                     TSNE2 = tsne_result$Y[, 2], 
                     sample = rownames(pcs)) %>% inner_join(samples)

## Joining with `by = join_by(sample)`
## inner_join: added 6 columns (sex, biosample, population, population_name,
## superpopulation, …)
## > rows only in x ( 0)
## > rows only in samples (1,776)
## > matched rows 3,202
## > =======
## > rows total 3,202

head(tsne_df)

TSNE1	TSNE2	sample	sex	biosample	population	population_name	superpopulation	superpopulation_name
13.45648	-6.278075	HG00096	male	SAME123268	GBR	British	EUR	European Ancestry
20.47906	-9.302024	HG00097	female	SAME123267	GBR	British	EUR	European Ancestry
11.36764	1.970537	HG00099	female	SAME123271	GBR	British	EUR	European Ancestry
34.88028	-7.055738	HG00100	female	SAME125154	GBR	British	EUR	European Ancestry
13.59423	1.797826	HG00101	male	SAME125153	GBR	British	EUR	European Ancestry
20.18693	-0.624645	HG00102	female	SAME123945	GBR	British	EUR	European Ancestry

By superpopulation

ggplot(tsne_df, aes(x = TSNE1, y = TSNE2, color = superpopulation), alpha = 0.7) +
  geom_point() +
  theme_light() +
  labs(x = "t-SNE1", y = "t-SNE2")

By population

ggplot(tsne_df, aes(x = TSNE1, y = TSNE2, color = population), alpha = 0.7) +
  geom_point() +
  theme_light() +
  labs(x = "t-SNE1", y = "t-SNE2")

Alternative Clustering using UMAP

Setting UMAP parameters

# Set parameters for UMAP
umap_config = umap::umap.defaults
umap_config$n_neighbors = 20
umap_config$min_dist = 0.9
umap_config$metric = "euclidean"

Perform UMAP

# Perform UMAP
set.seed(123)  # Set a seed for reproducibility
umap_result = umap(as.matrix(pcs[, 1:3]), config = umap_config)

# The results are stored in umap_result$layout
head(umap_result$layout)


HG00096	1.2041862	-2.8331560
HG00097	3.3251365	-2.3108645
HG00099	-0.9658144	-0.7895916
HG00100	6.8363274	0.7601992
HG00101	-0.0701956	-0.3116174
HG00102	1.6608518	0.2403880

Visualizing UMAP Clustering

# Create a data frame for plotting
umap_df = data.frame(UMAP1 = umap_result$layout[, 1], 
                     UMAP2 = umap_result$layout[, 2], 
                     sample = rownames(pcs)) %>% inner_join(samples)

## Joining with `by = join_by(sample)`
## inner_join: added 6 columns (sex, biosample, population, population_name,
## superpopulation, …)
## > rows only in x ( 0)
## > rows only in samples (1,776)
## > matched rows 3,202
## > =======
## > rows total 3,202

head(umap_df)

UMAP1	UMAP2	sample	sex	biosample	population	population_name	superpopulation	superpopulation_name
1.2041862	-2.8331560	HG00096	male	SAME123268	GBR	British	EUR	European Ancestry
3.3251365	-2.3108645	HG00097	female	SAME123267	GBR	British	EUR	European Ancestry
-0.9658144	-0.7895916	HG00099	female	SAME123271	GBR	British	EUR	European Ancestry
6.8363274	0.7601992	HG00100	female	SAME125154	GBR	British	EUR	European Ancestry
-0.0701956	-0.3116174	HG00101	male	SAME125153	GBR	British	EUR	European Ancestry
1.6608518	0.2403880	HG00102	female	SAME123945	GBR	British	EUR	European Ancestry

By superpopulation

# Plot the UMAP results
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = superpopulation), alpha = 0.7) +
  geom_point() +
  theme_light() +
  labs(x = "UMAP1", y = "UMAP2")

By population

# Plot the UMAP results
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = population), alpha = 0.7) +
  geom_point() +
  theme_light() +
  labs(x = "UMAP1", y = "UMAP2")

Session Info

sessionInfo()

## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin24.1.0
## Running under: macOS Sequoia 15.2
## 
## Matrix products: default
## BLAS:   /opt/homebrew/Cellar/openblas/0.3.28/lib/libopenblasp-r0.3.28.dylib 
## LAPACK: /opt/homebrew/Cellar/r/4.4.2_2/lib/R/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Africa/Cairo
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] umap_0.2.10.0   Rtsne_0.17      printr_0.3      tidylog_1.1.0  
##  [5] lubridate_1.9.4 forcats_1.0.0   stringr_1.5.1   dplyr_1.1.4    
##  [9] purrr_1.0.2     readr_2.1.5     tidyr_1.3.1     tibble_3.2.1   
## [13] ggplot2_3.5.1   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] generics_0.1.3    stringi_1.8.4     lattice_0.22-6    hms_1.1.3        
##  [5] digest_0.6.37     magrittr_2.0.3    evaluate_1.0.1    grid_4.4.2       
##  [9] timechange_0.3.0  fastmap_1.2.0     jsonlite_1.8.9    Matrix_1.7-1     
## [13] RSpectra_0.16-2   scales_1.3.0      cli_3.6.3         crayon_1.5.3     
## [17] rlang_1.1.4       bit64_4.5.2       munsell_0.5.1     withr_3.0.2      
## [21] yaml_2.3.10       parallel_4.4.2    tools_4.4.2       tzdb_0.4.0       
## [25] colorspace_2.1-1  reticulate_1.39.0 png_0.1-8         vctrs_0.6.5      
## [29] R6_2.5.1          lifecycle_1.0.4   bit_4.5.0.1       vroom_1.6.5      
## [33] clisymbols_1.2.0  pkgconfig_2.0.3   pillar_1.10.0     gtable_0.3.6     
## [37] glue_1.7.0        Rcpp_1.0.13       highr_0.11        xfun_0.47        
## [41] tidyselect_1.2.1  rstudioapi_0.17.1 knitr_1.48        farver_2.1.2     
## [45] htmltools_0.5.8.1 labeling_0.4.3    rmarkdown_2.28    compiler_4.4.2   
## [49] askpass_1.2.1     openssl_2.3.0

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

plot-1kGP-pca.md

plot-1kGP-pca.md

Plot Principal Component Analysis (PCA) of 1kGP

Loading Libraries

Data Loading

Metadata (populations information)

Eigenvalues

Eigenvectors

Variance Explained

Plotting the Variance Explained

Visualizing PCA Clustering

By superpopulation

By population

Alternative Clustering using t-SNE

Perform t-SNE

Visualizing t-SNE Clustering

By superpopulation

By population

Alternative Clustering using UMAP

Setting UMAP parameters

Perform UMAP

Visualizing UMAP Clustering

By superpopulation

By population

Further Readings

Session Info

Files

plot-1kGP-pca.md

Latest commit

History

plot-1kGP-pca.md

File metadata and controls

Plot Principal Component Analysis (PCA) of 1kGP

Loading Libraries

Data Loading

Metadata (populations information)

Eigenvalues

Eigenvectors

Variance Explained

Plotting the Variance Explained

Visualizing PCA Clustering

By superpopulation

By population

Alternative Clustering using t-SNE

Perform t-SNE

Visualizing t-SNE Clustering

By superpopulation

By population

Alternative Clustering using UMAP

Setting UMAP parameters

Perform UMAP

Visualizing UMAP Clustering

By superpopulation

By population

Further Readings

Session Info