-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathGSE50409.Rmd
281 lines (244 loc) · 8.43 KB
/
GSE50409.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
---
title: 'GSE50409'
output: html_document
---
# Authors
- Leonas Rėčkus
- Liza Tofan
- Aurimas Vilys
- Karolina Butrimaitė
- Ignas Labalaukis
- Domas Motiejūnas
# Analysis
```{r setup, echo=FALSE, include=FALSE}
library(GEOquery)
library(data.table)
library(impute)
library(limma)
library(DT)
```
- Automatically download the data from GEO
```{r echo=FALSE, include=FALSE}
gse50409 <- getGEO("gse50409", destdir="./")
```
- Obtain the matrix of beta values where each row corresponds to probes and each column corresponds to samples
```{r}
matrix <- exprs(gse50409[[1]])
head(rownames(matrix))
head(colnames(matrix))
```
- How many samples and how many probes do you have in the data?
Number of probes:
```{r}
probes <- rownames(matrix)
length(probes)
```
Number of samples:
```{r}
samplesNames <- colnames(matrix)
length(samplesNames)
```
- How are the beta values distributed?
```{r out.width = '100%'}
hist(matrix, breaks=1000)
```
- Do your probes have names?
These are the probe names:
```{r}
head(rownames(matrix))
```
- Do you have annotation that tells the coordinate (in hg19) of each probe and its genomic features (such as related gene name)?
```{r}
annotation <- getGEO("GPL13534", destdir = "./")
annotation <- Table(annotation)
# setDT(annotation)
# Now match the rows in annotation that are present in our data
commonProbes <- intersect(annotation$ID, rownames(matrix))
```
Number of probes for which hg19 annotation is available: `r length(commonProbes)`
- Cell count estimates
```{r}
fname <- "GSE50409_cellCounts.csv"
if (!file.exists(fname)) {
require(meffil)
estimates <- meffil.estimate.cell.counts.from.betas(matrix,
cell.type.reference="blood gse35069", verbose = TRUE)
write.csv(estimates, file=fname, row.names=TRUE)
}
estimates <- read.csv(fname)
head(estimates)
```
```{r}
i <- match(commonProbes, annotation$ID)
annotation <- annotation[i, ]
i <- match(commonProbes, rownames(matrix))
matrix <- matrix[i,]
stopifnot(all(rownames(matrix) == annotation$ID))
head(annotation)
```
- Do you know which samples correspond to healthy individuals, and which samples correspond to the sick ones?
```{r}
disease <- pData(phenoData(gse50409[[1]]))
as.character(disease[1:20,1])
```
- For each probe compute a t-test to verify if the distributions of beta values within the probe significantly differ between the two groups.
```{r}
sick <- which(disease$source_name_ch1 == "Bladder cancer case")
control <- which(disease$source_name_ch1 != "Bladder cancer case")
t.test(matrix[1,control], matrix[1,sick])
```
- From the t-test, obtain the p value.
```{r}
pvals <- apply(matrix,1,function(x) {t.test(x[sick],x[control])$p.value})
hist(pvals)
observed <- mean(pvals < 0.05)
set.seed(123)
## >>> PG: groups wasn't defined (created dummy object)
groups <- sample(0:1, ncol(matrix), replace = TRUE)
expected <- numeric(100)
## >>> PG: decreased number of permutations
for (iteration in 1:2) {
groupsRandom <- sample(groups)
pvalsRandom <- apply(matrix[1:1000,],1,function(x) {t.test(x ~ groupsRandom)$p.value})
expected[iteration] <- mean(pvalsRandom < 0.05)
print(iteration)
}
hist(expected, xlim=c(0, 0.5), xlab="Expected fraction of significant probes")
abline(v=observed, col="red")
mean(observed < expected)
```
- Plot the distribution of p values. What is the expected distribution? How dows it differ from what you get?
```{r out.width = '100%'}
hist(pvals, breaks = 1000)
#The p-value graph should be distributed around zero. What we got appears to be the expected result, because in the graph it is clear that frequency of probes with p-value zero or very close to it is the highest, while the frequency of probes with bigger p-value plummets extremely.
```
- Performance-wise, how long will it take to compute the test for all probes?
```{r}
system.time(apply(matrix,1,function(x) {t.test(x[sick],x[control])$p.value}))
```
- What is multiple hypothesis testing?
Using and manipulating statistical methods it is more likely to get “significant” data. Due to that, Type 1 Error can occur more often , when we reject correct null hypothesis.
- How should we adjust for multiple hypothesis testing in our case?
```{r}
pvals.adjusted <- p.adjust(pvals, method="BH", n = length(pvals))
# We are using BH/FDR (correction method of Benjamini and Hochberg) correction. This lets us control the false discovery rate, the expected proportion of false discoveries amongst the rejected hypotheses.
```
- Did you find any probes that show statistically significant modification difference between healthy and sick individuals?
```{r}
pvals.significant <- pvals.adjusted[pvals.adjusted < 0.05]
names(pvals.significant)
```
- Where are these probes? What genes are they related to?
```{r}
# UCSC Gene symbols
annotation$UCSC_RefGene_Name[match(names(pvals.significant), annotation$ID)]
# Chromosomes and coordinates
annotation$UCSC_CpG_Islands_Name[match(names(pvals.significant), annotation$ID)]
```
- NEXT STEPS
- Normalizavimas, kvantiliu normalizacija
```{r}
normalized <- normalizeBetweenArrays(matrix)
```
- PCA
```{r out.width = '100%'}
ageSex <- read.csv("./GSE50409_agesexMatrix.csv")
stopifnot(all(ageSex$SampleID == colnames(normalized)))
imputed <- impute.knn(normalized)
pca <- prcomp(t(imputed$data), scale=FALSE)
pairs(pca$x[,1:4], col=as.factor(ageSex$predictedGender))
```
- Reikia atmesti lytiniu chromosomu probus
```{r out.width = '100%'}
sexProbes <- which(annotation$CHR %in% c("X", "Y"))
imputed <- impute.knn(normalized[-sexProbes,])
pca <- prcomp(t(imputed$data), scale=FALSE)
pairs(pca$x[,1:4], col=as.factor(ageSex$predictedGender))
```
- Some samples could be outliers messing up our results. How to detect outliers?
Outlier'iai yra surandami iš pca x reikšmių atėmus vidurkius ir paėmus gauto skaičiaus modulį. Jį palyginame su standartiniu nuokrypiu padaugintu iš 3. Jeigu mūsų gautas skaičius yra didesnis negu standartinis nuokrypis padaugintas iš 3, laikome šį tašką outlieriu.
```{r}
out1 <- abs(pca$x[,1] - mean(pca$x[,1])) > 3*sd(pca$x[,1])
out2 <- abs(pca$x[,2] - mean(pca$x[,2])) > 3*sd(pca$x[,2])
outs <- which(out1 | out2)
```
- Normalizacija
```{r}
normalized <- normalizeBetweenArrays(matrix[-sexProbes, -outs])
```
Atvaizduojame normalizuotą matricą, panaikinus outlierių taškus, bei lytinių chromosomų probus.
```{r}
imputed <- impute.knn(normalized)
pca <- prcomp(t(imputed$data), scale=FALSE)
pairs(pca$x[,1:4], col=as.factor(ageSex$predictedGender))
```
- Thousands of t-tests can be computed at once within seconds. How? limma
```{r out.width = '100%'}
group <- as.factor(as.character(disease$source_name_ch1))
ageSex$Group <- group
cellCounts <- read.csv("./GSE50409_cellCounts.csv")
stopifnot(all(ageSex$SampleID == cellCounts$X))
ageSex$Bcell <- cellCounts$Bcell
ageSex$CD4T <- cellCounts$CD4T
ageSex$CD8T <- cellCounts$CD8T
ageSex$Gran <- cellCounts$Gran
ageSex$Mono <- cellCounts$Mono
ageSex$NK <- cellCounts$NK
model <- model.matrix(~Group + predictedGender + DNAmAge + Bcell + CD4T + CD8T + Gran + Mono + NK , data=ageSex[-outs,])
fit <- lmFit(t(pca$x[, 1:10]), model)
fit <- eBayes(fit)
toptable(fit)
decideTests(fit)
plot(pca$x[,1], pca$x[,3], col=as.factor(ageSex$Group[-outs]))
boxplot(pca$x[,1] ~ as.factor(ageSex$Group[-outs]))
```
```{r}
computeFit <- function(permute = FALSE) {
if (!permute)
{
model <- model.matrix(~ Group + predictedGender + DNAmAge + Bcell + CD4T + CD8T + Gran + Mono + NK, data=ageSex[-outs,])
}
else
{
model <- model.matrix(~ sample(Group) + predictedGender + DNAmAge + Bcell + CD4T + CD8T + Gran + Mono + NK, data=ageSex[-outs,])
}
fit <- lmFit(imputed$data, model)
fit <- eBayes(fit)
fit <- topTable(fit, coef=2, number=nrow(imputed$data), sort.by="none")
return(fit)
}
fit <- computeFit()
```
```{r out.width = '100%'}
#Histogram of p-values
hist(fit$P.Value, breaks=1000)
```
- Permutations
```{r}
set.seed(123)
n <- 100
observed <- mean(fit$P.Value < 0.05)
expected <- numeric(n)
for (i in 1:n)
{
permutedFit <- computeFit(permute = TRUE)
expected[i] <- mean(permutedFit$P.Value < 0.05)
}
p <- mean(expected > observed)
hist(expected, breaks=20, main=paste("The test p =", p))
abline(v=observed, col="red")
```
```{r}
interestingColumns <- c("ID", "CHR", "MAPINFO", "UCSC_RefGene_Name", "UCSC_CpG_Islands_Name")
res <- cbind(fit, annotation[-sexProbes, interestingColumns])
i <- which(res$adj.P.Val < 0.05)
res <- res[i,]
o <- order(res$P.Value)
res <- res[o,]
datatable(res, class = 'cell-border stripe', options = list(
searching = TRUE,
pageLength = 5,
lengthMenu = c(5, 10, 15, 20),
scrollX = TRUE
))
```