From 5d13af80bbb92d14982b169a6c0ed428ddb6ea62 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Wed, 23 Jul 2014 12:06:24 +1000 Subject: [PATCH 01/12] Expanded on motivation for using `gsub` to strip extra information on column names --- analysis.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis.Rmd b/analysis.Rmd index 3edbf0e52..ee5f39fdd 100644 --- a/analysis.Rmd +++ b/analysis.Rmd @@ -50,7 +50,7 @@ paste0("ctl", 1:3) c(paste0("ctl", 1:4), paste0("uvb", 1:5)) ``` -Using `gsub` is a more reproducible way to do this. +An easier way to do this, especially for files with many columns, is to use the `gsub` command to strip out the extra information. This is also more robust to introduced errors, for example if the column order changes at some point in the future. ```{r rename_cols} ## Using gsub -- reproducible ?gsub From 5a01412a88cdabc53d52a1215687329c572539f3 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Wed, 23 Jul 2014 12:07:56 +1000 Subject: [PATCH 02/12] removed magic number, replacing with variable that stores the gene with the maximum number of aligning reads across all samples. --- analysis.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/analysis.Rmd b/analysis.Rmd index ee5f39fdd..8a9b7b853 100644 --- a/analysis.Rmd +++ b/analysis.Rmd @@ -64,8 +64,8 @@ Find the gene with the highest expression in any sample. Extract the expression ```{r exercise_1} max(apply(countdata, 1, max)) #max expression is 7013 -which.max(apply(countdata, 1, max)) #gene is EEF1A1P9 -countdata[13514, ] #get other sample data - max is in uvb1 +top.gene <- which.max(apply(countdata, 1, max)) #gene is EEF1A1P9 +countdata[top.gene, ] #get other sample data - max is in uvb1 #this is a pseudogene - maybe an artefact of only aligning reads to a single chromosome? ``` From 0244f5001cd016a369fe114238e100f5fe3184d3 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Wed, 23 Jul 2014 14:16:02 +1000 Subject: [PATCH 03/12] Introduced the `rowMeans` function as a shortcut, while reinforcing the concept of the `apply` family of functions. --- analysis.Rmd | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/analysis.Rmd b/analysis.Rmd index 8a9b7b853..54da9ce3e 100644 --- a/analysis.Rmd +++ b/analysis.Rmd @@ -84,11 +84,13 @@ colnames(countdata2) grep("ctl", colnames(countdata2)) ctlCols <- grep("ctl", colnames(countdata2)) head(countdata2[,ctlCols]) -countdata2$ctlMean <- apply(countdata2[, ctlCols], 1, mean) +apply(countdata2[, ctlCols], 1, mean) +# here we'll use rowMeans instead, its a convenient shortcut, and also faster! +countdata2$ctlMean <- rowMeans(countdata2[, ctlCols]) #same for uvb uvbCols <- grep("uvb", colnames(countdata2)) -countdata2$uvbMean <- apply(countdata2[, uvbCols], 1, mean) +countdata2$uvbMean <- rowMeans(countdata2[, uvbCols] ``` Plot the mean expression of each gene in control against the UVB sample mean. Look for outliers. From 38572c32f2e294c462b6ba0b309633f19168adef Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Wed, 23 Jul 2014 15:43:19 +1000 Subject: [PATCH 04/12] Added required call to "dir.create" so that the Rmd file can be knit'd (how do I verb knitr?) --- analysis.Rmd | 1 + 1 file changed, 1 insertion(+) diff --git a/analysis.Rmd b/analysis.Rmd index 54da9ce3e..1c8022b12 100644 --- a/analysis.Rmd +++ b/analysis.Rmd @@ -217,6 +217,7 @@ names(resdata)[1] <- "GeneID" head(resdata) sig <- subset(resdata, padj<0.05) +dir.create("results") write.table(sig, file="results/sig.txt", sep="\t") #tab delim data ``` From 759651126ab9f40775190e4422a544f296d9f7e0 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Wed, 23 Jul 2014 15:56:03 +1000 Subject: [PATCH 05/12] Reconstructed count data from previously knit'd md file. --- data/counts.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 data/counts.txt diff --git a/data/counts.txt b/data/counts.txt new file mode 100644 index 000000000..6f0f26a7c --- /dev/null +++ b/data/counts.txt @@ -0,0 +1,7 @@ +Chr Start End Strand Length ctl1.fastq_tophat.accepted_hits.bam ctl2.fastq_tophat.accepted_hits.bam ctl3.fastq_tophat.accepted_hits.bam uvb1.fastq_tophat.accepted_hits.bam uvb2.fastq_tophat.accepted_hits.bam uvb3.fastq_tophat.accepted_hits.bam +DDX11L1 1;1;1;1 11869;12595;12975;13221 12227;12721;13052;14412 +;+;+;+ 1756 0 0 0 0 0 0 +WASH7P 1;1;1;1;1;1;1;1;1;1;1;1;1 14363;14970;15796;16607;16854;17233;17498;17602;17915;18268;24734;29321;29534 14829;15038;15947;16765;17055;17368;17504;17742;18061;18379;24891;29370;29806 -;-;-;-;-;-;-;-;-;-;-;-;- 2073 0 0 0 0 0 0 +MIR1302-10 1;1;1 29554;30267;30976 30039;30667;31109 +;+;+ 1021 0 0 0 0 0 0 +FAM138A 1;1;1 34554;35245;35721 35174;35481;36081 -;-;- 1219 0 0 0 0 0 0 +OR4G4P 1;1 52473;54830 53312;54936 +;+ 947 0 0 0 0 0 0 +OR4G11P 1 62948 63887 + 940 0 0 0 0 0 0 From 025540e446876a2f2fd9074a81fd9ee988ee2612 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Wed, 23 Jul 2014 16:06:41 +1000 Subject: [PATCH 06/12] Spoofed some differential RNAseq count data --- data/counts.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/data/counts.txt b/data/counts.txt index 6f0f26a7c..cfbc57ad1 100644 --- a/data/counts.txt +++ b/data/counts.txt @@ -1,7 +1,7 @@ Chr Start End Strand Length ctl1.fastq_tophat.accepted_hits.bam ctl2.fastq_tophat.accepted_hits.bam ctl3.fastq_tophat.accepted_hits.bam uvb1.fastq_tophat.accepted_hits.bam uvb2.fastq_tophat.accepted_hits.bam uvb3.fastq_tophat.accepted_hits.bam -DDX11L1 1;1;1;1 11869;12595;12975;13221 12227;12721;13052;14412 +;+;+;+ 1756 0 0 0 0 0 0 -WASH7P 1;1;1;1;1;1;1;1;1;1;1;1;1 14363;14970;15796;16607;16854;17233;17498;17602;17915;18268;24734;29321;29534 14829;15038;15947;16765;17055;17368;17504;17742;18061;18379;24891;29370;29806 -;-;-;-;-;-;-;-;-;-;-;-;- 2073 0 0 0 0 0 0 -MIR1302-10 1;1;1 29554;30267;30976 30039;30667;31109 +;+;+ 1021 0 0 0 0 0 0 -FAM138A 1;1;1 34554;35245;35721 35174;35481;36081 -;-;- 1219 0 0 0 0 0 0 -OR4G4P 1;1 52473;54830 53312;54936 +;+ 947 0 0 0 0 0 0 -OR4G11P 1 62948 63887 + 940 0 0 0 0 0 0 +DDX11L1 1;1;1;1 11869;12595;12975;13221 12227;12721;13052;14412 +;+;+;+ 1756 337 322 325 131 158 136 +WASH7P 1;1;1;1;1;1;1;1;1;1;1;1;1 14363;14970;15796;16607;16854;17233;17498;17602;17915;18268;24734;29321;29534 14829;15038;15947;16765;17055;17368;17504;17742;18061;18379;24891;29370;29806 -;-;-;-;-;-;-;-;-;-;-;-;- 2073 319 377 299 213 194 203 +MIR1302-10 1;1;1 29554;30267;30976 30039;30667;31109 +;+;+ 1021 313 348 328 250 247 263 +FAM138A 1;1;1 34554;35245;35721 35174;35481;36081 -;-;- 1219 321 333 340 253 234 276 +OR4G4P 1;1 52473;54830 53312;54936 +;+ 947 322 311 364 378 315 405 +OR4G11P 1 62948 63887 + 940 343 339 335 418 450 437 From 5dec78c3c97531a1fae9f61723251077b06b10ce Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Wed, 23 Jul 2014 16:14:23 +1000 Subject: [PATCH 07/12] Changed some variable names to camelCase for consistency with rest of the naming scheme --- analysis.Rmd | 106 +++++++++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/analysis.Rmd b/analysis.Rmd index 1c8022b12..3eaac0910 100644 --- a/analysis.Rmd +++ b/analysis.Rmd @@ -21,20 +21,20 @@ Import the data as a `data.frame` and examine it. The data is stored in a text f ```{r data_input} ## Filename with output from featureCounts -countfile <- "data/counts.txt" +countFile <- "data/counts.txt" ## Read in the data -countdata <- read.table(countfile, header=TRUE, row.names=1) -head(countdata) -colnames(countdata) -class(countdata) +countData <- read.table(countFile, header=TRUE, row.names=1) +head(countData) +colnames(countData) +class(countData) ``` It contains information about genes (one gene per row) with the gene positions in the first five columns and then information about the number of reads aligning to the gene in each experimental sample. We don't need the information on gene position, so we can remove it from the data frame. ```{r} # Remove first five columns (chr, start, end, strand, length) -countdata <- countdata[ ,-(1:5)] -head(countdata) -colnames(countdata) +countData <- countData[ ,-(1:5)] +head(countData) +colnames(countData) ``` We can rename the columns to something a bit more readable. @@ -54,18 +54,18 @@ An easier way to do this, especially for files with many columns, is to use the ```{r rename_cols} ## Using gsub -- reproducible ?gsub -gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countdata)) -colnames(countdata) <- gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countdata)) -head(countdata) +gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countData)) +colnames(countData) <- gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countData)) +head(countData) ``` ### Exercise 1 Find the gene with the highest expression in any sample. Extract the expression data for this gene for all samples. In which sample does it have the highest expression? What is the function of the gene? Can you suggest why this is the top expressed gene? ```{r exercise_1} -max(apply(countdata, 1, max)) #max expression is 7013 -top.gene <- which.max(apply(countdata, 1, max)) #gene is EEF1A1P9 -countdata[top.gene, ] #get other sample data - max is in uvb1 +max(apply(countData, 1, max)) #max expression is 7013 +topGene <- which.max(apply(countData, 1, max)) #gene is EEF1A1P9 +countData[topGene, ] #get other sample data - max is in uvb1 #this is a pseudogene - maybe an artefact of only aligning reads to a single chromosome? ``` @@ -77,31 +77,31 @@ We can investigate this data a bit more using some of the basic R functions befo We will calculate the mean for each gene for each condition. First make a copy of the data, because we'll need it later. We will work on the copy. ```{r get_means} -countdata2 <- countdata +countData2 <- countData #get Control columns -colnames(countdata2) -grep("ctl", colnames(countdata2)) -ctlCols <- grep("ctl", colnames(countdata2)) -head(countdata2[,ctlCols]) -apply(countdata2[, ctlCols], 1, mean) +colnames(countData2) +grep("ctl", colnames(countData2)) +ctlCols <- grep("ctl", colnames(countData2)) +head(countData2[,ctlCols]) +apply(countData2[, ctlCols], 1, mean) # here we'll use rowMeans instead, its a convenient shortcut, and also faster! -countdata2$ctlMean <- rowMeans(countdata2[, ctlCols]) +countData2$ctlMean <- rowMeans(countData2[, ctlCols]) #same for uvb -uvbCols <- grep("uvb", colnames(countdata2)) -countdata2$uvbMean <- rowMeans(countdata2[, uvbCols] +uvbCols <- grep("uvb", colnames(countData2)) +countData2$uvbMean <- rowMeans(countData2[, uvbCols]) ``` Plot the mean expression of each gene in control against the UVB sample mean. Look for outliers. ```{r plot_means} -plot(countdata2$ctlMean, countdata2$uvbMean) +plot(countData2$ctlMean, countData2$uvbMean) ``` ```{r ggplot_means} library(ggplot2) -ggplot(countdata2, aes(x=ctlMean, y=uvbMean)) + geom_point() +ggplot(countData2, aes(x=ctlMean, y=uvbMean)) + geom_point() ``` ### Exercise 2 @@ -110,11 +110,11 @@ How could you make this plot more informative and look more professional? Hint: `help(par)` will give you information on lots of graphical parameters that can be set. Help for ggplot2 can be found [here](http://docs.ggplot2.org/current/). ```{r exercise2_1} -plot(countdata2$ctlMean, countdata2$uvbMean, log="xy") +plot(countData2$ctlMean, countData2$uvbMean, log="xy") ``` ```{r exercise2_2} -ggplot(countdata2, aes(x=ctlMean, y=uvbMean)) + geom_point() + scale_x_log10() + scale_y_log10() + theme_bw() +ggplot(countData2, aes(x=ctlMean, y=uvbMean)) + geom_point() + scale_x_log10() + scale_y_log10() + theme_bw() ``` There are lots more options you can use to alter the appearance of these plots. @@ -123,35 +123,35 @@ There are lots more options you can use to alter the appearance of these plots. We can find candidate differentially expressed genes by looking for genes with a large change between control and UVB samples. A common threshold used is log2 fold change more than 2 fold. We will calculate log2 fold change for all the genes and colour the genes with log2 fold change more than 2 fold on the plot. ```{r remove_unexpressed} -sum(countdata2$ctlMean > 0) -sum(countdata2$uvbMean > 0) +sum(countData2$ctlMean > 0) +sum(countData2$uvbMean > 0) -nrow(countdata2) -countdata2 <- subset(countdata2, (countdata2$ctlMean > 0 | countdata2$uvbMean > 0)) +nrow(countData2) +countData2 <- subset(countData2, (countData2$ctlMean > 0 | countData2$uvbMean > 0)) #explain | operator meaning OR in this context? -nrow(countdata2) +nrow(countData2) ``` ```{r log2FC} -countdata2$log2FC <- log2(countdata2$uvbMean / countdata2$ctlMean) -sum(countdata2$log2FC > 2) -sum(countdata2$log2FC < -2) +countData2$log2FC <- log2(countData2$uvbMean / countData2$ctlMean) +sum(countData2$log2FC > 2) +sum(countData2$log2FC < -2) ``` Make a new column to store this information in. ```{r outliers} -countdata2$outlier <- FALSE -countdata2$outlier[countdata2$log2FC > 2] <- TRUE -countdata2$outlier[countdata2$log2FC < -2] <- TRUE +countData2$outlier <- FALSE +countData2$outlier[countData2$log2FC > 2] <- TRUE +countData2$outlier[countData2$log2FC < -2] <- TRUE ``` ```{r plot_outliers} -plot(countdata2$ctlMean, countdata2$uvbMean, log="xy") -points(countdata2$ctlMean[countdata2$outlier==TRUE], countdata2$uvbMean[countdata2$outlier==TRUE], col="red") +plot(countData2$ctlMean, countData2$uvbMean, log="xy") +points(countData2$ctlMean[countData2$outlier==TRUE], countData2$uvbMean[countData2$outlier==TRUE], col="red") ``` ```{r ggplot_outliers} -ggplot(countdata2, aes(x=ctlMean, y=uvbMean, colour=outlier)) + geom_point() + scale_x_log10() + scale_y_log10() +ggplot(countData2, aes(x=ctlMean, y=uvbMean, colour=outlier)) + geom_point() + scale_x_log10() + scale_y_log10() ``` ## DESeq2 analysis @@ -174,14 +174,14 @@ It requires the count data to be in matrix form, and an additional dataframe des ```{r convert_to_matrix} # Convert to matrix -class(countdata) -countdata <- as.matrix(countdata) -class(countdata) -head(countdata) +class(countData) +countData <- as.matrix(countData) +class(countData) +head(countData) -# construct coldata dataframe +# construct colData dataframe #three replicates of control and UVB. -coldata <- data.frame(condition=c(rep("ctl", 3), rep("uvb",3)), row.names=colnames(countdata)) +colData <- data.frame(condition=c(rep("ctl", 3), rep("uvb",3)), row.names=colnames(countData)) ``` DESeq works on a particular type of object called a DESeqDataSet. @@ -189,7 +189,7 @@ DESeq works on a particular type of object called a DESeqDataSet. ```{r make_deseqdataset} #introduce how DESeq2 works - type of object it works on etc # instantiate the DESeqDataSet -dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition) +dds <- DESeqDataSetFromMatrix(countData=countData, colData=colData, design=~condition) dds ``` @@ -211,12 +211,12 @@ head(res) Combine DEseq results with the original counts data. Write significant results to a file. ```{r write_results} -resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE) -head(resdata) -names(resdata)[1] <- "GeneID" -head(resdata) +resData <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE) +head(resData) +names(resData)[1] <- "GeneID" +head(resData) -sig <- subset(resdata, padj<0.05) +sig <- subset(resData, padj<0.05) dir.create("results") write.table(sig, file="results/sig.txt", sep="\t") #tab delim data ``` From 89fafde3a87fc19ff16121c1be08fc6e287cb228 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Wed, 23 Jul 2014 16:15:05 +1000 Subject: [PATCH 08/12] knit'd md file now reflects Rmd file --- analysis.md | 560 +++++++++++++++++++++++++++------------------------- 1 file changed, 288 insertions(+), 272 deletions(-) diff --git a/analysis.md b/analysis.md index e83b463f1..f01d234cf 100644 --- a/analysis.md +++ b/analysis.md @@ -18,10 +18,10 @@ Import the data as a `data.frame` and examine it. The data is stored in a text f ```r ## Filename with output from featureCounts -countfile <- "data/counts.txt" +countFile <- "data/counts.txt" ## Read in the data -countdata <- read.table(countfile, header=TRUE, row.names=1) -head(countdata) +countData <- read.table(countFile, header=TRUE, row.names=1) +head(countData) ``` ``` @@ -54,51 +54,51 @@ head(countdata) ## OR4G4P +;+ 947 ## OR4G11P + 940 ## ctl1.fastq_tophat.accepted_hits.bam -## DDX11L1 0 -## WASH7P 0 -## MIR1302-10 0 -## FAM138A 0 -## OR4G4P 0 -## OR4G11P 0 +## DDX11L1 337 +## WASH7P 319 +## MIR1302-10 313 +## FAM138A 321 +## OR4G4P 322 +## OR4G11P 343 ## ctl2.fastq_tophat.accepted_hits.bam -## DDX11L1 0 -## WASH7P 0 -## MIR1302-10 0 -## FAM138A 0 -## OR4G4P 0 -## OR4G11P 0 +## DDX11L1 322 +## WASH7P 377 +## MIR1302-10 348 +## FAM138A 333 +## OR4G4P 311 +## OR4G11P 339 ## ctl3.fastq_tophat.accepted_hits.bam -## DDX11L1 0 -## WASH7P 0 -## MIR1302-10 0 -## FAM138A 0 -## OR4G4P 0 -## OR4G11P 0 +## DDX11L1 325 +## WASH7P 299 +## MIR1302-10 328 +## FAM138A 340 +## OR4G4P 364 +## OR4G11P 335 ## uvb1.fastq_tophat.accepted_hits.bam -## DDX11L1 0 -## WASH7P 0 -## MIR1302-10 0 -## FAM138A 0 -## OR4G4P 0 -## OR4G11P 0 +## DDX11L1 131 +## WASH7P 213 +## MIR1302-10 250 +## FAM138A 253 +## OR4G4P 378 +## OR4G11P 418 ## uvb2.fastq_tophat.accepted_hits.bam -## DDX11L1 0 -## WASH7P 0 -## MIR1302-10 0 -## FAM138A 0 -## OR4G4P 0 -## OR4G11P 0 +## DDX11L1 158 +## WASH7P 194 +## MIR1302-10 247 +## FAM138A 234 +## OR4G4P 315 +## OR4G11P 450 ## uvb3.fastq_tophat.accepted_hits.bam -## DDX11L1 0 -## WASH7P 0 -## MIR1302-10 0 -## FAM138A 0 -## OR4G4P 0 -## OR4G11P 0 +## DDX11L1 136 +## WASH7P 203 +## MIR1302-10 263 +## FAM138A 276 +## OR4G4P 405 +## OR4G11P 437 ``` ```r -colnames(countdata) +colnames(countData) ``` ``` @@ -116,7 +116,7 @@ colnames(countdata) ``` ```r -class(countdata) +class(countData) ``` ``` @@ -127,57 +127,57 @@ It contains information about genes (one gene per row) with the gene positions i ```r # Remove first five columns (chr, start, end, strand, length) -countdata <- countdata[ ,-(1:5)] -head(countdata) +countData <- countData[ ,-(1:5)] +head(countData) ``` ``` ## ctl1.fastq_tophat.accepted_hits.bam -## DDX11L1 0 -## WASH7P 0 -## MIR1302-10 0 -## FAM138A 0 -## OR4G4P 0 -## OR4G11P 0 +## DDX11L1 337 +## WASH7P 319 +## MIR1302-10 313 +## FAM138A 321 +## OR4G4P 322 +## OR4G11P 343 ## ctl2.fastq_tophat.accepted_hits.bam -## DDX11L1 0 -## WASH7P 0 -## MIR1302-10 0 -## FAM138A 0 -## OR4G4P 0 -## OR4G11P 0 +## DDX11L1 322 +## WASH7P 377 +## MIR1302-10 348 +## FAM138A 333 +## OR4G4P 311 +## OR4G11P 339 ## ctl3.fastq_tophat.accepted_hits.bam -## DDX11L1 0 -## WASH7P 0 -## MIR1302-10 0 -## FAM138A 0 -## OR4G4P 0 -## OR4G11P 0 +## DDX11L1 325 +## WASH7P 299 +## MIR1302-10 328 +## FAM138A 340 +## OR4G4P 364 +## OR4G11P 335 ## uvb1.fastq_tophat.accepted_hits.bam -## DDX11L1 0 -## WASH7P 0 -## MIR1302-10 0 -## FAM138A 0 -## OR4G4P 0 -## OR4G11P 0 +## DDX11L1 131 +## WASH7P 213 +## MIR1302-10 250 +## FAM138A 253 +## OR4G4P 378 +## OR4G11P 418 ## uvb2.fastq_tophat.accepted_hits.bam -## DDX11L1 0 -## WASH7P 0 -## MIR1302-10 0 -## FAM138A 0 -## OR4G4P 0 -## OR4G11P 0 +## DDX11L1 158 +## WASH7P 194 +## MIR1302-10 247 +## FAM138A 234 +## OR4G4P 315 +## OR4G11P 450 ## uvb3.fastq_tophat.accepted_hits.bam -## DDX11L1 0 -## WASH7P 0 -## MIR1302-10 0 -## FAM138A 0 -## OR4G4P 0 -## OR4G11P 0 +## DDX11L1 136 +## WASH7P 203 +## MIR1302-10 263 +## FAM138A 276 +## OR4G4P 405 +## OR4G11P 437 ``` ```r -colnames(countdata) +colnames(countData) ``` ``` @@ -203,12 +203,12 @@ paste0("ctl", 1:3) c(paste0("ctl", 1:4), paste0("uvb", 1:5)) ``` -Using `gsub` is a more reproducible way to do this. +An easier way to do this, especially for files with many columns, is to use the `gsub` command to strip out the extra information. This is also more robust to introduced errors, for example if the column order changes at some point in the future. ```r ## Using gsub -- reproducible ?gsub -gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countdata)) +gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countData)) ``` ``` @@ -216,18 +216,18 @@ gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(count ``` ```r -colnames(countdata) <- gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countdata)) -head(countdata) +colnames(countData) <- gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countData)) +head(countData) ``` ``` ## ctl1 ctl2 ctl3 uvb1 uvb2 uvb3 -## DDX11L1 0 0 0 0 0 0 -## WASH7P 0 0 0 0 0 0 -## MIR1302-10 0 0 0 0 0 0 -## FAM138A 0 0 0 0 0 0 -## OR4G4P 0 0 0 0 0 0 -## OR4G11P 0 0 0 0 0 0 +## DDX11L1 337 322 325 131 158 136 +## WASH7P 319 377 299 213 194 203 +## MIR1302-10 313 348 328 250 247 263 +## FAM138A 321 333 340 253 234 276 +## OR4G4P 322 311 364 378 315 405 +## OR4G11P 343 339 335 418 450 437 ``` ### Exercise 1 @@ -235,29 +235,21 @@ Find the gene with the highest expression in any sample. Extract the expression ```r -max(apply(countdata, 1, max)) #max expression is 7013 +max(apply(countData, 1, max)) #max expression is 7013 ``` ``` -## [1] 7013 +## [1] 450 ``` ```r -which.max(apply(countdata, 1, max)) #gene is EEF1A1P9 +topGene <- which.max(apply(countData, 1, max)) #gene is EEF1A1P9 +countData[topGene, ] #get other sample data - max is in uvb1 ``` ``` -## EEF1A1P9 -## 13514 -``` - -```r -countdata[13514, ] #get other sample data - max is in uvb1 -``` - -``` -## ctl1 ctl2 ctl3 uvb1 uvb2 uvb3 -## EEF1A1P9 3570 3788 4345 7013 4217 3630 +## ctl1 ctl2 ctl3 uvb1 uvb2 uvb3 +## OR4G11P 343 339 335 418 450 437 ``` ```r @@ -272,10 +264,10 @@ We will calculate the mean for each gene for each condition. First make a copy o ```r -countdata2 <- countdata +countData2 <- countData #get Control columns -colnames(countdata2) +colnames(countData2) ``` ``` @@ -283,7 +275,7 @@ colnames(countdata2) ``` ```r -grep("ctl", colnames(countdata2)) +grep("ctl", colnames(countData2)) ``` ``` @@ -291,33 +283,43 @@ grep("ctl", colnames(countdata2)) ``` ```r -ctlCols <- grep("ctl", colnames(countdata2)) -head(countdata2[,ctlCols]) +ctlCols <- grep("ctl", colnames(countData2)) +head(countData2[,ctlCols]) ``` ``` ## ctl1 ctl2 ctl3 -## DDX11L1 0 0 0 -## WASH7P 0 0 0 -## MIR1302-10 0 0 0 -## FAM138A 0 0 0 -## OR4G4P 0 0 0 -## OR4G11P 0 0 0 +## DDX11L1 337 322 325 +## WASH7P 319 377 299 +## MIR1302-10 313 348 328 +## FAM138A 321 333 340 +## OR4G4P 322 311 364 +## OR4G11P 343 339 335 +``` + +```r +apply(countData2[, ctlCols], 1, mean) +``` + +``` +## DDX11L1 WASH7P MIR1302-10 FAM138A OR4G4P OR4G11P +## 328.0 331.7 329.7 331.3 332.3 339.0 ``` ```r -countdata2$ctlMean <- apply(countdata2[, ctlCols], 1, mean) +# here we'll use rowMeans instead, its a convenient shortcut, and also faster! +countData2$ctlMean <- rowMeans(countData2[, ctlCols]) #same for uvb -uvbCols <- grep("uvb", colnames(countdata2)) -countdata2$uvbMean <- apply(countdata2[, uvbCols], 1, mean) +uvbCols <- grep("uvb", colnames(countData2)) +countData2$uvbMean <- rowMeans(countData2[, uvbCols]) ``` Plot the mean expression of each gene in control against the UVB sample mean. Look for outliers. ```r -plot(countdata2$ctlMean, countdata2$uvbMean) +plot(countData2$ctlMean, countData2$uvbMean) ``` ![plot of chunk plot_means](./analysis_files/figure-html/plot_means.png) @@ -325,7 +327,7 @@ plot(countdata2$ctlMean, countdata2$uvbMean) ```r library(ggplot2) -ggplot(countdata2, aes(x=ctlMean, y=uvbMean)) + geom_point() +ggplot(countData2, aes(x=ctlMean, y=uvbMean)) + geom_point() ``` ![plot of chunk ggplot_means](./analysis_files/figure-html/ggplot_means.png) @@ -337,19 +339,14 @@ How could you make this plot more informative and look more professional? Hint: ```r -plot(countdata2$ctlMean, countdata2$uvbMean, log="xy") -``` - -``` -## Warning: 56164 x values <= 0 omitted from logarithmic plot -## Warning: 56110 y values <= 0 omitted from logarithmic plot +plot(countData2$ctlMean, countData2$uvbMean, log="xy") ``` ![plot of chunk exercise2_1](./analysis_files/figure-html/exercise2_1.png) ```r -ggplot(countdata2, aes(x=ctlMean, y=uvbMean)) + geom_point() + scale_x_log10() + scale_y_log10() + theme_bw() +ggplot(countData2, aes(x=ctlMean, y=uvbMean)) + geom_point() + scale_x_log10() + scale_y_log10() + theme_bw() ``` ![plot of chunk exercise2_2](./analysis_files/figure-html/exercise2_2.png) @@ -361,84 +358,76 @@ We can find candidate differentially expressed genes by looking for genes with a ```r -sum(countdata2$ctlMean > 0) +sum(countData2$ctlMean > 0) ``` ``` -## [1] 474 +## [1] 6 ``` ```r -sum(countdata2$uvbMean > 0) +sum(countData2$uvbMean > 0) ``` ``` -## [1] 528 +## [1] 6 ``` ```r -nrow(countdata2) +nrow(countData2) ``` ``` -## [1] 56638 +## [1] 6 ``` ```r -countdata2 <- subset(countdata2, (countdata2$ctlMean > 0 | countdata2$uvbMean > 0)) +countData2 <- subset(countData2, (countData2$ctlMean > 0 | countData2$uvbMean > 0)) #explain | operator meaning OR in this context? -nrow(countdata2) +nrow(countData2) ``` ``` -## [1] 565 +## [1] 6 ``` ```r -countdata2$log2FC <- log2(countdata2$uvbMean / countdata2$ctlMean) -sum(countdata2$log2FC > 2) +countData2$log2FC <- log2(countData2$uvbMean / countData2$ctlMean) +sum(countData2$log2FC > 2) ``` ``` -## [1] 170 +## [1] 0 ``` ```r -sum(countdata2$log2FC < -2) +sum(countData2$log2FC < -2) ``` ``` -## [1] 46 +## [1] 0 ``` Make a new column to store this information in. ```r -countdata2$outlier <- FALSE -countdata2$outlier[countdata2$log2FC > 2] <- TRUE -countdata2$outlier[countdata2$log2FC < -2] <- TRUE -``` - - -```r -plot(countdata2$ctlMean, countdata2$uvbMean, log="xy") +countData2$outlier <- FALSE +countData2$outlier[countData2$log2FC > 2] <- TRUE +countData2$outlier[countData2$log2FC < -2] <- TRUE ``` -``` -## Warning: 91 x values <= 0 omitted from logarithmic plot -## Warning: 37 y values <= 0 omitted from logarithmic plot -``` ```r -points(countdata2$ctlMean[countdata2$outlier==TRUE], countdata2$uvbMean[countdata2$outlier==TRUE], col="red") +plot(countData2$ctlMean, countData2$uvbMean, log="xy") +points(countData2$ctlMean[countData2$outlier==TRUE], countData2$uvbMean[countData2$outlier==TRUE], col="red") ``` ![plot of chunk plot_outliers](./analysis_files/figure-html/plot_outliers.png) ```r -ggplot(countdata2, aes(x=ctlMean, y=uvbMean, colour=outlier)) + geom_point() + scale_x_log10() + scale_y_log10() +ggplot(countData2, aes(x=ctlMean, y=uvbMean, colour=outlier)) + geom_point() + scale_x_log10() + scale_y_log10() ``` ![plot of chunk ggplot_outliers](./analysis_files/figure-html/ggplot_outliers.png) @@ -492,10 +481,6 @@ library(DESeq2) ## Loading required package: RcppArmadillo ``` -``` -## Warning: package 'RcppArmadillo' was built under R version 3.1.1 -``` - ```r citation("DESeq2") ``` @@ -523,7 +508,7 @@ It requires the count data to be in matrix form, and an additional dataframe des ```r # Convert to matrix -class(countdata) +class(countData) ``` ``` @@ -531,8 +516,8 @@ class(countdata) ``` ```r -countdata <- as.matrix(countdata) -class(countdata) +countData <- as.matrix(countData) +class(countData) ``` ``` @@ -540,23 +525,23 @@ class(countdata) ``` ```r -head(countdata) +head(countData) ``` ``` ## ctl1 ctl2 ctl3 uvb1 uvb2 uvb3 -## DDX11L1 0 0 0 0 0 0 -## WASH7P 0 0 0 0 0 0 -## MIR1302-10 0 0 0 0 0 0 -## FAM138A 0 0 0 0 0 0 -## OR4G4P 0 0 0 0 0 0 -## OR4G11P 0 0 0 0 0 0 +## DDX11L1 337 322 325 131 158 136 +## WASH7P 319 377 299 213 194 203 +## MIR1302-10 313 348 328 250 247 263 +## FAM138A 321 333 340 253 234 276 +## OR4G4P 322 311 364 378 315 405 +## OR4G11P 343 339 335 418 450 437 ``` ```r -# construct coldata dataframe +# construct colData dataframe #three replicates of control and UVB. -coldata <- data.frame(condition=c(rep("ctl", 3), rep("uvb",3)), row.names=colnames(countdata)) +colData <- data.frame(condition=c(rep("ctl", 3), rep("uvb",3)), row.names=colnames(countData)) ``` DESeq works on a particular type of object called a DESeqDataSet. @@ -565,16 +550,24 @@ DESeq works on a particular type of object called a DESeqDataSet. ```r #introduce how DESeq2 works - type of object it works on etc # instantiate the DESeqDataSet -dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition) +dds <- DESeqDataSetFromMatrix(countData=countData, colData=colData, design=~condition) +``` + +``` +## Warning: some variables in design formula are characters, converting to +## factors +``` + +```r dds ``` ``` ## class: DESeqDataSet -## dim: 56638 6 +## dim: 6 6 ## exptData(0): ## assays(1): counts -## rownames(56638): DDX11L1 WASH7P ... MT-TT MT-TP +## rownames(6): DDX11L1 WASH7P ... OR4G4P OR4G11P ## rowData metadata column names(0): ## colnames(6): ctl1 ctl2 ... uvb2 uvb3 ## colData names(1): condition @@ -593,6 +586,21 @@ dds <- DESeq(dds) ## estimating dispersions ## gene-wise dispersion estimates ## mean-dispersion relationship +``` + +``` +## Warning: the parametric fit of dispersion estimates over the mean of counts +## failed, which occurs when the trend is not well captured by the +## function y = a/x + b. A local regression fit is automatically performed, +## and the analysis can continue. You can specify fitType='local' or 'mean' +## to avoid this message if re-running the same data. +## When using local regression fit, the user should examine plotDispEsts(dds) +## to make sure the fitted line is not sharply curving up or down based on +## the position of individual points. +## Warning: Estimated rdf < 1.0; not estimating variance +``` + +``` ## final dispersion estimates ## fitting model and testing ``` @@ -609,20 +617,20 @@ head(res) ## DataFrame with 6 rows and 6 columns ## baseMean log2FoldChange lfcSE stat pvalue ## -## DDX11L1 0 NA NA NA NA -## WASH7P 0 NA NA NA NA -## MIR1302-10 0 NA NA NA NA -## FAM138A 0 NA NA NA NA -## OR4G4P 0 NA NA NA NA -## OR4G11P 0 NA NA NA NA +## DDX11L1 225.4 -0.800662 0.11188 -7.15620 8.295e-13 +## WASH7P 261.6 -0.316571 0.10258 -3.08609 2.028e-03 +## MIR1302-10 289.2 -0.001925 0.09442 -0.02039 9.837e-01 +## FAM138A 290.4 -0.005237 0.09441 -0.05546 9.558e-01 +## OR4G4P 354.2 0.498655 0.09741 5.11892 3.073e-07 +## OR4G11P 397.6 0.725214 0.07570 9.58062 9.647e-22 ## padj ## -## DDX11L1 NA -## WASH7P NA -## MIR1302-10 NA -## FAM138A NA -## OR4G4P NA -## OR4G11P NA +## DDX11L1 2.488e-12 +## WASH7P 3.042e-03 +## MIR1302-10 9.837e-01 +## FAM138A 9.837e-01 +## OR4G4P 6.146e-07 +## OR4G11P 5.788e-21 ``` ```r @@ -632,7 +640,7 @@ table(res$padj<0.05) ``` ## ## FALSE TRUE -## 337 50 +## 2 4 ``` ```r @@ -645,65 +653,81 @@ head(res) ## log2 fold change (MAP): condition uvb vs ctl ## Wald test p-value: condition uvb vs ctl ## DataFrame with 6 rows and 6 columns -## baseMean log2FoldChange lfcSE stat pvalue padj -## -## ANXA3 312.32 3.910 0.3475 11.251 2.287e-29 8.849e-27 -## SFRP2 1711.27 -3.356 0.3407 -9.852 6.735e-23 1.303e-20 -## MT2P1 167.36 4.067 0.4582 8.878 6.834e-19 8.816e-17 -## IL8 31.70 5.976 0.9456 6.320 2.620e-10 2.535e-08 -## PTPN13 358.34 -1.386 0.2362 -5.869 4.388e-09 3.396e-07 -## CXCL1 42.47 3.691 0.6448 5.724 1.040e-08 6.705e-07 +## baseMean log2FoldChange lfcSE stat pvalue +## +## OR4G11P 397.6 0.725214 0.07570 9.58062 9.647e-22 +## DDX11L1 225.4 -0.800662 0.11188 -7.15620 8.295e-13 +## OR4G4P 354.2 0.498655 0.09741 5.11892 3.073e-07 +## WASH7P 261.6 -0.316571 0.10258 -3.08609 2.028e-03 +## MIR1302-10 289.2 -0.001925 0.09442 -0.02039 9.837e-01 +## FAM138A 290.4 -0.005237 0.09441 -0.05546 9.558e-01 +## padj +## +## OR4G11P 5.788e-21 +## DDX11L1 2.488e-12 +## OR4G4P 6.146e-07 +## WASH7P 3.042e-03 +## MIR1302-10 9.837e-01 +## FAM138A 9.837e-01 ``` Combine DEseq results with the original counts data. Write significant results to a file. ```r -resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE) -head(resdata) +resData <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE) +head(resData) ``` ``` -## Row.names baseMean log2FoldChange lfcSE stat pvalue padj -## 1 ANXA3 312.32 3.910 0.3475 11.251 2.287e-29 8.849e-27 -## 2 SFRP2 1711.27 -3.356 0.3407 -9.852 6.735e-23 1.303e-20 -## 3 MT2P1 167.36 4.067 0.4582 8.878 6.834e-19 8.816e-17 -## 4 IL8 31.70 5.976 0.9456 6.320 2.620e-10 2.535e-08 -## 5 PTPN13 358.34 -1.386 0.2362 -5.869 4.388e-09 3.396e-07 -## 6 CXCL1 42.47 3.691 0.6448 5.724 1.040e-08 6.705e-07 -## ctl1 ctl2 ctl3 uvb1 uvb2 uvb3 -## 1 26.33 35.131 48.83 378.95 783.44 601.27 -## 2 1847.65 4865.598 2683.82 382.63 325.05 162.88 -## 3 19.02 8.107 23.63 438.34 373.97 141.08 -## 4 0.00 0.000 0.00 66.12 44.18 79.93 -## 5 485.69 535.067 538.65 203.25 205.13 182.26 -## 6 11.70 2.702 0.00 89.99 50.49 99.91 +## Row.names baseMean log2FoldChange lfcSE stat pvalue padj +## 1 OR4G11P 397.6 0.725214 0.07570 9.58062 9.647e-22 5.788e-21 +## 2 DDX11L1 225.4 -0.800662 0.11188 -7.15620 8.295e-13 2.488e-12 +## 3 OR4G4P 354.2 0.498655 0.09741 5.11892 3.073e-07 6.146e-07 +## 4 WASH7P 261.6 -0.316571 0.10258 -3.08609 2.028e-03 3.042e-03 +## 5 MIR1302-10 289.2 -0.001925 0.09442 -0.02039 9.837e-01 9.837e-01 +## 6 FAM138A 290.4 -0.005237 0.09441 -0.05546 9.558e-01 9.837e-01 +## ctl1 ctl2 ctl3 uvb1 uvb2 uvb3 +## 1 313.1 288.2 292.5 480.9 541.6 469.3 +## 2 307.6 273.7 283.8 150.7 190.2 146.1 +## 3 293.9 264.4 317.9 434.9 379.1 435.0 +## 4 291.2 320.5 261.1 245.1 233.5 218.0 +## 5 285.7 295.8 286.4 287.6 297.3 282.5 +## 6 293.0 283.1 296.9 291.1 281.6 296.4 ``` ```r -names(resdata)[1] <- "GeneID" -head(resdata) +names(resData)[1] <- "GeneID" +head(resData) ``` ``` -## GeneID baseMean log2FoldChange lfcSE stat pvalue padj ctl1 -## 1 ANXA3 312.32 3.910 0.3475 11.251 2.287e-29 8.849e-27 26.33 -## 2 SFRP2 1711.27 -3.356 0.3407 -9.852 6.735e-23 1.303e-20 1847.65 -## 3 MT2P1 167.36 4.067 0.4582 8.878 6.834e-19 8.816e-17 19.02 -## 4 IL8 31.70 5.976 0.9456 6.320 2.620e-10 2.535e-08 0.00 -## 5 PTPN13 358.34 -1.386 0.2362 -5.869 4.388e-09 3.396e-07 485.69 -## 6 CXCL1 42.47 3.691 0.6448 5.724 1.040e-08 6.705e-07 11.70 -## ctl2 ctl3 uvb1 uvb2 uvb3 -## 1 35.131 48.83 378.95 783.44 601.27 -## 2 4865.598 2683.82 382.63 325.05 162.88 -## 3 8.107 23.63 438.34 373.97 141.08 -## 4 0.000 0.00 66.12 44.18 79.93 -## 5 535.067 538.65 203.25 205.13 182.26 -## 6 2.702 0.00 89.99 50.49 99.91 +## GeneID baseMean log2FoldChange lfcSE stat pvalue padj +## 1 OR4G11P 397.6 0.725214 0.07570 9.58062 9.647e-22 5.788e-21 +## 2 DDX11L1 225.4 -0.800662 0.11188 -7.15620 8.295e-13 2.488e-12 +## 3 OR4G4P 354.2 0.498655 0.09741 5.11892 3.073e-07 6.146e-07 +## 4 WASH7P 261.6 -0.316571 0.10258 -3.08609 2.028e-03 3.042e-03 +## 5 MIR1302-10 289.2 -0.001925 0.09442 -0.02039 9.837e-01 9.837e-01 +## 6 FAM138A 290.4 -0.005237 0.09441 -0.05546 9.558e-01 9.837e-01 +## ctl1 ctl2 ctl3 uvb1 uvb2 uvb3 +## 1 313.1 288.2 292.5 480.9 541.6 469.3 +## 2 307.6 273.7 283.8 150.7 190.2 146.1 +## 3 293.9 264.4 317.9 434.9 379.1 435.0 +## 4 291.2 320.5 261.1 245.1 233.5 218.0 +## 5 285.7 295.8 286.4 287.6 297.3 282.5 +## 6 293.0 283.1 296.9 291.1 281.6 296.4 +``` + +```r +sig <- subset(resData, padj<0.05) +dir.create("results") +``` + +``` +## Warning: 'results' already exists ``` ```r -sig <- subset(resdata, padj<0.05) write.table(sig, file="results/sig.txt", sep="\t") #tab delim data ``` @@ -733,13 +757,13 @@ head(assay(rld)) ``` ``` -## ctl1 ctl2 ctl3 uvb1 uvb2 uvb3 -## DDX11L1 0 0 0 0 0 0 -## WASH7P 0 0 0 0 0 0 -## MIR1302-10 0 0 0 0 0 0 -## FAM138A 0 0 0 0 0 0 -## OR4G4P 0 0 0 0 0 0 -## OR4G11P 0 0 0 0 0 0 +## ctl1 ctl2 ctl3 uvb1 uvb2 uvb3 +## DDX11L1 7.947 7.888 7.906 7.636 7.724 7.624 +## WASH7P 8.078 8.126 8.026 7.998 7.977 7.947 +## MIR1302-10 8.170 8.187 8.171 8.173 8.189 8.165 +## FAM138A 8.186 8.170 8.192 8.183 8.167 8.191 +## OR4G4P 8.369 8.321 8.406 8.565 8.493 8.565 +## OR4G11P 8.500 8.463 8.470 8.718 8.785 8.705 ``` ```r @@ -747,12 +771,12 @@ assay(rld)[1:5,1:5] ``` ``` -## ctl1 ctl2 ctl3 uvb1 uvb2 -## DDX11L1 0 0 0 0 0 -## WASH7P 0 0 0 0 0 -## MIR1302-10 0 0 0 0 0 -## FAM138A 0 0 0 0 0 -## OR4G4P 0 0 0 0 0 +## ctl1 ctl2 ctl3 uvb1 uvb2 +## DDX11L1 7.947 7.888 7.906 7.636 7.724 +## WASH7P 8.078 8.126 8.026 7.998 7.977 +## MIR1302-10 8.170 8.187 8.171 8.173 8.189 +## FAM138A 8.186 8.170 8.192 8.183 8.167 +## OR4G4P 8.369 8.321 8.406 8.565 8.493 ``` ```r @@ -761,11 +785,11 @@ t(assay(rld))[1:5,1:5] ``` ## DDX11L1 WASH7P MIR1302-10 FAM138A OR4G4P -## ctl1 0 0 0 0 0 -## ctl2 0 0 0 0 0 -## ctl3 0 0 0 0 0 -## uvb1 0 0 0 0 0 -## uvb2 0 0 0 0 0 +## ctl1 7.947 8.078 8.170 8.186 8.369 +## ctl2 7.888 8.126 8.187 8.170 8.321 +## ctl3 7.906 8.026 8.171 8.192 8.406 +## uvb1 7.636 7.998 8.173 8.183 8.565 +## uvb2 7.724 7.977 8.189 8.167 8.493 ``` ```r @@ -773,12 +797,12 @@ dist(t(assay(rld))) ``` ``` -## ctl1 ctl2 ctl3 uvb1 uvb2 -## ctl2 12.18 -## ctl3 12.49 13.61 -## uvb1 15.07 16.22 17.29 -## uvb2 20.01 19.73 20.43 15.05 -## uvb3 18.70 18.94 20.21 13.75 11.83 +## ctl1 ctl2 ctl3 uvb1 uvb2 +## ctl2 0.10004 +## ctl3 0.08184 0.13488 +## uvb1 0.43456 0.45264 0.40115 +## uvb2 0.39578 0.42692 0.37861 0.13597 +## uvb3 0.44902 0.46984 0.40809 0.05509 0.15427 ``` ```r @@ -786,13 +810,13 @@ as.matrix(dist(t(assay(rld)))) ``` ``` -## ctl1 ctl2 ctl3 uvb1 uvb2 uvb3 -## ctl1 0.00 12.18 12.49 15.07 20.01 18.70 -## ctl2 12.18 0.00 13.61 16.22 19.73 18.94 -## ctl3 12.49 13.61 0.00 17.29 20.43 20.21 -## uvb1 15.07 16.22 17.29 0.00 15.05 13.75 -## uvb2 20.01 19.73 20.43 15.05 0.00 11.83 -## uvb3 18.70 18.94 20.21 13.75 11.83 0.00 +## ctl1 ctl2 ctl3 uvb1 uvb2 uvb3 +## ctl1 0.00000 0.1000 0.08184 0.43456 0.3958 0.44902 +## ctl2 0.10004 0.0000 0.13488 0.45264 0.4269 0.46984 +## ctl3 0.08184 0.1349 0.00000 0.40115 0.3786 0.40809 +## uvb1 0.43456 0.4526 0.40115 0.00000 0.1360 0.05509 +## uvb2 0.39578 0.4269 0.37861 0.13597 0.0000 0.15427 +## uvb3 0.44902 0.4698 0.40809 0.05509 0.1543 0.00000 ``` ```r @@ -807,10 +831,6 @@ heatmap(sampleDists) library(gplots) ``` -``` -## Warning: package 'gplots' was built under R version 3.1.1 -``` - ``` ## KernSmooth 2.23 loaded ## Copyright M. P. Wand 1997-2009 @@ -871,13 +891,6 @@ hist(res$pvalue, breaks=50, col="grey") # MA Plot par(pch=16) with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x")) -``` - -``` -## Warning: 56073 x values <= 0 omitted from logarithmic plot -``` - -```r with(subset(res, padj<.05), points(baseMean, log2FoldChange, col="red", pch=16)) library(calibrate) ``` @@ -905,8 +918,11 @@ with(subset(res, abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), with(subset(res, padj<.05 & abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), col="green")) ## Add legend legend("topleft", legend=c("FDR<0.05", "|LFC|>1", "both"), pch=16, col=c("red","orange","green")) -## Label points -with(subset(res, padj<.05 & abs(log2FoldChange)>1), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=1)) ``` ![plot of chunk volcano_plot](./analysis_files/figure-html/volcano_plot.png) + +```r +## Label points +with(subset(res, padj<.05 & abs(log2FoldChange)>1), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=1)) +``` From db8e7fbfaacb6035742718d499ca199e2d709df3 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Wed, 23 Jul 2014 16:19:30 +1000 Subject: [PATCH 09/12] Added explanation for a non-obvious function call. --- analysis.Rmd | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/analysis.Rmd b/analysis.Rmd index 3eaac0910..74e61e3b4 100644 --- a/analysis.Rmd +++ b/analysis.Rmd @@ -123,6 +123,11 @@ There are lots more options you can use to alter the appearance of these plots. We can find candidate differentially expressed genes by looking for genes with a large change between control and UVB samples. A common threshold used is log2 fold change more than 2 fold. We will calculate log2 fold change for all the genes and colour the genes with log2 fold change more than 2 fold on the plot. ```{r remove_unexpressed} +# explain: whats happening here, is for each ctlMean there is a TRUE/FALSE for +# if it is greater than 0. TRUE and FALSE can be summed: FALSE is typically +# considered a 0, and TRUE is considered to be 1. So when we call +# `sum(countData2$ctlMean > 0)`, we're really asking, "how many genes have a +# mean above 0 in the control group?" sum(countData2$ctlMean > 0) sum(countData2$uvbMean > 0) From 519288bd30528ebe96fb439e5a5888fc6e6610db Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Wed, 23 Jul 2014 16:21:45 +1000 Subject: [PATCH 10/12] Added some more teacher comments --- analysis.Rmd | 3 +++ 1 file changed, 3 insertions(+) diff --git a/analysis.Rmd b/analysis.Rmd index 74e61e3b4..efca25c5e 100644 --- a/analysis.Rmd +++ b/analysis.Rmd @@ -138,7 +138,10 @@ nrow(countData2) ``` ```{r log2FC} +# explain: what is fold change? why do we use log2 to quantify the fold change? countData2$log2FC <- log2(countData2$uvbMean / countData2$ctlMean) +# again, reinforce that summing a logical vector gives you the number of +# occurences of TRUE. sum(countData2$log2FC > 2) sum(countData2$log2FC < -2) ``` From ce241cb456f8d4e0d475de1380f24377ca55fa00 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Wed, 23 Jul 2014 16:24:15 +1000 Subject: [PATCH 11/12] added explanation for why we want to convert to a matrix --- analysis.Rmd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/analysis.Rmd b/analysis.Rmd index efca25c5e..819660e70 100644 --- a/analysis.Rmd +++ b/analysis.Rmd @@ -181,7 +181,8 @@ citation("DESeq2") It requires the count data to be in matrix form, and an additional dataframe describing the structure of the experiment. ```{r convert_to_matrix} -# Convert to matrix +# countData is currently a data.frame, but DESeq2 expects its input to be in +# matrix format, so we will convert our countData to a matrix. class(countData) countData <- as.matrix(countData) class(countData) From 249c34f2fe00c9cb70ae1a9e19ac664efb52d494 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Wed, 23 Jul 2014 16:26:00 +1000 Subject: [PATCH 12/12] Fixed comment formatting for consistency --- analysis.Rmd | 64 ++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/analysis.Rmd b/analysis.Rmd index 819660e70..b249d5bfa 100644 --- a/analysis.Rmd +++ b/analysis.Rmd @@ -7,7 +7,7 @@ output: This is an introduction to RNAseq analysis for use at Software Carpentry bootcamps that have covered novice R. It involves reading in some count data from an RNAseq experiment, exploring the data using base R functions and then analysis with the package DESeq2. -## Install required CRAN packages +# Install required CRAN packages First, install some packages that you'll use. @@ -20,9 +20,9 @@ install.packages("calibrate") Import the data as a `data.frame` and examine it. The data is stored in a text file with the first line being a header giving the column names, and the row names in the first column. ```{r data_input} -## Filename with output from featureCounts +# Filename with output from featureCounts countFile <- "data/counts.txt" -## Read in the data +# Read in the data countData <- read.table(countFile, header=TRUE, row.names=1) head(countData) colnames(countData) @@ -39,9 +39,9 @@ colnames(countData) We can rename the columns to something a bit more readable. ```{r eval=FALSE} -## Manually +# Manually c("ctl1", "ctl2", "ctl3", "uvb1", "uvb2", "uvb3") -## Using paste +# Using paste ?paste paste("ctl", 1:3) paste("ctl", 1:3, sep="") @@ -52,25 +52,25 @@ c(paste0("ctl", 1:4), paste0("uvb", 1:5)) An easier way to do this, especially for files with many columns, is to use the `gsub` command to strip out the extra information. This is also more robust to introduced errors, for example if the column order changes at some point in the future. ```{r rename_cols} -## Using gsub -- reproducible +# Using gsub -- reproducible ?gsub gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countData)) colnames(countData) <- gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countData)) head(countData) ``` -### Exercise 1 +## Exercise 1 Find the gene with the highest expression in any sample. Extract the expression data for this gene for all samples. In which sample does it have the highest expression? What is the function of the gene? Can you suggest why this is the top expressed gene? ```{r exercise_1} -max(apply(countData, 1, max)) #max expression is 7013 -topGene <- which.max(apply(countData, 1, max)) #gene is EEF1A1P9 -countData[topGene, ] #get other sample data - max is in uvb1 +max(apply(countData, 1, max)) # max expression is 7013 +topGene <- which.max(apply(countData, 1, max)) # gene is EEF1A1P9 +countData[topGene, ] # get other sample data - max is in uvb1 -#this is a pseudogene - maybe an artefact of only aligning reads to a single chromosome? +# this is a pseudogene - maybe an artefact of only aligning reads to a single chromosome? ``` -## Data investigation using base R +# Data investigation using base R We can investigate this data a bit more using some of the basic R functions before going on to use more sophisticated analysis tools. @@ -79,7 +79,7 @@ We will calculate the mean for each gene for each condition. First make a copy o ```{r get_means} countData2 <- countData -#get Control columns +# get Control columns colnames(countData2) grep("ctl", colnames(countData2)) ctlCols <- grep("ctl", colnames(countData2)) @@ -88,7 +88,7 @@ apply(countData2[, ctlCols], 1, mean) # here we'll use rowMeans instead, its a convenient shortcut, and also faster! countData2$ctlMean <- rowMeans(countData2[, ctlCols]) -#same for uvb +# same for uvb uvbCols <- grep("uvb", colnames(countData2)) countData2$uvbMean <- rowMeans(countData2[, uvbCols]) ``` @@ -104,7 +104,7 @@ library(ggplot2) ggplot(countData2, aes(x=ctlMean, y=uvbMean)) + geom_point() ``` -### Exercise 2 +## Exercise 2 How could you make this plot more informative and look more professional? Hint: try using a log scale. Try changing colours, transparencies, sizes, or shapes of points. `help(par)` will give you information on lots of graphical parameters that can be set. Help for ggplot2 can be found [here](http://docs.ggplot2.org/current/). @@ -118,7 +118,7 @@ ggplot(countData2, aes(x=ctlMean, y=uvbMean)) + geom_point() + scale_x_log10() + ``` There are lots more options you can use to alter the appearance of these plots. -##Find candidate differentially expressed genes +#Find candidate differentially expressed genes We can find candidate differentially expressed genes by looking for genes with a large change between control and UVB samples. A common threshold used is log2 fold change more than 2 fold. We will calculate log2 fold change for all the genes and colour the genes with log2 fold change more than 2 fold on the plot. @@ -133,7 +133,7 @@ sum(countData2$uvbMean > 0) nrow(countData2) countData2 <- subset(countData2, (countData2$ctlMean > 0 | countData2$uvbMean > 0)) -#explain | operator meaning OR in this context? +# explain: | operator meaning OR in this context? nrow(countData2) ``` @@ -162,13 +162,13 @@ points(countData2$ctlMean[countData2$outlier==TRUE], countData2$uvbMean[countDat ggplot(countData2, aes(x=ctlMean, y=uvbMean, colour=outlier)) + geom_point() + scale_x_log10() + scale_y_log10() ``` -## DESeq2 analysis +# DESeq2 analysis DESeq2 is an R package for analysis of RNAseq data. It is available from [Bioconductor](http://www.bioconductor.org/). Bioconductor is a project to provide tools for analysing high-throughput genomic data including RNA-seq, ChIP-seq and arrays. You can explore Bioconductor packages [here](http://www.bioconductor.org/packages/release/BiocViews.html#___Software). ```{r install_deseq2, eval=FALSE} -#install and have a break to check everyone is up to date? -#explain bioconductor? +# install and have a break to check everyone is up to date? +# explain bioconductor? source("http://bioconductor.org/biocLite.R") biocLite("DESeq2") ``` @@ -189,14 +189,14 @@ class(countData) head(countData) # construct colData dataframe -#three replicates of control and UVB. +# three replicates of control and UVB. colData <- data.frame(condition=c(rep("ctl", 3), rep("uvb",3)), row.names=colnames(countData)) ``` DESeq works on a particular type of object called a DESeqDataSet. ```{r make_deseqdataset} -#introduce how DESeq2 works - type of object it works on etc +# introduce how DESeq2 works - type of object it works on etc # instantiate the DESeqDataSet dds <- DESeqDataSetFromMatrix(countData=countData, colData=colData, design=~condition) dds @@ -212,7 +212,7 @@ dds <- DESeq(dds) res <- results(dds) head(res) table(res$padj<0.05) -## Order by adjusted p-value +# Order by adjusted p-value res <- res[order(res$padj), ] head(res) ``` @@ -227,10 +227,10 @@ head(resData) sig <- subset(resData, padj<0.05) dir.create("results") -write.table(sig, file="results/sig.txt", sep="\t") #tab delim data +write.table(sig, file="results/sig.txt", sep="\t") # tab delim data ``` -## Data Visualization +# Data Visualization We can also do some exploratory plotting of the data. @@ -239,7 +239,7 @@ plotDispEsts(dds, main="Dispersion plot") ``` ```{r plot_heatmaps} -## Regularized log transformation for clustering/heatmaps, etc +# Regularized log transformation for clustering/heatmaps, etc rld <- rlogTransformation(dds) plotPCA(rld) @@ -251,7 +251,7 @@ dist(t(assay(rld))) as.matrix(dist(t(assay(rld)))) sampleDists <- as.matrix(dist(t(assay(rld)))) heatmap(sampleDists) -## better heatmap with gplots +# better heatmap with gplots library(gplots) heatmap.2(sampleDists) heatmap.2(sampleDists, col=colorpanel(64, "steelblue", "white"), key=FALSE, trace="none") @@ -261,13 +261,13 @@ heatmap.2(sampleDists, col=colorpanel(64, "red", "white", "blue"), key=FALSE, tr ``` ```{r plot_pval_hist} -## Examine plot of p-values +# Examine plot of p-values hist(res$pvalue, breaks=50, col="grey") ``` ```{r MA_plot} -#These are the plots that are most recognisable from papers +# These are the plots that are most recognisable from papers # MA Plot par(pch=16) with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x")) @@ -280,14 +280,14 @@ with(subset(res, padj<.05), textxy(baseMean, log2FoldChange, labs=Gene, cex=1, c ```{r volcano_plot} # Volcano plot -## Set point character +# Set point character par(pch=16) with(res, plot(log2FoldChange, -log10(pvalue), main="Volcano plot")) with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), col="red")) with(subset(res, abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), col="orange")) with(subset(res, padj<.05 & abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), col="green")) -## Add legend +# Add legend legend("topleft", legend=c("FDR<0.05", "|LFC|>1", "both"), pch=16, col=c("red","orange","green")) -## Label points +# Label points with(subset(res, padj<.05 & abs(log2FoldChange)>1), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=1)) ```