Merge branch 'sritchie73-bioinfo-r-capstone-aus' into bioinfo-r-capstone

rbeagrie · Jul 23, 2014 · b803492 · b803492
2 parents 5bbba76 + aae6fe6
commit b803492
Show file tree

Hide file tree

Showing 3 changed files with 395 additions and 360 deletions.
diff --git a/data/counts.txt b/data/counts.txt
@@ -0,0 +1,7 @@
+Chr	Start	End	Strand	Length	ctl1.fastq_tophat.accepted_hits.bam	ctl2.fastq_tophat.accepted_hits.bam	ctl3.fastq_tophat.accepted_hits.bam	uvb1.fastq_tophat.accepted_hits.bam	uvb2.fastq_tophat.accepted_hits.bam	uvb3.fastq_tophat.accepted_hits.bam
+DDX11L1	1;1;1;1	11869;12595;12975;13221	12227;12721;13052;14412	+;+;+;+	1756	337	322	325	131	158	136
+WASH7P	1;1;1;1;1;1;1;1;1;1;1;1;1	14363;14970;15796;16607;16854;17233;17498;17602;17915;18268;24734;29321;29534	14829;15038;15947;16765;17055;17368;17504;17742;18061;18379;24891;29370;29806	-;-;-;-;-;-;-;-;-;-;-;-;-	2073	319	377	299	213	194	203
+MIR1302-10	1;1;1	29554;30267;30976	30039;30667;31109	+;+;+	1021	313	348	328	250	247	263
+FAM138A	1;1;1	34554;35245;35721	35174;35481;36081	-;-;-	1219	321	333	340	253	234	276
+OR4G4P	1;1	52473;54830	53312;54936	+;+	947	322	311	364	378	315	405
+OR4G11P	1	62948	63887	+	940	343	339	335	418	450	437
diff --git a/novice/capstones/deseq_capstone/analysis.Rmd b/novice/capstones/deseq_capstone/analysis.Rmd
@@ -7,7 +7,7 @@ output:
 
 This is an introduction to RNAseq analysis for use at Software Carpentry bootcamps that have covered novice R. It involves reading in some count data from an RNAseq experiment, exploring the data using base R functions and then analysis with the package DESeq2.
 
-## Install required CRAN packages
+# Install required CRAN packages
 
 First, install some packages that you'll use.
 
@@ -20,28 +20,28 @@ install.packages("calibrate")
 Import the data as a `data.frame` and examine it. The data is stored in a text file with the first line being a header giving the column names, and the row names in the first column. 
 
 ```{r data_input}
-## Filename with output from featureCounts
-countfile <- "data/counts.txt"
-## Read in the data
-countdata <- read.table(countfile, header=TRUE, row.names=1)
-head(countdata)
-colnames(countdata)
-class(countdata)
+# Filename with output from featureCounts
+countFile <- "data/counts.txt"
+# Read in the data
+countData <- read.table(countFile, header=TRUE, row.names=1)
+head(countData)
+colnames(countData)
+class(countData)
 ```
 It contains information about genes (one gene per row) with the gene positions in the first five columns and then information about the number of reads aligning to the gene in each experimental sample. We don't need the information on gene position, so we can remove it from the data frame.
 
 ```{r}
 # Remove first five columns (chr, start, end, strand, length)
-countdata <- countdata[ ,-(1:5)]
-head(countdata)
-colnames(countdata)
+countData <- countData[ ,-(1:5)]
+head(countData)
+colnames(countData)
 ```
 
 We can rename the columns to something a bit more readable.
 ```{r eval=FALSE}
-## Manually
+# Manually
 c("ctl1", "ctl2", "ctl3", "uvb1", "uvb2", "uvb3")
-## Using paste
+# Using paste
 ?paste
 paste("ctl", 1:3)
 paste("ctl", 1:3, sep="")
@@ -50,115 +50,125 @@ paste0("ctl", 1:3)
 c(paste0("ctl", 1:4), paste0("uvb", 1:5))
 ```
 
-Using `gsub` is a more reproducible way to do this.
+An easier way to do this, especially for files with many columns, is to use the `gsub` command to strip out the extra information. This is also more robust to introduced errors, for example if the column order changes at some point in the future.
 ```{r rename_cols}
-## Using gsub -- reproducible
+# Using gsub -- reproducible
 ?gsub
-gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countdata))
-colnames(countdata) <- gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countdata))
-head(countdata)
+gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countData))
+colnames(countData) <- gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countData))
+head(countData)
 ```
 
-### Exercise 1
+## Exercise 1
 Find the gene with the highest expression in any sample. Extract  the expression data for this gene for all samples. In which sample does it have the highest expression? What is the function of the gene? Can you suggest why this is the top expressed gene?
 
 ```{r exercise_1}
-max(apply(countdata, 1, max)) #max expression is 7013
-which.max(apply(countdata, 1, max)) #gene is EEF1A1P9
-countdata[13514, ] #get other sample data - max is in uvb1
+max(apply(countData, 1, max)) # max expression is 7013
+topGene <- which.max(apply(countData, 1, max)) # gene is EEF1A1P9
+countData[topGene, ] # get other sample data - max is in uvb1
 
-#this is a pseudogene - maybe an artefact of only aligning reads to a single chromosome?
+# this is a pseudogene - maybe an artefact of only aligning reads to a single chromosome?
 ```
 
-## Data investigation using base R
+# Data investigation using base R
 
 We can investigate this data a bit more using some of the basic R functions before going on to use more sophisticated analysis tools.
 
 We will calculate the mean for each gene for each condition. First make a copy of the data, because we'll need it later. We will work on the copy.
 
 ```{r get_means}
-countdata2 <- countdata
-
-#get Control columns
-colnames(countdata2)
-grep("ctl", colnames(countdata2))
-ctlCols <- grep("ctl", colnames(countdata2))
-head(countdata2[,ctlCols])
-countdata2$ctlMean <- apply(countdata2[, ctlCols], 1, mean)
-
-#same for uvb
-uvbCols <- grep("uvb", colnames(countdata2))
-countdata2$uvbMean <- apply(countdata2[, uvbCols], 1, mean)
+countData2 <- countData
+
+# get Control columns
+colnames(countData2)
+grep("ctl", colnames(countData2))
+ctlCols <- grep("ctl", colnames(countData2))
+head(countData2[,ctlCols])
+apply(countData2[, ctlCols], 1, mean) 
+# here we'll use rowMeans instead, its a convenient shortcut, and also faster!
+countData2$ctlMean <- rowMeans(countData2[, ctlCols])
+
+# same for uvb
+uvbCols <- grep("uvb", colnames(countData2))
+countData2$uvbMean <- rowMeans(countData2[, uvbCols])
 ```
 
 Plot the mean expression of each gene in control against the UVB sample mean. Look for outliers.
 
 ```{r plot_means}
-plot(countdata2$ctlMean, countdata2$uvbMean)
+plot(countData2$ctlMean, countData2$uvbMean)
 ```
 
 ```{r ggplot_means}
 library(ggplot2)
-ggplot(countdata2, aes(x=ctlMean, y=uvbMean)) + geom_point()
+ggplot(countData2, aes(x=ctlMean, y=uvbMean)) + geom_point()
 ```
 
-### Exercise 2
+## Exercise 2
 How could you make this plot more informative and look more professional? Hint: try using a log scale. Try changing colours, transparencies, sizes, or shapes of points. 
 
 `help(par)` will give you information on lots of graphical parameters that can be set. Help for ggplot2 can be found [here](http://docs.ggplot2.org/current/).
 
 ```{r exercise2_1}
-plot(countdata2$ctlMean, countdata2$uvbMean, log="xy")
+plot(countData2$ctlMean, countData2$uvbMean, log="xy")
 ```
 
 ```{r exercise2_2}
-ggplot(countdata2, aes(x=ctlMean, y=uvbMean)) + geom_point() + scale_x_log10() + scale_y_log10() + theme_bw()
+ggplot(countData2, aes(x=ctlMean, y=uvbMean)) + geom_point() + scale_x_log10() + scale_y_log10() + theme_bw()
 ```
 There are lots more options you can use to alter the appearance of these plots.
 
-##Find candidate differentially expressed genes
+#Find candidate differentially expressed genes
 
 We can find candidate differentially expressed genes by looking for genes with a large change between control and UVB samples. A common threshold used is log2 fold change more than 2 fold. We will calculate log2 fold change for all the genes and colour the genes with log2 fold change more than 2 fold on the plot.
 
 ```{r remove_unexpressed}
-sum(countdata2$ctlMean > 0)
-sum(countdata2$uvbMean > 0)
-
-nrow(countdata2)
-countdata2 <- subset(countdata2, (countdata2$ctlMean > 0 | countdata2$uvbMean > 0))
-#explain | operator meaning OR in this context?
-nrow(countdata2)
+# explain: whats happening here, is for each ctlMean there is a TRUE/FALSE for 
+# if it is greater than 0. TRUE and FALSE can be summed: FALSE is typically 
+# considered a 0, and TRUE is considered to be 1. So when we call
+# `sum(countData2$ctlMean > 0)`, we're really asking, "how many genes have a 
+# mean above 0 in the control group?"
+sum(countData2$ctlMean > 0)
+sum(countData2$uvbMean > 0)
+
+nrow(countData2)
+countData2 <- subset(countData2, (countData2$ctlMean > 0 | countData2$uvbMean > 0))
+# explain: | operator meaning OR in this context?
+nrow(countData2)
 ```
 
 ```{r log2FC}
-countdata2$log2FC <- log2(countdata2$uvbMean / countdata2$ctlMean)
-sum(countdata2$log2FC > 2)
-sum(countdata2$log2FC < -2)
+# explain: what is fold change? why do we use log2 to quantify the fold change?
+countData2$log2FC <- log2(countData2$uvbMean / countData2$ctlMean)
+# again, reinforce that summing a logical vector gives you the number of 
+# occurences of TRUE.
+sum(countData2$log2FC > 2)
+sum(countData2$log2FC < -2)
 ```
 Make a new column to store this information in.
 
 ```{r outliers}
-countdata2$outlier <- FALSE
-countdata2$outlier[countdata2$log2FC > 2] <- TRUE
-countdata2$outlier[countdata2$log2FC < -2] <- TRUE
+countData2$outlier <- FALSE
+countData2$outlier[countData2$log2FC > 2] <- TRUE
+countData2$outlier[countData2$log2FC < -2] <- TRUE
 ```
 
 ```{r plot_outliers}
-plot(countdata2$ctlMean, countdata2$uvbMean, log="xy")
-points(countdata2$ctlMean[countdata2$outlier==TRUE], countdata2$uvbMean[countdata2$outlier==TRUE], col="red")
+plot(countData2$ctlMean, countData2$uvbMean, log="xy")
+points(countData2$ctlMean[countData2$outlier==TRUE], countData2$uvbMean[countData2$outlier==TRUE], col="red")
 ```
 
 ```{r ggplot_outliers}
-ggplot(countdata2, aes(x=ctlMean, y=uvbMean, colour=outlier)) + geom_point() + scale_x_log10() + scale_y_log10() 
+ggplot(countData2, aes(x=ctlMean, y=uvbMean, colour=outlier)) + geom_point() + scale_x_log10() + scale_y_log10() 
 ```
 
-## DESeq2 analysis
+# DESeq2 analysis
 
 DESeq2 is an R package for analysis of RNAseq data. It is available from [Bioconductor](http://www.bioconductor.org/). Bioconductor is a project to provide tools for analysing high-throughput genomic data including RNA-seq, ChIP-seq and arrays. You can explore Bioconductor packages [here](http://www.bioconductor.org/packages/release/BiocViews.html#___Software). 
 
 ```{r install_deseq2, eval=FALSE}
-#install and have a break to check everyone is up to date?
-#explain bioconductor?
+# install and have a break to check everyone is up to date?
+# explain bioconductor?
 source("http://bioconductor.org/biocLite.R")
 biocLite("DESeq2")
 ```
@@ -171,23 +181,24 @@ citation("DESeq2")
 It requires the count data to be in matrix form, and an additional dataframe describing the structure of the experiment.
 
 ```{r convert_to_matrix}
-# Convert to matrix
-class(countdata)
-countdata <- as.matrix(countdata)
-class(countdata)
-head(countdata)
-
-# construct coldata dataframe
-#three replicates of control and UVB.
-coldata <- data.frame(condition=c(rep("ctl", 3), rep("uvb",3)), row.names=colnames(countdata))
+# countData is currently a data.frame, but DESeq2 expects its input to be in 
+# matrix format, so we will convert our countData to a matrix.
+class(countData)
+countData <- as.matrix(countData)
+class(countData)
+head(countData)
+
+# construct colData dataframe
+# three replicates of control and UVB.
+colData <- data.frame(condition=c(rep("ctl", 3), rep("uvb",3)), row.names=colnames(countData))
 ```
 
 DESeq works on a particular type of object called a DESeqDataSet.
 
 ```{r make_deseqdataset}
-#introduce how DESeq2 works - type of object it works on etc
+# introduce how DESeq2 works - type of object it works on etc
 # instantiate the DESeqDataSet
-dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
+dds <- DESeqDataSetFromMatrix(countData=countData, colData=colData, design=~condition)
 dds
 ```
 
@@ -201,24 +212,25 @@ dds <- DESeq(dds)
 res <- results(dds)
 head(res)
 table(res$padj<0.05)
-## Order by adjusted p-value
+# Order by adjusted p-value
 res <- res[order(res$padj), ]
 head(res)
 ```
 
 Combine DEseq results with the original counts data. Write significant results to a file.
 
 ```{r write_results}
-resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
-head(resdata)
-names(resdata)[1] <- "GeneID"
-head(resdata)
-
-sig <- subset(resdata, padj<0.05)
-write.table(sig, file="results/sig.txt", sep="\t") #tab delim data
+resData <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
+head(resData)
+names(resData)[1] <- "GeneID"
+head(resData)
+
+sig <- subset(resData, padj<0.05)
+dir.create("results")
+write.table(sig, file="results/sig.txt", sep="\t") # tab delim data
 ```
 
-## Data Visualization
+# Data Visualization
 
 We can also do some exploratory plotting of the data.
 
@@ -227,7 +239,7 @@ plotDispEsts(dds, main="Dispersion plot")
 ```
 
 ```{r plot_heatmaps}
-## Regularized log transformation for clustering/heatmaps, etc
+# Regularized log transformation for clustering/heatmaps, etc
 rld <- rlogTransformation(dds)
 plotPCA(rld)
 
@@ -239,7 +251,7 @@ dist(t(assay(rld)))
 as.matrix(dist(t(assay(rld))))
 sampleDists <- as.matrix(dist(t(assay(rld))))
 heatmap(sampleDists)
-## better heatmap with gplots
+# better heatmap with gplots
 library(gplots)
 heatmap.2(sampleDists)
 heatmap.2(sampleDists, col=colorpanel(64, "steelblue", "white"), key=FALSE, trace="none")
@@ -249,13 +261,13 @@ heatmap.2(sampleDists, col=colorpanel(64, "red", "white", "blue"), key=FALSE, tr
 ```
 
 ```{r plot_pval_hist}
-## Examine plot of p-values
+# Examine plot of p-values
 hist(res$pvalue, breaks=50, col="grey")
 ```
 
 
 ```{r MA_plot}
-#These are the plots that are most recognisable from papers
+# These are the plots that are most recognisable from papers
 # MA Plot
 par(pch=16)
 with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x"))
@@ -268,14 +280,14 @@ with(subset(res, padj<.05), textxy(baseMean, log2FoldChange, labs=Gene, cex=1, c
 
 ```{r volcano_plot}
 # Volcano plot
-## Set point character
+# Set point character
 par(pch=16)
 with(res, plot(log2FoldChange, -log10(pvalue), main="Volcano plot"))
 with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), col="red"))
 with(subset(res, abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), col="orange"))
 with(subset(res, padj<.05 & abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), col="green"))
-## Add legend
+# Add legend
 legend("topleft", legend=c("FDR<0.05", "|LFC|>1", "both"), pch=16, col=c("red","orange","green"))
-## Label points
+# Label points
 with(subset(res, padj<.05 & abs(log2FoldChange)>1), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=1))
 ```