From 5d13af80bbb92d14982b169a6c0ed428ddb6ea62 Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Wed, 23 Jul 2014 12:06:24 +1000
Subject: [PATCH 01/12] Expanded on motivation for using `gsub` to strip extra
 information on column names

---
 analysis.Rmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/analysis.Rmd b/analysis.Rmd
index 3edbf0e52..ee5f39fdd 100644
--- a/analysis.Rmd
+++ b/analysis.Rmd
@@ -50,7 +50,7 @@ paste0("ctl", 1:3)
 c(paste0("ctl", 1:4), paste0("uvb", 1:5))
 ```
 
-Using `gsub` is a more reproducible way to do this.
+An easier way to do this, especially for files with many columns, is to use the `gsub` command to strip out the extra information. This is also more robust to introduced errors, for example if the column order changes at some point in the future.
 ```{r rename_cols}
 ## Using gsub -- reproducible
 ?gsub

From 5a01412a88cdabc53d52a1215687329c572539f3 Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Wed, 23 Jul 2014 12:07:56 +1000
Subject: [PATCH 02/12] removed magic number, replacing with variable that
 stores the gene with the maximum number of aligning reads across all samples.

---
 analysis.Rmd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/analysis.Rmd b/analysis.Rmd
index ee5f39fdd..8a9b7b853 100644
--- a/analysis.Rmd
+++ b/analysis.Rmd
@@ -64,8 +64,8 @@ Find the gene with the highest expression in any sample. Extract  the expression
 
 ```{r exercise_1}
 max(apply(countdata, 1, max)) #max expression is 7013
-which.max(apply(countdata, 1, max)) #gene is EEF1A1P9
-countdata[13514, ] #get other sample data - max is in uvb1
+top.gene <- which.max(apply(countdata, 1, max)) #gene is EEF1A1P9
+countdata[top.gene, ] #get other sample data - max is in uvb1
 
 #this is a pseudogene - maybe an artefact of only aligning reads to a single chromosome?
 ```

From 0244f5001cd016a369fe114238e100f5fe3184d3 Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Wed, 23 Jul 2014 14:16:02 +1000
Subject: [PATCH 03/12] Introduced the `rowMeans` function as a shortcut, while
 reinforcing the concept of the `apply` family of functions.

---
 analysis.Rmd | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/analysis.Rmd b/analysis.Rmd
index 8a9b7b853..54da9ce3e 100644
--- a/analysis.Rmd
+++ b/analysis.Rmd
@@ -84,11 +84,13 @@ colnames(countdata2)
 grep("ctl", colnames(countdata2))
 ctlCols <- grep("ctl", colnames(countdata2))
 head(countdata2[,ctlCols])
-countdata2$ctlMean <- apply(countdata2[, ctlCols], 1, mean)
+apply(countdata2[, ctlCols], 1, mean) 
+# here we'll use rowMeans instead, its a convenient shortcut, and also faster!
+countdata2$ctlMean <- rowMeans(countdata2[, ctlCols])
 
 #same for uvb
 uvbCols <- grep("uvb", colnames(countdata2))
-countdata2$uvbMean <- apply(countdata2[, uvbCols], 1, mean)
+countdata2$uvbMean <- rowMeans(countdata2[, uvbCols]
 ```
 
 Plot the mean expression of each gene in control against the UVB sample mean. Look for outliers.

From 38572c32f2e294c462b6ba0b309633f19168adef Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Wed, 23 Jul 2014 15:43:19 +1000
Subject: [PATCH 04/12] Added required call to "dir.create" so that the Rmd
 file can be knit'd (how do I verb knitr?)

---
 analysis.Rmd | 1 +
 1 file changed, 1 insertion(+)

diff --git a/analysis.Rmd b/analysis.Rmd
index 54da9ce3e..1c8022b12 100644
--- a/analysis.Rmd
+++ b/analysis.Rmd
@@ -217,6 +217,7 @@ names(resdata)[1] <- "GeneID"
 head(resdata)
 
 sig <- subset(resdata, padj<0.05)
+dir.create("results")
 write.table(sig, file="results/sig.txt", sep="\t") #tab delim data
 ```
 

From 759651126ab9f40775190e4422a544f296d9f7e0 Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Wed, 23 Jul 2014 15:56:03 +1000
Subject: [PATCH 05/12] Reconstructed count data from previously knit'd md
 file.

---
 data/counts.txt | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 data/counts.txt

diff --git a/data/counts.txt b/data/counts.txt
new file mode 100644
index 000000000..6f0f26a7c
--- /dev/null
+++ b/data/counts.txt
@@ -0,0 +1,7 @@
+Chr	Start	End	Strand	Length	ctl1.fastq_tophat.accepted_hits.bam	ctl2.fastq_tophat.accepted_hits.bam	ctl3.fastq_tophat.accepted_hits.bam	uvb1.fastq_tophat.accepted_hits.bam	uvb2.fastq_tophat.accepted_hits.bam	uvb3.fastq_tophat.accepted_hits.bam
+DDX11L1	1;1;1;1	11869;12595;12975;13221	12227;12721;13052;14412	+;+;+;+	1756	0	0	0	0	0	0
+WASH7P	1;1;1;1;1;1;1;1;1;1;1;1;1	14363;14970;15796;16607;16854;17233;17498;17602;17915;18268;24734;29321;29534	14829;15038;15947;16765;17055;17368;17504;17742;18061;18379;24891;29370;29806	-;-;-;-;-;-;-;-;-;-;-;-;-	2073	0	0	0	0	0	0
+MIR1302-10	1;1;1	29554;30267;30976	30039;30667;31109	+;+;+	1021	0	0	0	0	0	0
+FAM138A	1;1;1	34554;35245;35721	35174;35481;36081	-;-;-	1219	0	0	0	0	0	0
+OR4G4P	1;1	52473;54830	53312;54936	+;+	947	0	0	0	0	0	0
+OR4G11P	1	62948	63887	+	940	0	0	0	0	0	0

From 025540e446876a2f2fd9074a81fd9ee988ee2612 Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Wed, 23 Jul 2014 16:06:41 +1000
Subject: [PATCH 06/12] Spoofed some differential RNAseq count data

---
 data/counts.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/data/counts.txt b/data/counts.txt
index 6f0f26a7c..cfbc57ad1 100644
--- a/data/counts.txt
+++ b/data/counts.txt
@@ -1,7 +1,7 @@
 Chr	Start	End	Strand	Length	ctl1.fastq_tophat.accepted_hits.bam	ctl2.fastq_tophat.accepted_hits.bam	ctl3.fastq_tophat.accepted_hits.bam	uvb1.fastq_tophat.accepted_hits.bam	uvb2.fastq_tophat.accepted_hits.bam	uvb3.fastq_tophat.accepted_hits.bam
-DDX11L1	1;1;1;1	11869;12595;12975;13221	12227;12721;13052;14412	+;+;+;+	1756	0	0	0	0	0	0
-WASH7P	1;1;1;1;1;1;1;1;1;1;1;1;1	14363;14970;15796;16607;16854;17233;17498;17602;17915;18268;24734;29321;29534	14829;15038;15947;16765;17055;17368;17504;17742;18061;18379;24891;29370;29806	-;-;-;-;-;-;-;-;-;-;-;-;-	2073	0	0	0	0	0	0
-MIR1302-10	1;1;1	29554;30267;30976	30039;30667;31109	+;+;+	1021	0	0	0	0	0	0
-FAM138A	1;1;1	34554;35245;35721	35174;35481;36081	-;-;-	1219	0	0	0	0	0	0
-OR4G4P	1;1	52473;54830	53312;54936	+;+	947	0	0	0	0	0	0
-OR4G11P	1	62948	63887	+	940	0	0	0	0	0	0
+DDX11L1	1;1;1;1	11869;12595;12975;13221	12227;12721;13052;14412	+;+;+;+	1756	337	322	325	131	158	136
+WASH7P	1;1;1;1;1;1;1;1;1;1;1;1;1	14363;14970;15796;16607;16854;17233;17498;17602;17915;18268;24734;29321;29534	14829;15038;15947;16765;17055;17368;17504;17742;18061;18379;24891;29370;29806	-;-;-;-;-;-;-;-;-;-;-;-;-	2073	319	377	299	213	194	203
+MIR1302-10	1;1;1	29554;30267;30976	30039;30667;31109	+;+;+	1021	313	348	328	250	247	263
+FAM138A	1;1;1	34554;35245;35721	35174;35481;36081	-;-;-	1219	321	333	340	253	234	276
+OR4G4P	1;1	52473;54830	53312;54936	+;+	947	322	311	364	378	315	405
+OR4G11P	1	62948	63887	+	940	343	339	335	418	450	437

From 5dec78c3c97531a1fae9f61723251077b06b10ce Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Wed, 23 Jul 2014 16:14:23 +1000
Subject: [PATCH 07/12] Changed some variable names to camelCase for
 consistency with rest of the naming scheme

---
 analysis.Rmd | 106 +++++++++++++++++++++++++--------------------------
 1 file changed, 53 insertions(+), 53 deletions(-)

diff --git a/analysis.Rmd b/analysis.Rmd
index 1c8022b12..3eaac0910 100644
--- a/analysis.Rmd
+++ b/analysis.Rmd
@@ -21,20 +21,20 @@ Import the data as a `data.frame` and examine it. The data is stored in a text f
 
 ```{r data_input}
 ## Filename with output from featureCounts
-countfile <- "data/counts.txt"
+countFile <- "data/counts.txt"
 ## Read in the data
-countdata <- read.table(countfile, header=TRUE, row.names=1)
-head(countdata)
-colnames(countdata)
-class(countdata)
+countData <- read.table(countFile, header=TRUE, row.names=1)
+head(countData)
+colnames(countData)
+class(countData)
 ```
 It contains information about genes (one gene per row) with the gene positions in the first five columns and then information about the number of reads aligning to the gene in each experimental sample. We don't need the information on gene position, so we can remove it from the data frame.
 
 ```{r}
 # Remove first five columns (chr, start, end, strand, length)
-countdata <- countdata[ ,-(1:5)]
-head(countdata)
-colnames(countdata)
+countData <- countData[ ,-(1:5)]
+head(countData)
+colnames(countData)
 ```
 
 We can rename the columns to something a bit more readable.
@@ -54,18 +54,18 @@ An easier way to do this, especially for files with many columns, is to use the
 ```{r rename_cols}
 ## Using gsub -- reproducible
 ?gsub
-gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countdata))
-colnames(countdata) <- gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countdata))
-head(countdata)
+gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countData))
+colnames(countData) <- gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countData))
+head(countData)
 ```
 
 ### Exercise 1
 Find the gene with the highest expression in any sample. Extract  the expression data for this gene for all samples. In which sample does it have the highest expression? What is the function of the gene? Can you suggest why this is the top expressed gene?
 
 ```{r exercise_1}
-max(apply(countdata, 1, max)) #max expression is 7013
-top.gene <- which.max(apply(countdata, 1, max)) #gene is EEF1A1P9
-countdata[top.gene, ] #get other sample data - max is in uvb1
+max(apply(countData, 1, max)) #max expression is 7013
+topGene <- which.max(apply(countData, 1, max)) #gene is EEF1A1P9
+countData[topGene, ] #get other sample data - max is in uvb1
 
 #this is a pseudogene - maybe an artefact of only aligning reads to a single chromosome?
 ```
@@ -77,31 +77,31 @@ We can investigate this data a bit more using some of the basic R functions befo
 We will calculate the mean for each gene for each condition. First make a copy of the data, because we'll need it later. We will work on the copy.
 
 ```{r get_means}
-countdata2 <- countdata
+countData2 <- countData
 
 #get Control columns
-colnames(countdata2)
-grep("ctl", colnames(countdata2))
-ctlCols <- grep("ctl", colnames(countdata2))
-head(countdata2[,ctlCols])
-apply(countdata2[, ctlCols], 1, mean) 
+colnames(countData2)
+grep("ctl", colnames(countData2))
+ctlCols <- grep("ctl", colnames(countData2))
+head(countData2[,ctlCols])
+apply(countData2[, ctlCols], 1, mean) 
 # here we'll use rowMeans instead, its a convenient shortcut, and also faster!
-countdata2$ctlMean <- rowMeans(countdata2[, ctlCols])
+countData2$ctlMean <- rowMeans(countData2[, ctlCols])
 
 #same for uvb
-uvbCols <- grep("uvb", colnames(countdata2))
-countdata2$uvbMean <- rowMeans(countdata2[, uvbCols]
+uvbCols <- grep("uvb", colnames(countData2))
+countData2$uvbMean <- rowMeans(countData2[, uvbCols])
 ```
 
 Plot the mean expression of each gene in control against the UVB sample mean. Look for outliers.
 
 ```{r plot_means}
-plot(countdata2$ctlMean, countdata2$uvbMean)
+plot(countData2$ctlMean, countData2$uvbMean)
 ```
 
 ```{r ggplot_means}
 library(ggplot2)
-ggplot(countdata2, aes(x=ctlMean, y=uvbMean)) + geom_point()
+ggplot(countData2, aes(x=ctlMean, y=uvbMean)) + geom_point()
 ```
 
 ### Exercise 2
@@ -110,11 +110,11 @@ How could you make this plot more informative and look more professional? Hint:
 `help(par)` will give you information on lots of graphical parameters that can be set. Help for ggplot2 can be found [here](http://docs.ggplot2.org/current/).
 
 ```{r exercise2_1}
-plot(countdata2$ctlMean, countdata2$uvbMean, log="xy")
+plot(countData2$ctlMean, countData2$uvbMean, log="xy")
 ```
 
 ```{r exercise2_2}
-ggplot(countdata2, aes(x=ctlMean, y=uvbMean)) + geom_point() + scale_x_log10() + scale_y_log10() + theme_bw()
+ggplot(countData2, aes(x=ctlMean, y=uvbMean)) + geom_point() + scale_x_log10() + scale_y_log10() + theme_bw()
 ```
 There are lots more options you can use to alter the appearance of these plots.
 
@@ -123,35 +123,35 @@ There are lots more options you can use to alter the appearance of these plots.
 We can find candidate differentially expressed genes by looking for genes with a large change between control and UVB samples. A common threshold used is log2 fold change more than 2 fold. We will calculate log2 fold change for all the genes and colour the genes with log2 fold change more than 2 fold on the plot.
 
 ```{r remove_unexpressed}
-sum(countdata2$ctlMean > 0)
-sum(countdata2$uvbMean > 0)
+sum(countData2$ctlMean > 0)
+sum(countData2$uvbMean > 0)
 
-nrow(countdata2)
-countdata2 <- subset(countdata2, (countdata2$ctlMean > 0 | countdata2$uvbMean > 0))
+nrow(countData2)
+countData2 <- subset(countData2, (countData2$ctlMean > 0 | countData2$uvbMean > 0))
 #explain | operator meaning OR in this context?
-nrow(countdata2)
+nrow(countData2)
 ```
 
 ```{r log2FC}
-countdata2$log2FC <- log2(countdata2$uvbMean / countdata2$ctlMean)
-sum(countdata2$log2FC > 2)
-sum(countdata2$log2FC < -2)
+countData2$log2FC <- log2(countData2$uvbMean / countData2$ctlMean)
+sum(countData2$log2FC > 2)
+sum(countData2$log2FC < -2)
 ```
 Make a new column to store this information in.
 
 ```{r outliers}
-countdata2$outlier <- FALSE
-countdata2$outlier[countdata2$log2FC > 2] <- TRUE
-countdata2$outlier[countdata2$log2FC < -2] <- TRUE
+countData2$outlier <- FALSE
+countData2$outlier[countData2$log2FC > 2] <- TRUE
+countData2$outlier[countData2$log2FC < -2] <- TRUE
 ```
 
 ```{r plot_outliers}
-plot(countdata2$ctlMean, countdata2$uvbMean, log="xy")
-points(countdata2$ctlMean[countdata2$outlier==TRUE], countdata2$uvbMean[countdata2$outlier==TRUE], col="red")
+plot(countData2$ctlMean, countData2$uvbMean, log="xy")
+points(countData2$ctlMean[countData2$outlier==TRUE], countData2$uvbMean[countData2$outlier==TRUE], col="red")
 ```
 
 ```{r ggplot_outliers}
-ggplot(countdata2, aes(x=ctlMean, y=uvbMean, colour=outlier)) + geom_point() + scale_x_log10() + scale_y_log10() 
+ggplot(countData2, aes(x=ctlMean, y=uvbMean, colour=outlier)) + geom_point() + scale_x_log10() + scale_y_log10() 
 ```
 
 ## DESeq2 analysis
@@ -174,14 +174,14 @@ It requires the count data to be in matrix form, and an additional dataframe des
 
 ```{r convert_to_matrix}
 # Convert to matrix
-class(countdata)
-countdata <- as.matrix(countdata)
-class(countdata)
-head(countdata)
+class(countData)
+countData <- as.matrix(countData)
+class(countData)
+head(countData)
 
-# construct coldata dataframe
+# construct colData dataframe
 #three replicates of control and UVB.
-coldata <- data.frame(condition=c(rep("ctl", 3), rep("uvb",3)), row.names=colnames(countdata))
+colData <- data.frame(condition=c(rep("ctl", 3), rep("uvb",3)), row.names=colnames(countData))
 ```
 
 DESeq works on a particular type of object called a DESeqDataSet.
@@ -189,7 +189,7 @@ DESeq works on a particular type of object called a DESeqDataSet.
 ```{r make_deseqdataset}
 #introduce how DESeq2 works - type of object it works on etc
 # instantiate the DESeqDataSet
-dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
+dds <- DESeqDataSetFromMatrix(countData=countData, colData=colData, design=~condition)
 dds
 ```
 
@@ -211,12 +211,12 @@ head(res)
 Combine DEseq results with the original counts data. Write significant results to a file.
 
 ```{r write_results}
-resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
-head(resdata)
-names(resdata)[1] <- "GeneID"
-head(resdata)
+resData <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
+head(resData)
+names(resData)[1] <- "GeneID"
+head(resData)
 
-sig <- subset(resdata, padj<0.05)
+sig <- subset(resData, padj<0.05)
 dir.create("results")
 write.table(sig, file="results/sig.txt", sep="\t") #tab delim data
 ```

From 89fafde3a87fc19ff16121c1be08fc6e287cb228 Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Wed, 23 Jul 2014 16:15:05 +1000
Subject: [PATCH 08/12] knit'd md file now reflects Rmd file

---
 analysis.md | 560 +++++++++++++++++++++++++++-------------------------
 1 file changed, 288 insertions(+), 272 deletions(-)

diff --git a/analysis.md b/analysis.md
index e83b463f1..f01d234cf 100644
--- a/analysis.md
+++ b/analysis.md
@@ -18,10 +18,10 @@ Import the data as a `data.frame` and examine it. The data is stored in a text f
 
 ```r
 ## Filename with output from featureCounts
-countfile <- "data/counts.txt"
+countFile <- "data/counts.txt"
 ## Read in the data
-countdata <- read.table(countfile, header=TRUE, row.names=1)
-head(countdata)
+countData <- read.table(countFile, header=TRUE, row.names=1)
+head(countData)
 ```
 
 ```
@@ -54,51 +54,51 @@ head(countdata)
 ## OR4G4P                           +;+    947
 ## OR4G11P                            +    940
 ##            ctl1.fastq_tophat.accepted_hits.bam
-## DDX11L1                                      0
-## WASH7P                                       0
-## MIR1302-10                                   0
-## FAM138A                                      0
-## OR4G4P                                       0
-## OR4G11P                                      0
+## DDX11L1                                    337
+## WASH7P                                     319
+## MIR1302-10                                 313
+## FAM138A                                    321
+## OR4G4P                                     322
+## OR4G11P                                    343
 ##            ctl2.fastq_tophat.accepted_hits.bam
-## DDX11L1                                      0
-## WASH7P                                       0
-## MIR1302-10                                   0
-## FAM138A                                      0
-## OR4G4P                                       0
-## OR4G11P                                      0
+## DDX11L1                                    322
+## WASH7P                                     377
+## MIR1302-10                                 348
+## FAM138A                                    333
+## OR4G4P                                     311
+## OR4G11P                                    339
 ##            ctl3.fastq_tophat.accepted_hits.bam
-## DDX11L1                                      0
-## WASH7P                                       0
-## MIR1302-10                                   0
-## FAM138A                                      0
-## OR4G4P                                       0
-## OR4G11P                                      0
+## DDX11L1                                    325
+## WASH7P                                     299
+## MIR1302-10                                 328
+## FAM138A                                    340
+## OR4G4P                                     364
+## OR4G11P                                    335
 ##            uvb1.fastq_tophat.accepted_hits.bam
-## DDX11L1                                      0
-## WASH7P                                       0
-## MIR1302-10                                   0
-## FAM138A                                      0
-## OR4G4P                                       0
-## OR4G11P                                      0
+## DDX11L1                                    131
+## WASH7P                                     213
+## MIR1302-10                                 250
+## FAM138A                                    253
+## OR4G4P                                     378
+## OR4G11P                                    418
 ##            uvb2.fastq_tophat.accepted_hits.bam
-## DDX11L1                                      0
-## WASH7P                                       0
-## MIR1302-10                                   0
-## FAM138A                                      0
-## OR4G4P                                       0
-## OR4G11P                                      0
+## DDX11L1                                    158
+## WASH7P                                     194
+## MIR1302-10                                 247
+## FAM138A                                    234
+## OR4G4P                                     315
+## OR4G11P                                    450
 ##            uvb3.fastq_tophat.accepted_hits.bam
-## DDX11L1                                      0
-## WASH7P                                       0
-## MIR1302-10                                   0
-## FAM138A                                      0
-## OR4G4P                                       0
-## OR4G11P                                      0
+## DDX11L1                                    136
+## WASH7P                                     203
+## MIR1302-10                                 263
+## FAM138A                                    276
+## OR4G4P                                     405
+## OR4G11P                                    437
 ```
 
 ```r
-colnames(countdata)
+colnames(countData)
 ```
 
 ```
@@ -116,7 +116,7 @@ colnames(countdata)
 ```
 
 ```r
-class(countdata)
+class(countData)
 ```
 
 ```
@@ -127,57 +127,57 @@ It contains information about genes (one gene per row) with the gene positions i
 
 ```r
 # Remove first five columns (chr, start, end, strand, length)
-countdata <- countdata[ ,-(1:5)]
-head(countdata)
+countData <- countData[ ,-(1:5)]
+head(countData)
 ```
 
 ```
 ##            ctl1.fastq_tophat.accepted_hits.bam
-## DDX11L1                                      0
-## WASH7P                                       0
-## MIR1302-10                                   0
-## FAM138A                                      0
-## OR4G4P                                       0
-## OR4G11P                                      0
+## DDX11L1                                    337
+## WASH7P                                     319
+## MIR1302-10                                 313
+## FAM138A                                    321
+## OR4G4P                                     322
+## OR4G11P                                    343
 ##            ctl2.fastq_tophat.accepted_hits.bam
-## DDX11L1                                      0
-## WASH7P                                       0
-## MIR1302-10                                   0
-## FAM138A                                      0
-## OR4G4P                                       0
-## OR4G11P                                      0
+## DDX11L1                                    322
+## WASH7P                                     377
+## MIR1302-10                                 348
+## FAM138A                                    333
+## OR4G4P                                     311
+## OR4G11P                                    339
 ##            ctl3.fastq_tophat.accepted_hits.bam
-## DDX11L1                                      0
-## WASH7P                                       0
-## MIR1302-10                                   0
-## FAM138A                                      0
-## OR4G4P                                       0
-## OR4G11P                                      0
+## DDX11L1                                    325
+## WASH7P                                     299
+## MIR1302-10                                 328
+## FAM138A                                    340
+## OR4G4P                                     364
+## OR4G11P                                    335
 ##            uvb1.fastq_tophat.accepted_hits.bam
-## DDX11L1                                      0
-## WASH7P                                       0
-## MIR1302-10                                   0
-## FAM138A                                      0
-## OR4G4P                                       0
-## OR4G11P                                      0
+## DDX11L1                                    131
+## WASH7P                                     213
+## MIR1302-10                                 250
+## FAM138A                                    253
+## OR4G4P                                     378
+## OR4G11P                                    418
 ##            uvb2.fastq_tophat.accepted_hits.bam
-## DDX11L1                                      0
-## WASH7P                                       0
-## MIR1302-10                                   0
-## FAM138A                                      0
-## OR4G4P                                       0
-## OR4G11P                                      0
+## DDX11L1                                    158
+## WASH7P                                     194
+## MIR1302-10                                 247
+## FAM138A                                    234
+## OR4G4P                                     315
+## OR4G11P                                    450
 ##            uvb3.fastq_tophat.accepted_hits.bam
-## DDX11L1                                      0
-## WASH7P                                       0
-## MIR1302-10                                   0
-## FAM138A                                      0
-## OR4G4P                                       0
-## OR4G11P                                      0
+## DDX11L1                                    136
+## WASH7P                                     203
+## MIR1302-10                                 263
+## FAM138A                                    276
+## OR4G4P                                     405
+## OR4G11P                                    437
 ```
 
 ```r
-colnames(countdata)
+colnames(countData)
 ```
 
 ```
@@ -203,12 +203,12 @@ paste0("ctl", 1:3)
 c(paste0("ctl", 1:4), paste0("uvb", 1:5))
 ```
 
-Using `gsub` is a more reproducible way to do this.
+An easier way to do this, especially for files with many columns, is to use the `gsub` command to strip out the extra information. This is also more robust to introduced errors, for example if the column order changes at some point in the future.
 
 ```r
 ## Using gsub -- reproducible
 ?gsub
-gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countdata))
+gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countData))
 ```
 
 ```
@@ -216,18 +216,18 @@ gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(count
 ```
 
 ```r
-colnames(countdata) <- gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countdata))
-head(countdata)
+colnames(countData) <- gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countData))
+head(countData)
 ```
 
 ```
 ##            ctl1 ctl2 ctl3 uvb1 uvb2 uvb3
-## DDX11L1       0    0    0    0    0    0
-## WASH7P        0    0    0    0    0    0
-## MIR1302-10    0    0    0    0    0    0
-## FAM138A       0    0    0    0    0    0
-## OR4G4P        0    0    0    0    0    0
-## OR4G11P       0    0    0    0    0    0
+## DDX11L1     337  322  325  131  158  136
+## WASH7P      319  377  299  213  194  203
+## MIR1302-10  313  348  328  250  247  263
+## FAM138A     321  333  340  253  234  276
+## OR4G4P      322  311  364  378  315  405
+## OR4G11P     343  339  335  418  450  437
 ```
 
 ### Exercise 1
@@ -235,29 +235,21 @@ Find the gene with the highest expression in any sample. Extract  the expression
 
 
 ```r
-max(apply(countdata, 1, max)) #max expression is 7013
+max(apply(countData, 1, max)) #max expression is 7013
 ```
 
 ```
-## [1] 7013
+## [1] 450
 ```
 
 ```r
-which.max(apply(countdata, 1, max)) #gene is EEF1A1P9
+topGene <- which.max(apply(countData, 1, max)) #gene is EEF1A1P9
+countData[topGene, ] #get other sample data - max is in uvb1
 ```
 
 ```
-## EEF1A1P9 
-##    13514
-```
-
-```r
-countdata[13514, ] #get other sample data - max is in uvb1
-```
-
-```
-##          ctl1 ctl2 ctl3 uvb1 uvb2 uvb3
-## EEF1A1P9 3570 3788 4345 7013 4217 3630
+##         ctl1 ctl2 ctl3 uvb1 uvb2 uvb3
+## OR4G11P  343  339  335  418  450  437
 ```
 
 ```r
@@ -272,10 +264,10 @@ We will calculate the mean for each gene for each condition. First make a copy o
 
 
 ```r
-countdata2 <- countdata
+countData2 <- countData
 
 #get Control columns
-colnames(countdata2)
+colnames(countData2)
 ```
 
 ```
@@ -283,7 +275,7 @@ colnames(countdata2)
 ```
 
 ```r
-grep("ctl", colnames(countdata2))
+grep("ctl", colnames(countData2))
 ```
 
 ```
@@ -291,33 +283,43 @@ grep("ctl", colnames(countdata2))
 ```
 
 ```r
-ctlCols <- grep("ctl", colnames(countdata2))
-head(countdata2[,ctlCols])
+ctlCols <- grep("ctl", colnames(countData2))
+head(countData2[,ctlCols])
 ```
 
 ```
 ##            ctl1 ctl2 ctl3
-## DDX11L1       0    0    0
-## WASH7P        0    0    0
-## MIR1302-10    0    0    0
-## FAM138A       0    0    0
-## OR4G4P        0    0    0
-## OR4G11P       0    0    0
+## DDX11L1     337  322  325
+## WASH7P      319  377  299
+## MIR1302-10  313  348  328
+## FAM138A     321  333  340
+## OR4G4P      322  311  364
+## OR4G11P     343  339  335
+```
+
+```r
+apply(countData2[, ctlCols], 1, mean) 
+```
+
+```
+##    DDX11L1     WASH7P MIR1302-10    FAM138A     OR4G4P    OR4G11P 
+##      328.0      331.7      329.7      331.3      332.3      339.0
 ```
 
 ```r
-countdata2$ctlMean <- apply(countdata2[, ctlCols], 1, mean)
+# here we'll use rowMeans instead, its a convenient shortcut, and also faster!
+countData2$ctlMean <- rowMeans(countData2[, ctlCols])
 
 #same for uvb
-uvbCols <- grep("uvb", colnames(countdata2))
-countdata2$uvbMean <- apply(countdata2[, uvbCols], 1, mean)
+uvbCols <- grep("uvb", colnames(countData2))
+countData2$uvbMean <- rowMeans(countData2[, uvbCols])
 ```
 
 Plot the mean expression of each gene in control against the UVB sample mean. Look for outliers.
 
 
 ```r
-plot(countdata2$ctlMean, countdata2$uvbMean)
+plot(countData2$ctlMean, countData2$uvbMean)
 ```
 
 ![plot of chunk plot_means](./analysis_files/figure-html/plot_means.png) 
@@ -325,7 +327,7 @@ plot(countdata2$ctlMean, countdata2$uvbMean)
 
 ```r
 library(ggplot2)
-ggplot(countdata2, aes(x=ctlMean, y=uvbMean)) + geom_point()
+ggplot(countData2, aes(x=ctlMean, y=uvbMean)) + geom_point()
 ```
 
 ![plot of chunk ggplot_means](./analysis_files/figure-html/ggplot_means.png) 
@@ -337,19 +339,14 @@ How could you make this plot more informative and look more professional? Hint:
 
 
 ```r
-plot(countdata2$ctlMean, countdata2$uvbMean, log="xy")
-```
-
-```
-## Warning: 56164 x values <= 0 omitted from logarithmic plot
-## Warning: 56110 y values <= 0 omitted from logarithmic plot
+plot(countData2$ctlMean, countData2$uvbMean, log="xy")
 ```
 
 ![plot of chunk exercise2_1](./analysis_files/figure-html/exercise2_1.png) 
 
 
 ```r
-ggplot(countdata2, aes(x=ctlMean, y=uvbMean)) + geom_point() + scale_x_log10() + scale_y_log10() + theme_bw()
+ggplot(countData2, aes(x=ctlMean, y=uvbMean)) + geom_point() + scale_x_log10() + scale_y_log10() + theme_bw()
 ```
 
 ![plot of chunk exercise2_2](./analysis_files/figure-html/exercise2_2.png) 
@@ -361,84 +358,76 @@ We can find candidate differentially expressed genes by looking for genes with a
 
 
 ```r
-sum(countdata2$ctlMean > 0)
+sum(countData2$ctlMean > 0)
 ```
 
 ```
-## [1] 474
+## [1] 6
 ```
 
 ```r
-sum(countdata2$uvbMean > 0)
+sum(countData2$uvbMean > 0)
 ```
 
 ```
-## [1] 528
+## [1] 6
 ```
 
 ```r
-nrow(countdata2)
+nrow(countData2)
 ```
 
 ```
-## [1] 56638
+## [1] 6
 ```
 
 ```r
-countdata2 <- subset(countdata2, (countdata2$ctlMean > 0 | countdata2$uvbMean > 0))
+countData2 <- subset(countData2, (countData2$ctlMean > 0 | countData2$uvbMean > 0))
 #explain | operator meaning OR in this context?
-nrow(countdata2)
+nrow(countData2)
 ```
 
 ```
-## [1] 565
+## [1] 6
 ```
 
 
 ```r
-countdata2$log2FC <- log2(countdata2$uvbMean / countdata2$ctlMean)
-sum(countdata2$log2FC > 2)
+countData2$log2FC <- log2(countData2$uvbMean / countData2$ctlMean)
+sum(countData2$log2FC > 2)
 ```
 
 ```
-## [1] 170
+## [1] 0
 ```
 
 ```r
-sum(countdata2$log2FC < -2)
+sum(countData2$log2FC < -2)
 ```
 
 ```
-## [1] 46
+## [1] 0
 ```
 Make a new column to store this information in.
 
 
 ```r
-countdata2$outlier <- FALSE
-countdata2$outlier[countdata2$log2FC > 2] <- TRUE
-countdata2$outlier[countdata2$log2FC < -2] <- TRUE
-```
-
-
-```r
-plot(countdata2$ctlMean, countdata2$uvbMean, log="xy")
+countData2$outlier <- FALSE
+countData2$outlier[countData2$log2FC > 2] <- TRUE
+countData2$outlier[countData2$log2FC < -2] <- TRUE
 ```
 
-```
-## Warning: 91 x values <= 0 omitted from logarithmic plot
-## Warning: 37 y values <= 0 omitted from logarithmic plot
-```
 
 ```r
-points(countdata2$ctlMean[countdata2$outlier==TRUE], countdata2$uvbMean[countdata2$outlier==TRUE], col="red")
+plot(countData2$ctlMean, countData2$uvbMean, log="xy")
+points(countData2$ctlMean[countData2$outlier==TRUE], countData2$uvbMean[countData2$outlier==TRUE], col="red")
 ```
 
 ![plot of chunk plot_outliers](./analysis_files/figure-html/plot_outliers.png) 
 
 
 ```r
-ggplot(countdata2, aes(x=ctlMean, y=uvbMean, colour=outlier)) + geom_point() + scale_x_log10() + scale_y_log10() 
+ggplot(countData2, aes(x=ctlMean, y=uvbMean, colour=outlier)) + geom_point() + scale_x_log10() + scale_y_log10() 
 ```
 
 ![plot of chunk ggplot_outliers](./analysis_files/figure-html/ggplot_outliers.png) 
@@ -492,10 +481,6 @@ library(DESeq2)
 ## Loading required package: RcppArmadillo
 ```
 
-```
-## Warning: package 'RcppArmadillo' was built under R version 3.1.1
-```
-
 ```r
 citation("DESeq2")
 ```
@@ -523,7 +508,7 @@ It requires the count data to be in matrix form, and an additional dataframe des
 
 ```r
 # Convert to matrix
-class(countdata)
+class(countData)
 ```
 
 ```
@@ -531,8 +516,8 @@ class(countdata)
 ```
 
 ```r
-countdata <- as.matrix(countdata)
-class(countdata)
+countData <- as.matrix(countData)
+class(countData)
 ```
 
 ```
@@ -540,23 +525,23 @@ class(countdata)
 ```
 
 ```r
-head(countdata)
+head(countData)
 ```
 
 ```
 ##            ctl1 ctl2 ctl3 uvb1 uvb2 uvb3
-## DDX11L1       0    0    0    0    0    0
-## WASH7P        0    0    0    0    0    0
-## MIR1302-10    0    0    0    0    0    0
-## FAM138A       0    0    0    0    0    0
-## OR4G4P        0    0    0    0    0    0
-## OR4G11P       0    0    0    0    0    0
+## DDX11L1     337  322  325  131  158  136
+## WASH7P      319  377  299  213  194  203
+## MIR1302-10  313  348  328  250  247  263
+## FAM138A     321  333  340  253  234  276
+## OR4G4P      322  311  364  378  315  405
+## OR4G11P     343  339  335  418  450  437
 ```
 
 ```r
-# construct coldata dataframe
+# construct colData dataframe
 #three replicates of control and UVB.
-coldata <- data.frame(condition=c(rep("ctl", 3), rep("uvb",3)), row.names=colnames(countdata))
+colData <- data.frame(condition=c(rep("ctl", 3), rep("uvb",3)), row.names=colnames(countData))
 ```
 
 DESeq works on a particular type of object called a DESeqDataSet.
@@ -565,16 +550,24 @@ DESeq works on a particular type of object called a DESeqDataSet.
 ```r
 #introduce how DESeq2 works - type of object it works on etc
 # instantiate the DESeqDataSet
-dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
+dds <- DESeqDataSetFromMatrix(countData=countData, colData=colData, design=~condition)
+```
+
+```
+## Warning: some variables in design formula are characters, converting to
+## factors
+```
+
+```r
 dds
 ```
 
 ```
 ## class: DESeqDataSet 
-## dim: 56638 6 
+## dim: 6 6 
 ## exptData(0):
 ## assays(1): counts
-## rownames(56638): DDX11L1 WASH7P ... MT-TT MT-TP
+## rownames(6): DDX11L1 WASH7P ... OR4G4P OR4G11P
 ## rowData metadata column names(0):
 ## colnames(6): ctl1 ctl2 ... uvb2 uvb3
 ## colData names(1): condition
@@ -593,6 +586,21 @@ dds <- DESeq(dds)
 ## estimating dispersions
 ## gene-wise dispersion estimates
 ## mean-dispersion relationship
+```
+
+```
+## Warning: the parametric fit of dispersion estimates over the mean of counts
+## failed, which occurs when the trend is not well captured by the
+## function y = a/x + b. A local regression fit is automatically performed,
+## and the analysis can continue. You can specify fitType='local' or 'mean'
+## to avoid this message if re-running the same data.
+## When using local regression fit, the user should examine plotDispEsts(dds)
+## to make sure the fitted line is not sharply curving up or down based on
+## the position of individual points.
+## Warning: Estimated rdf < 1.0; not estimating variance
+```
+
+```
 ## final dispersion estimates
 ## fitting model and testing
 ```
@@ -609,20 +617,20 @@ head(res)
 ## DataFrame with 6 rows and 6 columns
 ##             baseMean log2FoldChange     lfcSE      stat    pvalue
 ##            <numeric>      <numeric> <numeric> <numeric> <numeric>
-## DDX11L1            0             NA        NA        NA        NA
-## WASH7P             0             NA        NA        NA        NA
-## MIR1302-10         0             NA        NA        NA        NA
-## FAM138A            0             NA        NA        NA        NA
-## OR4G4P             0             NA        NA        NA        NA
-## OR4G11P            0             NA        NA        NA        NA
+## DDX11L1        225.4      -0.800662   0.11188  -7.15620 8.295e-13
+## WASH7P         261.6      -0.316571   0.10258  -3.08609 2.028e-03
+## MIR1302-10     289.2      -0.001925   0.09442  -0.02039 9.837e-01
+## FAM138A        290.4      -0.005237   0.09441  -0.05546 9.558e-01
+## OR4G4P         354.2       0.498655   0.09741   5.11892 3.073e-07
+## OR4G11P        397.6       0.725214   0.07570   9.58062 9.647e-22
 ##                 padj
 ##            <numeric>
-## DDX11L1           NA
-## WASH7P            NA
-## MIR1302-10        NA
-## FAM138A           NA
-## OR4G4P            NA
-## OR4G11P           NA
+## DDX11L1    2.488e-12
+## WASH7P     3.042e-03
+## MIR1302-10 9.837e-01
+## FAM138A    9.837e-01
+## OR4G4P     6.146e-07
+## OR4G11P    5.788e-21
 ```
 
 ```r
@@ -632,7 +640,7 @@ table(res$padj<0.05)
 ```
 ## 
 ## FALSE  TRUE 
-##   337    50
+##     2     4
 ```
 
 ```r
@@ -645,65 +653,81 @@ head(res)
 ## log2 fold change (MAP): condition uvb vs ctl 
 ## Wald test p-value: condition uvb vs ctl 
 ## DataFrame with 6 rows and 6 columns
-##         baseMean log2FoldChange     lfcSE      stat    pvalue      padj
-##        <numeric>      <numeric> <numeric> <numeric> <numeric> <numeric>
-## ANXA3     312.32          3.910    0.3475    11.251 2.287e-29 8.849e-27
-## SFRP2    1711.27         -3.356    0.3407    -9.852 6.735e-23 1.303e-20
-## MT2P1     167.36          4.067    0.4582     8.878 6.834e-19 8.816e-17
-## IL8        31.70          5.976    0.9456     6.320 2.620e-10 2.535e-08
-## PTPN13    358.34         -1.386    0.2362    -5.869 4.388e-09 3.396e-07
-## CXCL1      42.47          3.691    0.6448     5.724 1.040e-08 6.705e-07
+##             baseMean log2FoldChange     lfcSE      stat    pvalue
+##            <numeric>      <numeric> <numeric> <numeric> <numeric>
+## OR4G11P        397.6       0.725214   0.07570   9.58062 9.647e-22
+## DDX11L1        225.4      -0.800662   0.11188  -7.15620 8.295e-13
+## OR4G4P         354.2       0.498655   0.09741   5.11892 3.073e-07
+## WASH7P         261.6      -0.316571   0.10258  -3.08609 2.028e-03
+## MIR1302-10     289.2      -0.001925   0.09442  -0.02039 9.837e-01
+## FAM138A        290.4      -0.005237   0.09441  -0.05546 9.558e-01
+##                 padj
+##            <numeric>
+## OR4G11P    5.788e-21
+## DDX11L1    2.488e-12
+## OR4G4P     6.146e-07
+## WASH7P     3.042e-03
+## MIR1302-10 9.837e-01
+## FAM138A    9.837e-01
 ```
 
 Combine DEseq results with the original counts data. Write significant results to a file.
 
 
 ```r
-resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
-head(resdata)
+resData <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
+head(resData)
 ```
 
 ```
-##   Row.names baseMean log2FoldChange  lfcSE   stat    pvalue      padj
-## 1     ANXA3   312.32          3.910 0.3475 11.251 2.287e-29 8.849e-27
-## 2     SFRP2  1711.27         -3.356 0.3407 -9.852 6.735e-23 1.303e-20
-## 3     MT2P1   167.36          4.067 0.4582  8.878 6.834e-19 8.816e-17
-## 4       IL8    31.70          5.976 0.9456  6.320 2.620e-10 2.535e-08
-## 5    PTPN13   358.34         -1.386 0.2362 -5.869 4.388e-09 3.396e-07
-## 6     CXCL1    42.47          3.691 0.6448  5.724 1.040e-08 6.705e-07
-##      ctl1     ctl2    ctl3   uvb1   uvb2   uvb3
-## 1   26.33   35.131   48.83 378.95 783.44 601.27
-## 2 1847.65 4865.598 2683.82 382.63 325.05 162.88
-## 3   19.02    8.107   23.63 438.34 373.97 141.08
-## 4    0.00    0.000    0.00  66.12  44.18  79.93
-## 5  485.69  535.067  538.65 203.25 205.13 182.26
-## 6   11.70    2.702    0.00  89.99  50.49  99.91
+##    Row.names baseMean log2FoldChange   lfcSE     stat    pvalue      padj
+## 1    OR4G11P    397.6       0.725214 0.07570  9.58062 9.647e-22 5.788e-21
+## 2    DDX11L1    225.4      -0.800662 0.11188 -7.15620 8.295e-13 2.488e-12
+## 3     OR4G4P    354.2       0.498655 0.09741  5.11892 3.073e-07 6.146e-07
+## 4     WASH7P    261.6      -0.316571 0.10258 -3.08609 2.028e-03 3.042e-03
+## 5 MIR1302-10    289.2      -0.001925 0.09442 -0.02039 9.837e-01 9.837e-01
+## 6    FAM138A    290.4      -0.005237 0.09441 -0.05546 9.558e-01 9.837e-01
+##    ctl1  ctl2  ctl3  uvb1  uvb2  uvb3
+## 1 313.1 288.2 292.5 480.9 541.6 469.3
+## 2 307.6 273.7 283.8 150.7 190.2 146.1
+## 3 293.9 264.4 317.9 434.9 379.1 435.0
+## 4 291.2 320.5 261.1 245.1 233.5 218.0
+## 5 285.7 295.8 286.4 287.6 297.3 282.5
+## 6 293.0 283.1 296.9 291.1 281.6 296.4
 ```
 
 ```r
-names(resdata)[1] <- "GeneID"
-head(resdata)
+names(resData)[1] <- "GeneID"
+head(resData)
 ```
 
 ```
-##   GeneID baseMean log2FoldChange  lfcSE   stat    pvalue      padj    ctl1
-## 1  ANXA3   312.32          3.910 0.3475 11.251 2.287e-29 8.849e-27   26.33
-## 2  SFRP2  1711.27         -3.356 0.3407 -9.852 6.735e-23 1.303e-20 1847.65
-## 3  MT2P1   167.36          4.067 0.4582  8.878 6.834e-19 8.816e-17   19.02
-## 4    IL8    31.70          5.976 0.9456  6.320 2.620e-10 2.535e-08    0.00
-## 5 PTPN13   358.34         -1.386 0.2362 -5.869 4.388e-09 3.396e-07  485.69
-## 6  CXCL1    42.47          3.691 0.6448  5.724 1.040e-08 6.705e-07   11.70
-##       ctl2    ctl3   uvb1   uvb2   uvb3
-## 1   35.131   48.83 378.95 783.44 601.27
-## 2 4865.598 2683.82 382.63 325.05 162.88
-## 3    8.107   23.63 438.34 373.97 141.08
-## 4    0.000    0.00  66.12  44.18  79.93
-## 5  535.067  538.65 203.25 205.13 182.26
-## 6    2.702    0.00  89.99  50.49  99.91
+##       GeneID baseMean log2FoldChange   lfcSE     stat    pvalue      padj
+## 1    OR4G11P    397.6       0.725214 0.07570  9.58062 9.647e-22 5.788e-21
+## 2    DDX11L1    225.4      -0.800662 0.11188 -7.15620 8.295e-13 2.488e-12
+## 3     OR4G4P    354.2       0.498655 0.09741  5.11892 3.073e-07 6.146e-07
+## 4     WASH7P    261.6      -0.316571 0.10258 -3.08609 2.028e-03 3.042e-03
+## 5 MIR1302-10    289.2      -0.001925 0.09442 -0.02039 9.837e-01 9.837e-01
+## 6    FAM138A    290.4      -0.005237 0.09441 -0.05546 9.558e-01 9.837e-01
+##    ctl1  ctl2  ctl3  uvb1  uvb2  uvb3
+## 1 313.1 288.2 292.5 480.9 541.6 469.3
+## 2 307.6 273.7 283.8 150.7 190.2 146.1
+## 3 293.9 264.4 317.9 434.9 379.1 435.0
+## 4 291.2 320.5 261.1 245.1 233.5 218.0
+## 5 285.7 295.8 286.4 287.6 297.3 282.5
+## 6 293.0 283.1 296.9 291.1 281.6 296.4
+```
+
+```r
+sig <- subset(resData, padj<0.05)
+dir.create("results")
+```
+
+```
+## Warning: 'results' already exists
 ```
 
 ```r
-sig <- subset(resdata, padj<0.05)
 write.table(sig, file="results/sig.txt", sep="\t") #tab delim data
 ```
 
@@ -733,13 +757,13 @@ head(assay(rld))
 ```
 
 ```
-##            ctl1 ctl2 ctl3 uvb1 uvb2 uvb3
-## DDX11L1       0    0    0    0    0    0
-## WASH7P        0    0    0    0    0    0
-## MIR1302-10    0    0    0    0    0    0
-## FAM138A       0    0    0    0    0    0
-## OR4G4P        0    0    0    0    0    0
-## OR4G11P       0    0    0    0    0    0
+##             ctl1  ctl2  ctl3  uvb1  uvb2  uvb3
+## DDX11L1    7.947 7.888 7.906 7.636 7.724 7.624
+## WASH7P     8.078 8.126 8.026 7.998 7.977 7.947
+## MIR1302-10 8.170 8.187 8.171 8.173 8.189 8.165
+## FAM138A    8.186 8.170 8.192 8.183 8.167 8.191
+## OR4G4P     8.369 8.321 8.406 8.565 8.493 8.565
+## OR4G11P    8.500 8.463 8.470 8.718 8.785 8.705
 ```
 
 ```r
@@ -747,12 +771,12 @@ assay(rld)[1:5,1:5]
 ```
 
 ```
-##            ctl1 ctl2 ctl3 uvb1 uvb2
-## DDX11L1       0    0    0    0    0
-## WASH7P        0    0    0    0    0
-## MIR1302-10    0    0    0    0    0
-## FAM138A       0    0    0    0    0
-## OR4G4P        0    0    0    0    0
+##             ctl1  ctl2  ctl3  uvb1  uvb2
+## DDX11L1    7.947 7.888 7.906 7.636 7.724
+## WASH7P     8.078 8.126 8.026 7.998 7.977
+## MIR1302-10 8.170 8.187 8.171 8.173 8.189
+## FAM138A    8.186 8.170 8.192 8.183 8.167
+## OR4G4P     8.369 8.321 8.406 8.565 8.493
 ```
 
 ```r
@@ -761,11 +785,11 @@ t(assay(rld))[1:5,1:5]
 
 ```
 ##      DDX11L1 WASH7P MIR1302-10 FAM138A OR4G4P
-## ctl1       0      0          0       0      0
-## ctl2       0      0          0       0      0
-## ctl3       0      0          0       0      0
-## uvb1       0      0          0       0      0
-## uvb2       0      0          0       0      0
+## ctl1   7.947  8.078      8.170   8.186  8.369
+## ctl2   7.888  8.126      8.187   8.170  8.321
+## ctl3   7.906  8.026      8.171   8.192  8.406
+## uvb1   7.636  7.998      8.173   8.183  8.565
+## uvb2   7.724  7.977      8.189   8.167  8.493
 ```
 
 ```r
@@ -773,12 +797,12 @@ dist(t(assay(rld)))
 ```
 
 ```
-##       ctl1  ctl2  ctl3  uvb1  uvb2
-## ctl2 12.18                        
-## ctl3 12.49 13.61                  
-## uvb1 15.07 16.22 17.29            
-## uvb2 20.01 19.73 20.43 15.05      
-## uvb3 18.70 18.94 20.21 13.75 11.83
+##         ctl1    ctl2    ctl3    uvb1    uvb2
+## ctl2 0.10004                                
+## ctl3 0.08184 0.13488                        
+## uvb1 0.43456 0.45264 0.40115                
+## uvb2 0.39578 0.42692 0.37861 0.13597        
+## uvb3 0.44902 0.46984 0.40809 0.05509 0.15427
 ```
 
 ```r
@@ -786,13 +810,13 @@ as.matrix(dist(t(assay(rld))))
 ```
 
 ```
-##       ctl1  ctl2  ctl3  uvb1  uvb2  uvb3
-## ctl1  0.00 12.18 12.49 15.07 20.01 18.70
-## ctl2 12.18  0.00 13.61 16.22 19.73 18.94
-## ctl3 12.49 13.61  0.00 17.29 20.43 20.21
-## uvb1 15.07 16.22 17.29  0.00 15.05 13.75
-## uvb2 20.01 19.73 20.43 15.05  0.00 11.83
-## uvb3 18.70 18.94 20.21 13.75 11.83  0.00
+##         ctl1   ctl2    ctl3    uvb1   uvb2    uvb3
+## ctl1 0.00000 0.1000 0.08184 0.43456 0.3958 0.44902
+## ctl2 0.10004 0.0000 0.13488 0.45264 0.4269 0.46984
+## ctl3 0.08184 0.1349 0.00000 0.40115 0.3786 0.40809
+## uvb1 0.43456 0.4526 0.40115 0.00000 0.1360 0.05509
+## uvb2 0.39578 0.4269 0.37861 0.13597 0.0000 0.15427
+## uvb3 0.44902 0.4698 0.40809 0.05509 0.1543 0.00000
 ```
 
 ```r
@@ -807,10 +831,6 @@ heatmap(sampleDists)
 library(gplots)
 ```
 
-```
-## Warning: package 'gplots' was built under R version 3.1.1
-```
-
 ```
 ## KernSmooth 2.23 loaded
 ## Copyright M. P. Wand 1997-2009
@@ -871,13 +891,6 @@ hist(res$pvalue, breaks=50, col="grey")
 # MA Plot
 par(pch=16)
 with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x"))
-```
-
-```
-## Warning: 56073 x values <= 0 omitted from logarithmic plot
-```
-
-```r
 with(subset(res, padj<.05), points(baseMean, log2FoldChange, col="red", pch=16))
 library(calibrate)
 ```
@@ -905,8 +918,11 @@ with(subset(res, abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue),
 with(subset(res, padj<.05 & abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), col="green"))
 ## Add legend
 legend("topleft", legend=c("FDR<0.05", "|LFC|>1", "both"), pch=16, col=c("red","orange","green"))
-## Label points
-with(subset(res, padj<.05 & abs(log2FoldChange)>1), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=1))
 ```
 
 ![plot of chunk volcano_plot](./analysis_files/figure-html/volcano_plot.png) 
+
+```r
+## Label points
+with(subset(res, padj<.05 & abs(log2FoldChange)>1), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=1))
+```

From db8e7fbfaacb6035742718d499ca199e2d709df3 Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Wed, 23 Jul 2014 16:19:30 +1000
Subject: [PATCH 09/12] Added explanation for a non-obvious function call.

---
 analysis.Rmd | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/analysis.Rmd b/analysis.Rmd
index 3eaac0910..74e61e3b4 100644
--- a/analysis.Rmd
+++ b/analysis.Rmd
@@ -123,6 +123,11 @@ There are lots more options you can use to alter the appearance of these plots.
 We can find candidate differentially expressed genes by looking for genes with a large change between control and UVB samples. A common threshold used is log2 fold change more than 2 fold. We will calculate log2 fold change for all the genes and colour the genes with log2 fold change more than 2 fold on the plot.
 
 ```{r remove_unexpressed}
+# explain: whats happening here, is for each ctlMean there is a TRUE/FALSE for 
+# if it is greater than 0. TRUE and FALSE can be summed: FALSE is typically 
+# considered a 0, and TRUE is considered to be 1. So when we call
+# `sum(countData2$ctlMean > 0)`, we're really asking, "how many genes have a 
+# mean above 0 in the control group?"
 sum(countData2$ctlMean > 0)
 sum(countData2$uvbMean > 0)
 

From 519288bd30528ebe96fb439e5a5888fc6e6610db Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Wed, 23 Jul 2014 16:21:45 +1000
Subject: [PATCH 10/12] Added some more teacher comments

---
 analysis.Rmd | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/analysis.Rmd b/analysis.Rmd
index 74e61e3b4..efca25c5e 100644
--- a/analysis.Rmd
+++ b/analysis.Rmd
@@ -138,7 +138,10 @@ nrow(countData2)
 ```
 
 ```{r log2FC}
+# explain: what is fold change? why do we use log2 to quantify the fold change?
 countData2$log2FC <- log2(countData2$uvbMean / countData2$ctlMean)
+# again, reinforce that summing a logical vector gives you the number of 
+# occurences of TRUE.
 sum(countData2$log2FC > 2)
 sum(countData2$log2FC < -2)
 ```

From ce241cb456f8d4e0d475de1380f24377ca55fa00 Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Wed, 23 Jul 2014 16:24:15 +1000
Subject: [PATCH 11/12] added explanation for why we want to convert to a
 matrix

---
 analysis.Rmd | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/analysis.Rmd b/analysis.Rmd
index efca25c5e..819660e70 100644
--- a/analysis.Rmd
+++ b/analysis.Rmd
@@ -181,7 +181,8 @@ citation("DESeq2")
 It requires the count data to be in matrix form, and an additional dataframe describing the structure of the experiment.
 
 ```{r convert_to_matrix}
-# Convert to matrix
+# countData is currently a data.frame, but DESeq2 expects its input to be in 
+# matrix format, so we will convert our countData to a matrix.
 class(countData)
 countData <- as.matrix(countData)
 class(countData)

From 249c34f2fe00c9cb70ae1a9e19ac664efb52d494 Mon Sep 17 00:00:00 2001
From: Scott Ritchie <sritchie73@gmail.com>
Date: Wed, 23 Jul 2014 16:26:00 +1000
Subject: [PATCH 12/12] Fixed comment formatting for consistency

---
 analysis.Rmd | 64 ++++++++++++++++++++++++++--------------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/analysis.Rmd b/analysis.Rmd
index 819660e70..b249d5bfa 100644
--- a/analysis.Rmd
+++ b/analysis.Rmd
@@ -7,7 +7,7 @@ output:
 
 This is an introduction to RNAseq analysis for use at Software Carpentry bootcamps that have covered novice R. It involves reading in some count data from an RNAseq experiment, exploring the data using base R functions and then analysis with the package DESeq2.
 
-## Install required CRAN packages
+# Install required CRAN packages
 
 First, install some packages that you'll use.
 
@@ -20,9 +20,9 @@ install.packages("calibrate")
 Import the data as a `data.frame` and examine it. The data is stored in a text file with the first line being a header giving the column names, and the row names in the first column. 
 
 ```{r data_input}
-## Filename with output from featureCounts
+# Filename with output from featureCounts
 countFile <- "data/counts.txt"
-## Read in the data
+# Read in the data
 countData <- read.table(countFile, header=TRUE, row.names=1)
 head(countData)
 colnames(countData)
@@ -39,9 +39,9 @@ colnames(countData)
 
 We can rename the columns to something a bit more readable.
 ```{r eval=FALSE}
-## Manually
+# Manually
 c("ctl1", "ctl2", "ctl3", "uvb1", "uvb2", "uvb3")
-## Using paste
+# Using paste
 ?paste
 paste("ctl", 1:3)
 paste("ctl", 1:3, sep="")
@@ -52,25 +52,25 @@ c(paste0("ctl", 1:4), paste0("uvb", 1:5))
 
 An easier way to do this, especially for files with many columns, is to use the `gsub` command to strip out the extra information. This is also more robust to introduced errors, for example if the column order changes at some point in the future.
 ```{r rename_cols}
-## Using gsub -- reproducible
+# Using gsub -- reproducible
 ?gsub
 gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countData))
 colnames(countData) <- gsub(pattern=".fastq_tophat.accepted_hits.bam", replacement="", x=colnames(countData))
 head(countData)
 ```
 
-### Exercise 1
+## Exercise 1
 Find the gene with the highest expression in any sample. Extract  the expression data for this gene for all samples. In which sample does it have the highest expression? What is the function of the gene? Can you suggest why this is the top expressed gene?
 
 ```{r exercise_1}
-max(apply(countData, 1, max)) #max expression is 7013
-topGene <- which.max(apply(countData, 1, max)) #gene is EEF1A1P9
-countData[topGene, ] #get other sample data - max is in uvb1
+max(apply(countData, 1, max)) # max expression is 7013
+topGene <- which.max(apply(countData, 1, max)) # gene is EEF1A1P9
+countData[topGene, ] # get other sample data - max is in uvb1
 
-#this is a pseudogene - maybe an artefact of only aligning reads to a single chromosome?
+# this is a pseudogene - maybe an artefact of only aligning reads to a single chromosome?
 ```
 
-## Data investigation using base R
+# Data investigation using base R
 
 We can investigate this data a bit more using some of the basic R functions before going on to use more sophisticated analysis tools.
 
@@ -79,7 +79,7 @@ We will calculate the mean for each gene for each condition. First make a copy o
 ```{r get_means}
 countData2 <- countData
 
-#get Control columns
+# get Control columns
 colnames(countData2)
 grep("ctl", colnames(countData2))
 ctlCols <- grep("ctl", colnames(countData2))
@@ -88,7 +88,7 @@ apply(countData2[, ctlCols], 1, mean)
 # here we'll use rowMeans instead, its a convenient shortcut, and also faster!
 countData2$ctlMean <- rowMeans(countData2[, ctlCols])
 
-#same for uvb
+# same for uvb
 uvbCols <- grep("uvb", colnames(countData2))
 countData2$uvbMean <- rowMeans(countData2[, uvbCols])
 ```
@@ -104,7 +104,7 @@ library(ggplot2)
 ggplot(countData2, aes(x=ctlMean, y=uvbMean)) + geom_point()
 ```
 
-### Exercise 2
+## Exercise 2
 How could you make this plot more informative and look more professional? Hint: try using a log scale. Try changing colours, transparencies, sizes, or shapes of points. 
 
 `help(par)` will give you information on lots of graphical parameters that can be set. Help for ggplot2 can be found [here](http://docs.ggplot2.org/current/).
@@ -118,7 +118,7 @@ ggplot(countData2, aes(x=ctlMean, y=uvbMean)) + geom_point() + scale_x_log10() +
 ```
 There are lots more options you can use to alter the appearance of these plots.
 
-##Find candidate differentially expressed genes
+#Find candidate differentially expressed genes
 
 We can find candidate differentially expressed genes by looking for genes with a large change between control and UVB samples. A common threshold used is log2 fold change more than 2 fold. We will calculate log2 fold change for all the genes and colour the genes with log2 fold change more than 2 fold on the plot.
 
@@ -133,7 +133,7 @@ sum(countData2$uvbMean > 0)
 
 nrow(countData2)
 countData2 <- subset(countData2, (countData2$ctlMean > 0 | countData2$uvbMean > 0))
-#explain | operator meaning OR in this context?
+# explain: | operator meaning OR in this context?
 nrow(countData2)
 ```
 
@@ -162,13 +162,13 @@ points(countData2$ctlMean[countData2$outlier==TRUE], countData2$uvbMean[countDat
 ggplot(countData2, aes(x=ctlMean, y=uvbMean, colour=outlier)) + geom_point() + scale_x_log10() + scale_y_log10() 
 ```
 
-## DESeq2 analysis
+# DESeq2 analysis
 
 DESeq2 is an R package for analysis of RNAseq data. It is available from [Bioconductor](http://www.bioconductor.org/). Bioconductor is a project to provide tools for analysing high-throughput genomic data including RNA-seq, ChIP-seq and arrays. You can explore Bioconductor packages [here](http://www.bioconductor.org/packages/release/BiocViews.html#___Software). 
 
 ```{r install_deseq2, eval=FALSE}
-#install and have a break to check everyone is up to date?
-#explain bioconductor?
+# install and have a break to check everyone is up to date?
+# explain bioconductor?
 source("http://bioconductor.org/biocLite.R")
 biocLite("DESeq2")
 ```
@@ -189,14 +189,14 @@ class(countData)
 head(countData)
 
 # construct colData dataframe
-#three replicates of control and UVB.
+# three replicates of control and UVB.
 colData <- data.frame(condition=c(rep("ctl", 3), rep("uvb",3)), row.names=colnames(countData))
 ```
 
 DESeq works on a particular type of object called a DESeqDataSet.
 
 ```{r make_deseqdataset}
-#introduce how DESeq2 works - type of object it works on etc
+# introduce how DESeq2 works - type of object it works on etc
 # instantiate the DESeqDataSet
 dds <- DESeqDataSetFromMatrix(countData=countData, colData=colData, design=~condition)
 dds
@@ -212,7 +212,7 @@ dds <- DESeq(dds)
 res <- results(dds)
 head(res)
 table(res$padj<0.05)
-## Order by adjusted p-value
+# Order by adjusted p-value
 res <- res[order(res$padj), ]
 head(res)
 ```
@@ -227,10 +227,10 @@ head(resData)
 
 sig <- subset(resData, padj<0.05)
 dir.create("results")
-write.table(sig, file="results/sig.txt", sep="\t") #tab delim data
+write.table(sig, file="results/sig.txt", sep="\t") # tab delim data
 ```
 
-## Data Visualization
+# Data Visualization
 
 We can also do some exploratory plotting of the data.
 
@@ -239,7 +239,7 @@ plotDispEsts(dds, main="Dispersion plot")
 ```
 
 ```{r plot_heatmaps}
-## Regularized log transformation for clustering/heatmaps, etc
+# Regularized log transformation for clustering/heatmaps, etc
 rld <- rlogTransformation(dds)
 plotPCA(rld)
 
@@ -251,7 +251,7 @@ dist(t(assay(rld)))
 as.matrix(dist(t(assay(rld))))
 sampleDists <- as.matrix(dist(t(assay(rld))))
 heatmap(sampleDists)
-## better heatmap with gplots
+# better heatmap with gplots
 library(gplots)
 heatmap.2(sampleDists)
 heatmap.2(sampleDists, col=colorpanel(64, "steelblue", "white"), key=FALSE, trace="none")
@@ -261,13 +261,13 @@ heatmap.2(sampleDists, col=colorpanel(64, "red", "white", "blue"), key=FALSE, tr
 ```
 
 ```{r plot_pval_hist}
-## Examine plot of p-values
+# Examine plot of p-values
 hist(res$pvalue, breaks=50, col="grey")
 ```
 
 
 ```{r MA_plot}
-#These are the plots that are most recognisable from papers
+# These are the plots that are most recognisable from papers
 # MA Plot
 par(pch=16)
 with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x"))
@@ -280,14 +280,14 @@ with(subset(res, padj<.05), textxy(baseMean, log2FoldChange, labs=Gene, cex=1, c
 
 ```{r volcano_plot}
 # Volcano plot
-## Set point character
+# Set point character
 par(pch=16)
 with(res, plot(log2FoldChange, -log10(pvalue), main="Volcano plot"))
 with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), col="red"))
 with(subset(res, abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), col="orange"))
 with(subset(res, padj<.05 & abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), col="green"))
-## Add legend
+# Add legend
 legend("topleft", legend=c("FDR<0.05", "|LFC|>1", "both"), pch=16, col=c("red","orange","green"))
-## Label points
+# Label points
 with(subset(res, padj<.05 & abs(log2FoldChange)>1), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=1))
 ```