-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_job.sh
178 lines (116 loc) · 4.57 KB
/
run_job.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/bin/bash
#specify the input fastq files
fq_F=$1
fq_R=$2
run_macs2=$3
p_value=$4
project_dir=$5
#specify the location of reference genome fast file for bwa mem
GENOME_FASTA=/gpfs/data/mcnerney-lab/reference_genomes_CUX1_CASP_diff/FASTA_files/hg19/hg19_Ordered.fa
#specify the dir to the blasklist file
blacklist=/gpfs/data/mcnerney-lab/reference_genomes_CUX1_CASP_diff/blacklists/hg19-blacklist.v2.bed
#grab base name of the fastq files
base=`basename $fq_F .fastq.gz`
echo "Sample name is $base"
#specify the number of cores to use for various downstream analysis
cores=8
#create all the output directories
# The -p option means mkdir will create the whole dir if it does not exist and refrain from complaining if it does exist
mkdir -p $project_dir/output/bwa
mkdir -p $project_dir/output/bigwigs
# set up output filenames and locations
bwa_mem_out=$project_dir/output/bwa/${base}.sam
samtools_q30_in=$project_dir/output/bwa/${base}.sam
samtools_q30_out=$project_dir/output/bwa/${base}.q30.bam
samtools_sort_in=$project_dir/output/bwa/${base}.q30.bam
samtools_sort_out=$project_dir/output/bwa/${base}.q30.srt.bam
samtools_dedup_in=$project_dir/output/bwa/${base}.q30.srt.bam
samtools_dedup_out=$project_dir/output/bwa/${base}.q30.srt.dedup.bam
samtools_noMITO_in=$project_dir/output/bwa/${base}.q30.srt.dedup.bam
samtools_noMITO_out=$project_dir/output/bwa/${base}.q30.srt.dedup.noMITO.bam
bedtools_rm_blacklist_in=$project_dir/output/bwa/${base}.q30.srt.dedup.noMITO.bam
bedtools_rm_blacklist_out=$project_dir/output/bwa/${base}.q30.srt.dedup.noMITO.blkrm.bam
bam_to_bed_in=$project_dir/output/bwa/${base}.q30.srt.dedup.noMITO.blkrm.bam
bam_to_bed_out=$project_dir/output/bwa/${base}.q30.srt.dedup.noMITO.blkrm.bed
bamCoverage_in=$project_dir/output/bwa/${base}.q30.srt.dedup.noMITO.blkrm.bam
bamCoverage_out=$project_dir/output/bigwigs/${base}.q30.srt.dedup.noMITO.blkrm.bw
#run the jobs
#genome alignment
echo "Run bwa mem"
cd $project_dir/input
module load gcc/12.1.0
module load intel/2022.2
module load llvm/14.0.5
module load bwa/0.7.17
bwa mem -t $cores \
$GENOME_FASTA \
$fq_F \
$fq_R \
> $bwa_mem_out
#filter and clean bam files
echo "Run samtools filter"
module load gcc/12.1.0
module load intel/2022.2
module load llvm/14.0.5
module load samtools/1.17
#q30 filtering
samtools view -bSh -q 30 -o $samtools_q30_out $samtools_q30_in
#sort
samtools sort -o $samtools_sort_out $samtools_sort_in
#dedup
samtools rmdup $samtools_dedup_in $samtools_dedup_out
#remove mitochondrial reads
samtools view -h $samtools_noMITO_in | awk '{if($3 != "chrM" && $3 != "chrUn"){print $0}}' | samtools view -hb > $samtools_noMITO_out
#index and generate bai file
samtools index $samtools_noMITO_out
echo "run bedtools to remove blacklist regions from bam files"
module load gcc/12.1.0
module load intel/2022.2
module load llvm/14.0.5
module load bedtools/2.30.0
bedtools intersect -abam $bedtools_rm_blacklist_in -b $blacklist -v > $bedtools_rm_blacklist_out
#now let's convert the bam to bed file. This is because the ATAC-specific MACS2 configuration requires bed as input
echo "convert bam to bed file"
bedtools bamtobed -i $bam_to_bed_in > $bam_to_bed_out
#remove the intermediate index file
rm *.dedup.bam.bai
echo "index the final bam file"
samtools index $bedtools_rm_blacklist_out
#generate bigwig files
#you need deepTools for this, and deepTools could be called on as long as you loaded the correct python version, so you don't need to load deepTools separately.
echo "generating bigwig files"
module load gcc/12.1.0
module load intel/2022.2
module load llvm/14.0.5
module load bedtools/2.30.0
module load python/3.10.5
bamCoverage -b $bamCoverage_in -o $bamCoverage_out
#macs2 peak calling
#macs2 peak calling
if [[ $run_macs2=true ]]; then
echo "Running macs2 analysis"
# Check if p value is specified
if [[ -z $p_value ]]; then
echo "Error: p value must be specified with -macs2."
exit 1
fi
mkdir -p $project_dir/output/macs2
macs2_outdir=$project_dir/output/macs2
module load gcc/12.1.0
module load intel/2022.2
module load llvm/14.0.5
module load python/3.10.5
macs2 callpeak -t $bam_to_bed_out \
-f BED \
-g hs \
-n $base \
--nomodel \
--shift -75 \
--extsize 150 \
--call-summits \
-p $p_value \
-B \
--outdir $macs2_outdir
else
echo "Skipping the macs2 peak calling analysis. You can run it separately"
fi