forked from mourisl/T1K
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patht1k-build.pl
193 lines (175 loc) · 4.39 KB
/
t1k-build.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/env perl
use strict ;
use warnings ;
use Cwd 'cwd' ;
use Cwd 'abs_path' ;
use File::Basename ;
use File::Path 'make_path' ;
use IO::Uncompress::Unzip qw(unzip $UnzipError) ;
my $progName = "t1k-build.pl" ;
die "$progName usage: ./$progName [OPTIONS]:\n".
"Required:\n".
"\t-d STRING: EMBL-ENA dat file\n".
"\t\tOr\n".
"\t-f STRING: plain gene sequence file\n".
"\t\tOr\n".
"\t--download STRING: IPD-IMGT/HLA or IPD-KIR or user-specified dat file download link\n".
"Optional:\n".
"\t-o STRING: output folder (default: ./)\n".
"\t-g STRING: genome annotation file (default: not used)\n".
"\t--target STRING: gene name keyword (default: no filter)\n".
"\t--prefix STRING: file prefix (default: based on --target or -o)\n".
"\t--ignore-partial: ignore partial allele at all (default: fill in intron if exons are complete)\n".
"\t--partial-intron-noseq: the partial introns and pseudo exons are not present in the sequence of the dat file, e.g. IPD-KIR_2.13.0.\n"
if (@ARGV == 0);
sub system_call
{
print STDERR "[".localtime()."] SYSTEM CALL: ".join(" ",@_)."\n";
system(@_) == 0
or die "system @_ failed: $?";
#print STDERR " finished\n";
}
my $WD = dirname( abs_path( $0 ) ) ;
my $i ;
my $ipdFasta = "" ;
my $ipdDat = "" ;
my $outputDirectory = "./" ;
my $annotationFile = "" ;
my $downloadPath = "" ;
my $targetGene = "" ;
my $outputPrefix = "" ;
my $ignorePartial = 0 ;
my $partialIntronHasNoSeq = 0 ;
for ($i = 0 ; $i < @ARGV ; ++$i)
{
if ($ARGV[$i] eq "-f")
{
$ipdFasta = $ARGV[$i + 1] ;
++$i ;
}
elsif ($ARGV[$i] eq "-d")
{
$ipdDat = $ARGV[$i + 1] ;
++$i ;
}
elsif ($ARGV[$i] eq "--download")
{
$downloadPath = $ARGV[$i + 1] ;
++$i ;
}
elsif ($ARGV[$i] eq "-o")
{
$outputDirectory = $ARGV[$i + 1] ;
++$i ;
}
elsif ($ARGV[$i] eq "-g")
{
$annotationFile = $ARGV[$i + 1] ;
++$i ;
}
elsif ($ARGV[$i] eq "--target")
{
$targetGene = lc($ARGV[$i + 1]) ;
++$i ;
}
elsif ( $ARGV[$i] eq "--prefix")
{
$outputPrefix = $ARGV[$i + 1] ;
++$i ;
}
elsif ( $ARGV[$i] eq "--ignore-partial")
{
$ignorePartial = 1 ;
++$i ;
}
elsif ( $ARGV[$i] eq "--partial-intron-noseq")
{
$partialIntronHasNoSeq = 1 ;
}
else
{
die "Unknown parameter ".$ARGV[$i]."\n" ;
}
}
if ($ipdFasta eq "" && $ipdDat eq "" && $downloadPath eq "")
{
die "Need to use -d/-f/--download to specify dat file, sequence file or dat file download link.\n" ;
}
if ( !-d $outputDirectory)
{
make_path $outputDirectory ;
}
if ($ipdDat eq "" and $downloadPath ne "")
{
if (uc($downloadPath) eq "IPD-IMGT/HLA")
{
$ipdDat = "$outputDirectory/hla.dat" ;
system_call("curl -o $ipdDat.zip https://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/hla.dat.zip") ;
unzip "$ipdDat.zip" => $ipdDat
or die "unzip failed: $UnzipError\n";
}
elsif (uc($downloadPath) eq "IPD-KIR")
{
$ipdDat = "$outputDirectory/kir.dat" ;
system_call("curl -o $ipdDat https://ftp.ebi.ac.uk/pub/databases/ipd/kir/kir.dat") ;
}
else
{
$ipdDat = "$outputDirectory/t1k_ref.dat" ;
system_call("curl -o $ipdDat $downloadPath") ;
}
}
if ($outputPrefix eq "")
{
if ($targetGene ne "")
{
$outputPrefix = $targetGene ;
}
elsif ($outputDirectory ne "./" )
{
$outputPrefix = (split /\//, $outputDirectory)[0] ;
}
else
{
$outputPrefix = "T1K_ref" ;
}
}
my $rnaSeqFile = "$outputDirectory/${outputPrefix}_rna_seq.fa" ;
my $dnaSeqFile = "$outputDirectory/${outputPrefix}_dna_seq.fa" ;
if ($ipdDat ne "")
{
my $options = "" ;
$options .= " --gene $targetGene" if ($targetGene ne "") ;
$options .= " --ignorePartial" if ($ignorePartial == 1) ;
$options .= " --partialIntronHasNoSeq" if ($partialIntronHasNoSeq == 1) ;
system_call("perl $WD/ParseDatFile.pl $ipdDat --mode dna $options > $dnaSeqFile") ;
system_call("perl $WD/ParseDatFile.pl $ipdDat --mode rna $options > $rnaSeqFile") ;
}
else
{
# Reheader the IPD gene sequence file
open FP, $ipdFasta ;
open FPout, ">$rnaSeqFile" ;
while (<FP>)
{
if (!/^>/)
{
print FPout $_ ;
next ;
}
chomp ;
my @cols = split /\s/, substr($_, 1) ;
print FPout ">".$cols[1]."\n" ;
}
close FP ;
close FPout ;
}
# Add the genome coordinate to fasta file.
if ($annotationFile ne "")
{
system_call("perl $WD/AddGeneCoord.pl $rnaSeqFile $annotationFile > $outputDirectory/${outputPrefix}_rna_coord.fa") ;
if ($ipdDat ne "")
{
system_call("perl $WD/AddGeneCoord.pl $dnaSeqFile $annotationFile > $outputDirectory/${outputPrefix}_dna_coord.fa") ;
}
}