-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_1001genome_VCF.pl
144 lines (132 loc) · 5.21 KB
/
parse_1001genome_VCF.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/perl
# parse_1001genome_VCF.pl
#
# Jay Moore, John Innes Centre, Norwich
#
# 08/11/2019
#
# Version 1
#
# Change log
#
# Summary of functions
#
# Done:
#
# Read .bed file of CG sites
# Stream .VCF file on STDIN
# Parse the file identifying cytosines and CG dinucleotides
# Output separate lists, one for each accession, of CG sites with mutations and of cytosines with mutations
#
# To do:
#
my $cg_sites_file = '/jic/scratch/groups/Daniel-Zilberman/Reference_data/Genomes/Arabidopsis/TAIR10/TAIR10_Chr.all.CG_sites.tsv';
# make a hash to put the CG sites in
my %cg_sites;
open(my $cg_sites_fh, '<:encoding(UTF-8)', $cg_sites_file) or die "Could not open file '$cg_sites_file' $!";
while (<$cg_sites_fh>) {
my $line = $_;
chomp $line;
my @this_site = split "\t", $line;
# create an entry in the hash for the CG site pasting together chrom and locus1 as the key
$cg_sites{$this_site[0].$this_site[1]}=1;
}
close $cg_sites_fh;
#foreach my $cg_site(sort keys %cg_sites) {
# print "$cg_site\n";
#}
my %samples;
while (<>) {
my $line = $_;
chomp $line;
if (substr($line,0,1) eq '#') {
if (substr($line,0,2) eq '##') {
# skip comments
} else {
# it's the header line with all the accession IDs - grab them for later
my @header = split '\t', $line;
for (my $sample=9; $sample<=$#header; $sample++) {
$samples{$sample} = $header[$sample];
}
foreach my $sample(sort keys %samples) {
# print $sample." ".$samples{$sample}."\n";
system('rm '.$samples{$sample}.'_CG_vars.txt');
system('rm '.$samples{$sample}.'_C_vars.txt');
}
}
} else {
# we have finished headers and are now doing content
my @entries = split '\t', $line;
# make list of lines mutated at this site - start with an empty array
my @mutated_lines = ();
print $entries[0].' '.$entries[1].' '.$entries[2].' '.$entries[3].' '.$entries[4].' '.$entries[5].' '.$entries[6].' '.$entries[7].' '.$entries[8]."\n";
for (my $sample=9; $sample<=$#entries; $sample++) {
# parse the entry of a single accession
my @entry = split ':', $entries[$sample];
#print "entry[0] ".$entry[0]."\n";
if (($entry[0] ne '0|0') & ($entry[0] ne './.')) {
# if the entry is neither ref|ref nor missing data, it is a mutation - log the relevant sample column number to the array
push @mutated_lines, $sample;
print $samples{$sample}." ".$entry[0]."\n";
}
}
# identify the sequence identified as the reference version
my $refseq = $entries[3];
# go through the mutated lines writing out the site info to the relevant files
foreach my $mutant (@mutated_lines) {
#print "hello $mutant\n";
#if ((length $samples{$mutant}) == 0) {
#print "hello $mutant\n";
#}
# do any of the sites in the reference version overlap a CG site?
my $overlaps_cg = 0;
#print length $refseq;
for (my $this_site=0; $this_site<(length $refseq); $this_site++) {
# if either the site or its neighbour are the cytosine of a CG site
if (defined $cg_sites{'Chr'.$entries[0].$entries[1]}) {
if ($overlaps_cg != $entries[1]) {
$overlaps_cg = $entries[1];
open(my $fh, '>>', $samples{$mutant}.'_CG_vars.txt') or die "Could not write to $samples{$mutant}_CG_vars.txt $!";
say $fh 'Chr'.$entries[0]."\t".$entries[1];
close $fh;
}
} elsif (defined $cg_sites{'Chr'.$entries[0].($entries[1]-1)}) {
# has this CG site already been reported?
# There is still a chance that the same CG site gets reported twice, if both nucleotides in the site have separate mutations. In that case the site will get reported twice in any genotypes which have a non-reference allele at both sites
if ($overlaps_cg != ($entries[1]-1)) {
$overlaps_cg = $entries[1]-1;
open(my $fh, '>>', $samples{$mutant}.'_CG_vars.txt') or die "Could not write to $samples{$mutant}_CG_vars.txt $!";
say $fh 'Chr'.$entries[0]."\t".($entries[1]-1);
close $fh;
}
}
}
#if ($overlaps_cg > 0) {
# output the relevant CG site locus to the files for the samples which vary from the reference
#print 'hello';
#open(my $fh, '>>', $samples{$mutant}.'_CG_vars.txt') or die "Could not write to $samples{$mutant}_CG_vars.txt $!";
#say $fh 'Chr'.$entries[0]."\t".$entries[1];
#close $fh;
#}
# are any of the sites in the reference version a C (or a G) in the reference?
my $overlaps_c = 0;
for (my $this_site=0; $this_site<(length $refseq); $this_site++) {
if ((substr($refseq,$this_site,1) eq 'C') | (substr($refseq,$this_site,1) eq 'G')) {
$overlaps_c = 1;
$this_subsite = $entries[1]+$this_site;
# output the relevant C/G site locus to the files for the samples which vary from the reference
open(my $fh, '>>', $samples{$mutant}.'_C_vars.txt') or die "Could not write to $samples{$mutant}_C_vars.txt $!";
say $fh 'Chr'.$entries[0]."\t".$this_subsite;
close $fh;
}
}
#if ($overlaps_c == 1) {
# output the relevant C site locus to the files for the samples which vary from the reference
#open(my $fh, '>>', $samples{$mutant}.'_C_vars.txt') or die "Could not write to $samples{$mutant}_C_vars.txt $!";
#say $fh 'Chr'.$entries[0]."\t".$entries[1];
#close $fh;
#}
}
}
}