-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathidentify_LINE_repeatmasker_softclip_R3.py
executable file
·106 lines (89 loc) · 4.65 KB
/
identify_LINE_repeatmasker_softclip_R3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python
#######################################################################################
### ###
### Copyright (C) 2017 Pawel Krawczyk ([email protected]) ###
### ###
### This program is free software: you can redistribute it and/or modify ###
### it under the terms of the GNU General Public License as published by ###
### the Free Software Foundation, either version 3 of the License, or ###
### (at your option) any later version. ###
### ###
### This program is distributed in the hope that it will be useful, ###
### but WITHOUT ANY WARRANTY; without even the implied warranty of ###
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ###
### GNU General Public License for more details. ###
### ###
### You should have received a copy of the GNU General Public License ###
### along with this program. If not, see <http://www.gnu.org/licenses/>. ###
### ###
#######################################################################################
import argparse
# parse command line arguments
parser = argparse.ArgumentParser(
description='Analyze repatmasker output file - identify soft clipped fragments (R3 reads)')
parser.add_argument('--input', dest='inputfile', action='store',
help='Input file from Parsing-RepeatMasker-Outputs/parseRM.pl (required)', required=True)
parser.add_argument('--fasta', dest='fastafile', action='store',
help='Fasta file with sequences provided as an input for repeatmasker (required)', required=True)
parser.add_argument('--output', dest='output', action='store',
help='Output tsv file (required)', required=True)
args = parser.parse_args()
from Bio import SeqIO
import pandas as pd
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
data = pd.DataFrame.from_csv(args.inputfile, sep='\t')
sequences = SeqIO.index(args.fastafile, "fasta")
seq_records = []
seq_names = []
for row in data.itertuples():
LINE_name = row.Rname
overlap = row.IfOverlap
seq_name = row.Gname
# check if sequence was already processed
# to choose only the best (first) match of repeatmasker for further analyses
if (seq_name in seq_names):
print("sequence already processed: " + seq_name)
else:
seq_names.append(seq_name) # append seq name to processed seq_names
# get LINE positions in analyzed sequence:
LINE_start_in_seq = row.Gstart
LINE_end_in_seq = row.Gend
LINE_left_in_seq = row.Gleft
LINE_start_in_ref = row.Rstart
LINE_end_in_ref = row.Rend
LINE_left_in_ref = row.Rleft
# get the full sequence
record2 = sequences[str(seq_name)]
whole_seq = record2.seq
whole_seq_id = record2.id
# get the LINE seq based on coordinates from repeatmasker
LINE_seq = whole_seq[LINE_start_in_seq - 1:LINE_end_in_seq]
LINE_seq_obj = Seq(str(LINE_seq))
# reverse complement the LINE sequence
LINE_seq = str(LINE_seq_obj.reverse_complement())
clip_seq = whole_seq[:LINE_start_in_seq - 1]
clip_seq_obj = Seq(str(clip_seq))
# reverse complement clipped fragment
clip_seq = str(clip_seq_obj.reverse_complement())
# produce output sequence:
seq_descr = "\tclip5: \tclip3: " + clip_seq + "\tpos: " + \
str(LINE_end_in_ref) + "\tref: " + str(LINE_name)
processed_seq = SeqRecord(
Seq(str(LINE_seq)), id=whole_seq_id, description=str(seq_descr))
seq_records.append(processed_seq)
#include sequences for which LINE sequences were not found
for record in SeqIO.parse(args.fastafile, "fasta"):
seq_id = record.id # get id of read
if (seq_id in seq_names):
next
else:
whole_seq = record.seq
clip_seq = str(whole_seq.reverse_complement())
seq_descr = "\tclip5: \tclip3: \tpos: -1\tref: -1"
processed_seq = SeqRecord(
Seq(str(clip_seq)), id=seq_id, description=str(seq_descr))
seq_records.append(processed_seq)
# write fasta output file
SeqIO.write(seq_records, args.output, "fasta")
### END ###