-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathparseTEClass.py
executable file
·53 lines (41 loc) · 1.66 KB
/
parseTEClass.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import argparse
from Bio import SeqIO
import re
####################################
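## Usage: python parseTEClass.py -f <FASTA library of TEs analyzed by TEClass>
## Output: <prefix>_teClass_comparison.txt, <prefix>_valid.fa and <prefix>_tocheck.fa, where <prefix> is the input path without its extension.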
def get_args(argv=None):
    """Parse command-line options and return the input FASTA path.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default), argparse reads the process command line, which is
            what the script does when run directly.

    Returns:
        The string passed to ``-f``/``--fasta``.

    Raises:
        SystemExit: Raised by argparse when the required ``-f`` option is
            missing or an unknown option is supplied.
    """
    # What this script does
    parser = argparse.ArgumentParser(
        description="Compare header and result of TEClass analysis.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # Input: the library of TEs that has been analyzed by TEClass
    parser.add_argument(
        '-f', '--fasta', type=str,
        help='Name of the library of analyzed TEs from TEClass.',
        required=True,
    )
    args = parser.parse_args(argv)
    return args.fasta
def parserecorddesc(ID):
    """Split a FASTA description into header, header class and TEClass call.

    The description is tokenized on '|', ': ' and ',' (all three are
    treated as field separators). The first field is the header and the
    third field is taken as the TEClass result. The header class is the
    text that follows the first '#' in the header, cut at the next '#'
    or '/'.

    Example (assumed TEClass layout -- confirm against real output):
        'fam1#DNA/hAT|TEClass result: DNA|forward'
        -> ('fam1#DNA/hAT', 'DNA', 'DNA')

    Args:
        ID: The full description line of a FASTA record.

    Returns:
        A tuple ``(header, rmask_call, teclass_result)`` of strings.

    Raises:
        ValueError: If the description has fewer than three fields or the
            header contains no '#'. (These cases previously failed with a
            bare IndexError that did not identify the record.)
    """
    # Normalize every separator to ',' so a single split yields all fields.
    fields = ID.replace('|', ',').replace(': ', ',').split(',')
    if len(fields) < 3:
        raise ValueError(
            "Cannot find a TEClass result in description: %r" % (ID,)
        )
    header = fields[0]
    teclass_result = fields[2]

    header_parts = header.split('#')
    if len(header_parts) < 2:
        raise ValueError(
            "Header has no '#<class>' part in description: %r" % (ID,)
        )
    # Keep only the top-level class, i.e. drop anything after the first '/'.
    rmask_call = header_parts[1].split('/')[0]
    return header, rmask_call, teclass_result
def main():
    """Compare each record's header class with its TEClass result.

    Reads the FASTA file named on the command line and, for every record,
    compares the class parsed from the header with the TEClass result
    parsed from the same description line. Three files are written next
    to the input, named from the input path minus its final extension:

    * ``<prefix>_teClass_comparison.txt`` -- one tab-separated line per
      record: header, header class, TEClass result, and ``exact_match``
      or ``no_match``.
    * ``<prefix>_valid.fa`` -- records whose two classes are identical.
    * ``<prefix>_tocheck.fa`` -- records whose two classes differ.
    """
    fasta = get_args()
    prefix = os.path.splitext(fasta)[0]
    with open(prefix + '_teClass_comparison.txt', 'w') as report, \
            open(prefix + '_tocheck.fa', 'w') as tocheck_fasta, \
            open(prefix + '_valid.fa', 'w') as valid_fasta:
        for record in SeqIO.parse(fasta, 'fasta'):
            header, rmask_call, teclass_result = parserecorddesc(
                record.description
            )
            if rmask_call == teclass_result:
                match = 'exact_match'
                SeqIO.write(record, valid_fasta, 'fasta')
            else:
                match = 'no_match'
                SeqIO.write(record, tocheck_fasta, 'fasta')
            report.write(
                '\t'.join((header, rmask_call, teclass_result, match)) + '\n'
            )


# Run only when executed as a script so the helpers can be imported safely.
if __name__ == '__main__':
    main()