-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathabbreviate_sam_names.py
executable file
·38 lines (33 loc) · 1.4 KB
/
abbreviate_sam_names.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/bin/env pypy
# abbreviate names in a SAM file to be less than MAX_LEN
# but still unique , by including a hash of the entire name in the
# abbreviated name
# (this gets around e.g.
#"
#[E::sam_parse1] query name too long
#[W::sam_read1] Parse error at line 7736052
#[main_samview] truncated file.
#"
# usage example (e.g. to avoid writing out the patched sam) :
# cat long_names.sam | ./abbreviate_sam_names.py | samtools view -Sb -@ 12 -o shorter_names.bam 1>abbreviation.stdout 2>abbreviation.sdterr
MAX_LEN=250 # found 252 did not work - still get parsing error ( ref https://www.biostars.org/p/259891/ )
import sys
import hashlib
import re
for record in sys.stdin:
if len(record.strip()) ==0:
sys.stdout.write(record) # output whitespace records unchanged
else:
fields=re.split("\s+", record.strip())
if fields[0] == "@":
sys.stdout.write(record) # output header records unchanged
else:
if len(fields[0]) <= MAX_LEN:
sys.stdout.write(record) # output records that don't need abbreviating unchanged
else:
h=hashlib.md5()
h.update(fields[0])
hash=h.hexdigest()
new_name="%s_%s"%(fields[0][0:MAX_LEN-len(hash)-1], hash) # i.e. make a digest of the entire name and make it part of the shortened
# name so its unique
sys.stdout.write("\t".join([new_name] + fields[1:])+"\n")