Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose --rename-regex in main annotate CLI #221

Merged
merged 10 commits into from
Feb 12, 2021
35 changes: 26 additions & 9 deletions dammit/components/fastx.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def rename_fasta(fasta_fn,

if split_regex is None:
counter = count()
header_func = lambda name: '{0}_{1}'.format(basename, next(counter))
header_func = lambda _: '{0}_{1}'.format(basename, next(counter))
else:
def header_func(header):
results = re.search(split_regex, header).groupdict()
Expand Down Expand Up @@ -80,7 +80,11 @@ def rename_fasta_cmd(fasta_fn,
names_fn,
basename,
split_regex):
''' Copy a FASTA file and rename the headers.
''' Copy a FASTA file and rename the headers. If --split-regex is used,
it should be provided in Python `re` format and contain a named field keyed
as `name` that extracts the desired string. For example, providing
(?P<name>^[a-zA-Z0-9]+) will match from the beginning of the sequence header
up to the first non-alphanumeric symbol.
'''

allowed = r'[a-zA-Z0-9_\-:|\.]+'
Expand Down Expand Up @@ -209,7 +213,7 @@ def generate_sequence_name(original_name, sequence, annotation_df):
pass


def generate_sequence_summary(original_name, sequence, annotation_df):
def generate_sequence_summary(seqid, sequence, annotation_df):
'''Given a FASTA sequence's original name, the sequence itself,
and a DataFrame with its corresponding GFF3 annotations, generate
a summary line of the annotations in key=value format.
Expand Down Expand Up @@ -251,12 +255,12 @@ def generate_sequence_summary(original_name, sequence, annotation_df):
for _, row in fgroup.iterrows()])
annots.append('{0}={1}'.format(feature_type, collapsed))

desc = '{0} {1}'.format(original_name, ' '.join(annots))
desc = '{0} {1}'.format(seqid, ' '.join(annots))

return desc


def annotate_fasta(transcriptome_fn, gff3_fn, output_fn):
def annotate_fasta(transcriptome_fn, gff3_fn, output_fn, name_map=None):
'''Annotate the headers in a FASTA file with its corresponding GFF3 file
and place the resulting FASTA file in output_fn.
\f
Expand All @@ -265,13 +269,22 @@ def annotate_fasta(transcriptome_fn, gff3_fn, output_fn):
transcriptome_fn (str): Path to the FASTA file.
gff3_fn (str): Path to the GFF3 annotations.
output_fn (str): Path to store the resulting annotated FASTA.
name_map (str): Optional path to a CSV file mapping original transcript names to renamed ones (columns `original` and `renamed`).
'''

if name_map is not None:
name_map = pd.read_csv(name_map)

annotations = GFF3Parser(gff3_fn).read()
with open(output_fn, 'w') as fp:
for n, record in enumerate(ReadParser(transcriptome_fn)):
df = annotations.query('seqid == "{0}"'.format(record.name))
desc = generate_sequence_summary(record.name, record.sequence,
seqid_query = 'seqid == "{0}"'.format(record.name)
df = annotations.query(seqid_query)

renamed_query = 'renamed == "{0}"'.format(record.name)
seqid = record.name if name_map is None else name_map.query(renamed_query).original.iloc[0]

desc = generate_sequence_summary(seqid, record.sequence,
df)
fp.write('>{0}\n{1}\n'.format(desc.strip(), record.sequence))

Expand All @@ -280,8 +293,12 @@ def annotate_fasta(transcriptome_fn, gff3_fn, output_fn):
@click.argument('transcriptome_fn')
@click.argument('gff3_fn')
@click.argument('output_fn')
def annotate_fasta_cmd(transcriptome_fn, gff3_fn, output_fn):
@click.option('--name-map',
help='CSV file with mapping of original names to renamed transcripts. '\
'If provided, transcripts will be mapped back to their original names. '\
'Must contain the columns "original" and "renamed".')
def annotate_fasta_cmd(transcriptome_fn, gff3_fn, output_fn, name_map):
'''Annotate a FASTA file from a GFF3 file.
'''

annotate_fasta(transcriptome_fn, gff3_fn, output_fn)
annotate_fasta(transcriptome_fn, gff3_fn, output_fn, name_map=name_map)
26 changes: 24 additions & 2 deletions dammit/components/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,21 @@ def run_group(config,
' will be of the form <name>_<X>.'\
' It should not have spaces, pipes,'\
' ampersands, or other characters'\
' with special meaning to BASH.')
' with special meaning to BASH.'\
' Superseded by --regex-rename.')
@click.option('--regex-rename',
help='Rename transcripts using a regex pattern. The regex should follow'
' Python `re` format and contain a named field keyed'\
' as `name` that extracts the desired string. For example, providing'\
r' "(?P<name>^[a-zA-Z0-9\.]+)" will match from the beginning of the sequence header'\
' up to the first symbol that is not alphanumeric or a period.'\
' Supersedes --base-name.')
@click.option('--rename/--no-rename', default=None,
help='If --no-rename, original transcript names are preserved'\
' in the final annotated FASTA. --base-name is'\
' still used in intermediate files. If --rename (the default'\
' behavior), the renamed transcript names are used in the final'\
' annotated FASTA.')
@click.option('-e', '--global-evalue',
type=float,
help='global e-value cutoff for similarity searches.')
Expand All @@ -165,14 +179,16 @@ def run_group(config,
def annotate_cmd(config,
transcriptome,
base_name,
regex_rename,
rename,
global_evalue,
output_dir,
user_database,
dry_run,
extra_snakemake_args):
''' The main annotation pipeline. Calculates assembly stats;
runs BUSCO; runs LAST against OrthoDB (and optionally uniref90),
HMMER against Pfam, Inferal against Rfam, and Conditional Reciprocal
HMMER against Pfam, Infernal against Rfam, and Conditional Reciprocal
Best-hit Blast against user databases; and aggregates all results in
a properly formatted GFF3 file.'''

Expand All @@ -199,6 +215,12 @@ def annotate_cmd(config,
if base_name:
config.core['basename'] = base_name

if regex_rename:
config.core['regex_rename'] = regex_rename

if rename is not None:
config.core['rename'] = rename

# the default global_evalue is null
if global_evalue:
config.core['global_evalue'] = global_evalue
Expand Down
2 changes: 2 additions & 0 deletions dammit/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ global_evalue: null
pipeline: default

basename: Transcript
rename: true
regex_rename: null

# n_threads: total threads to pass to snakemake -j
n_threads: 0
Expand Down
Loading