import copy
import collections
import os
import re
import tempfile

import pysam
import pandas as pd

import cgatcore.experiment as E
import cgatcore.iotools as IOTools
import cgatcore.pipeline as P
from cgatcore.pipeline import cluster_runnable
from cgat import GTF
from cgat import Bed


@cluster_runnable
def getMappedUnmappedReads(infile, outfile):

    # Determine the extension of the file (BAM or SAM), done for SAM/BAM compatibility.
    # At the moment all calls should only pass SAM; if that stops working just use
    # samfile = pysam.AlignmentFile(infile, "r")
    _, file_extension = os.path.splitext(infile)

    if file_extension.lower() == '.sam':
        samfile = pysam.AlignmentFile(infile, "r")
    elif file_extension.lower() == '.bam':
        samfile = pysam.AlignmentFile(infile, "rb")

    # Create the names of the files for the unmapped and mapped read names
    out_unmapped_basename = P.snip(os.path.basename(outfile), '.tsv') + '.unmapped.txt.gz'
    out_unmapped = os.path.join(os.path.dirname(outfile), out_unmapped_basename)

    out_mapped_basename = P.snip(os.path.basename(outfile), '.tsv') + '.mapped.txt.gz'
    out_mapped = os.path.join(os.path.dirname(outfile), out_mapped_basename)

    # Unmapped read names
    unmapped = IOTools.open_file(out_unmapped, "w")

    # Mapped read names
    mapped = IOTools.open_file(out_mapped, "w")

    dictionary = {'read_name': 'first', 'mapped': 'no'}

    c = collections.Counter()

    # Go through the reads
    for read in samfile:

        # Get the read name
        read_name = read.query_name

        # First case, assign the read name to the dictionary
        if dictionary['read_name'] == 'first':
            dictionary['read_name'] = read_name

            # Add one read to the total
            c["total"] += 1

        # If a fragment from the read has been seen before
        if dictionary['read_name'] == read_name:

            # If that fragment was mapped
            if dictionary['mapped'] == 'yes':
                # Skip to the next data line
                continue

            # If none of the prior fragments from the read have been mapped
            else:
                # If it is mapped, change the state to mapped
                # (the default beginning state is unmapped)
                if not read.is_unmapped:
                    # Mark the read as mapped
                    dictionary['mapped'] = 'yes'

        # If a fragment from the same read hasn't been seen before (new read)
        else:

            # If the last read wasn't mapped, store it
            if dictionary['mapped'] == 'no':
                unmapped.write(dictionary['read_name'] + "\n")

                # Add one read to the unmapped
                c["unmapped"] += 1

            else:
                mapped.write(dictionary['read_name'] + "\n")

                # Add one read to the mapped
                c["mapped"] += 1

            # Add one read to the total
            c["total"] += 1

            # Create a new dictionary with unmapped state and the current read name
            dictionary = {'read_name': read_name, 'mapped': 'no'}

            # If it is mapped, change the state to mapped
            # (the default beginning state is unmapped)
            if not read.is_unmapped:
                # Mark the read as mapped
                dictionary['mapped'] = 'yes'

    # The last read is still pending
    # If it is unmapped, store it
    if dictionary['mapped'] == 'no':
        unmapped.write(dictionary['read_name'] + "\n")

        # Add one read to the unmapped
        c["unmapped"] += 1

    else:
        mapped.write(dictionary['read_name'] + "\n")

        # Add one read to the mapped
        c["mapped"] += 1

    unmapped.close()
    mapped.close()

    # Store the counters
    with IOTools.open_file(outfile, "w") as output_file_write:
        for counter in c:
            output_file_write.write(counter + ':\t' + str(c[counter]) + '\n')
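# Hedged usage sketch (hypothetical paths, not part of the pipeline): besides the
# counts written to the .tsv outfile, two sibling files holding the mapped and
# unmapped read names are derived from the outfile name, as shown below.
def _example_getMappedUnmappedReads():
    infile = "sample1.bam"                  # hypothetical read name sorted BAM/SAM
    outfile = "mapping_stats/sample1.tsv"   # hypothetical counts outfile
    getMappedUnmappedReads(infile, outfile)
    # Also produces:
    #   mapping_stats/sample1.unmapped.txt.gz  (one unmapped read name per line)
    #   mapping_stats/sample1.mapped.txt.gz    (one mapped read name per line)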
# ------------------------------------------------------------------------------
# Looks into sample_table for the sample specified by sample_name
# (the comparison is case insensitive).
# If the sample is found, returns the shift specified for that sample.
# If the shift field is an empty string, returns "0".
# If the sample is not found, or the shift is neither empty nor an integer,
# raises an exception.
# The shift is defined as the constant 5' trimming previously performed
# on all reads within a read pair.
# For example, if 2bp have been trimmed from the 5' end in QC,
# five_prime_quality_trimming=2.
# Negative numbers are also allowed if the reads have been extended.
#
# Inputs:
# -sample_name: The name of the sample.
# -sample_table: Tab-separated table (without a header) with the fields:
#
#    Sample_name  condition(control/treatment)  five_prime_quality_trimming(number of base pairs trimmed from both reads' 5' end in a pair)
#    nd2345       control                       2
#
# Outputs:
# -returns the shift specified for that sample, or raises an exception
def getSampleQCShift(sample_name, sample_table):

    shift = ""

    # To make sure the sample is found
    found = False

    with open(sample_table, 'r') as reader:
        for line in reader:

            # Remove the newline character from the line
            # (avoids taking it into account downstream)
            line = line.rstrip('\n')

            fields = line.split("\t")

            # If the name of the sample is equal to the name specified in the file
            if fields[0].lower() == sample_name.lower():

                shift = fields[2]

                # If it is not empty, test whether it is an integer
                if shift != "":
                    try:
                        int(shift)
                    except ValueError:
                        raise Exception("The sample specified: " + sample_name +
                                        " contains a non empty, non numeric shift")

                # If it is empty, return 0
                else:
                    shift = "0"

                found = True
                break

    if not found:
        raise Exception("The sample specified: " + sample_name +
                        " has not been found")
    else:
        return shift
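# Hedged usage sketch (hypothetical sample names and trimming values): the sample
# table is a headerless, tab-separated file and the returned shift is a string.
def _example_getSampleQCShift():
    with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False) as tmp:
        tmp.write("nd2345\tcontrol\t2\n")
        tmp.write("nd2346\ttreatment\t\n")
        sample_table = tmp.name
    assert getSampleQCShift("ND2345", sample_table) == "2"   # case insensitive lookup
    assert getSampleQCShift("nd2346", sample_table) == "0"   # empty shift -> "0"
    os.unlink(sample_table)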
# ------------------------------------------------------------------------------
# Inspired by pipelineCaptCPerl.filterReadMappings.
# Assumes a SAM/BAM file resulting from read pair mapping which contains
# ONLY UNIQUE mappings per read in each pair and is SORTED BY READNAME,
# from any mapper output.
# Because R1 and R2 of a pair can only have one alignment each, there are at
# most 2 alignments; both alignments must agree on whether the pair is a
# proper pair or not, and likewise on PCR duplication. Therefore this
# information can be extracted after both reads have been seen.
# Obtains statistics from the reads and read pairs.
# For individual reads:
# -Unmapped reads.
# -Reads which are supplementary alignments (subread parts mapping to
#  multiple places).
# -Exception if any non-primary alignments are found.
# For read pairs:
# -Read is not paired (will count 1 read per supposed read pair: not found).
# -Read pair is not mapped in a proper pair.
# -PCR duplicate.
# -Read pairs which are all of the below:
#   -Read pair mapped in proper pair.
#   -Doesn't contain supplementary alignments.
#   -Doesn't contain secondary alignments.
#   -Not a PCR duplicate.
#   -Contains R1 and R2.
# -Read pairs which are all of the below:
#   -Read pair mapped in proper pair.
#   -Contains supplementary alignments (maybe translocations, inversions, etc.).
#   -Doesn't contain secondary alignments.
#   -Not a PCR duplicate.
#   -Contains R1 and R2.
#
# Inputs:
# -infile: SAM/BAM file containing only unique mappings per read in each pair,
#  sorted by readname, from any mapper output.
# -stats_outfile: Outfile with the statistics.
#
# Output:
# -writes printable output with the stats to stats_outfile.
#
# Exception:
# -If a multimapping alignment (a read alignment which is not primary) is found.
@cluster_runnable
def getUniquelyMappedPairsNoMultimapping(infile, stats_outfile):

    c = collections.Counter()

    # Determine the extension of the file (BAM or SAM), done for SAM/BAM compatibility.
    # At the moment all calls should only pass SAM; if that stops working just use
    # samfile = pysam.AlignmentFile(infile, "r")
    _, file_extension = os.path.splitext(infile)

    if file_extension.lower() == '.sam':
        samfile = pysam.AlignmentFile(infile, "r")
    elif file_extension.lower() == '.bam':
        samfile = pysam.AlignmentFile(infile, "rb")

    # We need to know when the read name changes
    last_read_name = "first"

    # We need to know the last read when the read name changes, to assign
    # the read_pair details from the last read once we have seen all the
    # alignments of the read pair
    last_read = ""

    # Create a new dictionary template to store the details
    read_pair_dict_template = {
        'read_pair_is_paired': "",              # Read pair contains R1 and R2
        'read_pair_mapped_in_proper_pair': "",  # Read pair is a proper mapped pair
        'read_pair_is_pcr_duplicate': "",       # Read pair is a PCR duplicate
        'read_number_unmapped': 0,              # Number of R1 and R2 unmapped
        'read_number_supplementary': 0,         # Number of R1 and R2 supplementary (eg. translocations)
    }

    # Create a new copy of the template dictionary to store the details
    read_pair_dict = copy.deepcopy(read_pair_dict_template)

    # Process the reads
    for read in samfile:

        # If it is the first read, assign the name (it will be treated
        # as if the read name hasn't changed)
        if last_read_name == "first":
            last_read_name = read.query_name

        # The read name has changed (except for the first read in the file)
        if last_read_name != read.query_name:

            # We have seen all the reads from a pair, fill in the details
            # of the pair

            # The reads contain a pair (R1 and R2)
            read_pair_dict['read_pair_is_paired'] = last_read.is_paired

            # Read pair is a proper mapped pair
            read_pair_dict['read_pair_mapped_in_proper_pair'] = last_read.is_proper_pair

            # Read pair is a PCR duplicate
            read_pair_dict['read_pair_is_pcr_duplicate'] = last_read.is_duplicate

            # Store everything, updating the counters
            c = getUniquelyMappedPairsUpdateCounters(c, read_pair_dict)

            # Create a new copy of the template dictionary to store the details
            read_pair_dict = copy.deepcopy(read_pair_dict_template)

        # Fill the dictionary with the details for the individual reads
        if read.is_unmapped:
            read_pair_dict['read_number_unmapped'] += 1

        if read.is_supplementary:
            read_pair_dict['read_number_supplementary'] += 1

        # If the read alignment is secondary it means multimapping is enabled,
        # exit with an exception
        if read.is_secondary:
            raise Exception("Secondary read found: " + read.query_name +
                            " Multimapping was enabled")

        # Last command in the loop: update last_read with the read
        # currently viewed
        last_read = read
        last_read_name = read.query_name

    # Process the last read pair
    # We have seen all the reads from a pair, fill in the details of the pair

    # The reads contain a pair (R1 and R2)
    read_pair_dict['read_pair_is_paired'] = last_read.is_paired

    # Read pair is a proper mapped pair
    read_pair_dict['read_pair_mapped_in_proper_pair'] = last_read.is_proper_pair

    # Read pair is a PCR duplicate
    read_pair_dict['read_pair_is_pcr_duplicate'] = last_read.is_duplicate

    # Store everything, updating the counters
    c = getUniquelyMappedPairsUpdateCounters(c, read_pair_dict)

    createOutputStringFromCounters(c, stats_outfile)
# ------------------------------------------------------------------------------
# Updates the counters of reads following the guidelines from
# getUniquelyMappedPairsNoMultimapping.
# Inputs:
# -counter: Counter with the different keys and the counts
# -dictionary: Dictionary containing the counts for the different keys
#  of the last read pair
#
# Outputs:
# -updated_counter: Counter after adding the different keys and corresponding counts
def getUniquelyMappedPairsUpdateCounters(counter, dictionary):

    # Get proper mapped pairs
    if dictionary["read_pair_is_paired"] and dictionary["read_pair_mapped_in_proper_pair"]:

        counter["total_proper_mapped_pairs"] += 1

        # For supplementary alignments, for now all we want to know is whether
        # a read pair has them or not
        if dictionary['read_number_supplementary'] != 0:
            counter["proper_mapped_pairs_with_supp_alignment"] += 1

        if dictionary["read_pair_is_pcr_duplicate"]:
            counter["proper_mapped_pairs_pcr_duplicate"] += 1

        # -Read pairs which are all of the below (group1):
        #   -Read pair mapped in proper pair
        #   -Doesn't contain supplementary alignments
        #   -Doesn't contain secondary alignments
        #   -Not a PCR duplicate
        #   -Contains R1 and R2
        if ((dictionary['read_number_supplementary'] == 0) and
                not dictionary["read_pair_is_pcr_duplicate"]):
            counter["proper_mapped_pairs_with_no_supp_or_pcr_duplicate"] += 1

        # -Read pairs which are all of the below (group2):
        #   -Read pair mapped in proper pair
        #   -Contains supplementary alignments (maybe translocations, inversions, etc.)
        #   -Doesn't contain secondary alignments
        #   -Not a PCR duplicate
        #   -Contains R1 and R2
        elif ((dictionary['read_number_supplementary'] != 0) and
                not dictionary["read_pair_is_pcr_duplicate"]):
            counter["proper_mapped_pairs_with_supp_or_pcr_duplicate"] += 1

    return counter


# ------------------------------------------------------------------------------
# Writes the counters to stats_outfile.
# Inputs:
# -counters: Counters with the different keys and counts
# -stats_outfile: File with the output of the stats
def createOutputStringFromCounters(counters, stats_outfile):
    with open(stats_outfile, 'w') as writer:
        for key in counters:
            writer.write(key + ":\t" + str(counters[key]) + "\n")
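# Hedged sketch (hypothetical counter values) of the stats file layout written by
# createOutputStringFromCounters: one "key:<tab>count" line per counter key.
def _example_createOutputStringFromCounters():
    counters = collections.Counter({"total_proper_mapped_pairs": 100,
                                    "proper_mapped_pairs_pcr_duplicate": 7})
    fd, stats_outfile = tempfile.mkstemp(suffix=".tsv")
    os.close(fd)
    createOutputStringFromCounters(counters, stats_outfile)
    # stats_outfile now contains:
    #   total_proper_mapped_pairs:	100
    #   proper_mapped_pairs_pcr_duplicate:	7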
# ------------------------------------------------------------------------------
# Updates the counters of reads following the guidelines from
# getCorrectReadPairs.
# Inputs:
# -counter: Counter with the different keys and the counts
# -dictionary: Dictionary containing the counts for the different keys
#  of the last read pair
#
# Outputs:
# -updated_counter: Counter after adding the different keys and corresponding counts
def getCorrectReadPairsUpdateCounters(counter, dictionary):

    # Variable to check that the pair doesn't fall into any "incorrect" category
    correct_pair = True

    if (dictionary["read_pair_mapped_in_proper_pair"] and
            dictionary["read_pair_is_paired"]):

        counter["read_pairs_mapped_in_proper_pair"] += 1

        if dictionary["read_pair_is_pcr_duplicate"]:
            counter["proper_read_pair_is_pcr_duplicate"] += 1
            correct_pair = False

        if dictionary['proper_pair_additional_hit_equally_good']:
            counter["proper_read_pair_additional_hit_equally_good"] += 1
            correct_pair = False

        if dictionary['proper_pair_with_chimeric_read']:
            counter["proper_read_pair_with_chimeric_read"] += 1
            correct_pair = False

        # After performing all the checks, see which are "correct" pairs
        if correct_pair:
            counter["proper_read_pair_no_dup_no_additional_hit_equally_good_no_chimeric"] += 1

    # Not proper pairs
    else:
        counter["read_pairs_mapped_not_in_proper_pair"] += 1

    return counter


# ------------------------------------------------------------------------------
# Inspired by pipelineCaptCPerl.filterReadMappings.
# Assumes a SAM/BAM file resulting from read pair mapping with BWA mem -M,
# containing unique or multiple mappings (1 pair - multiple alignments)
# per read in each pair and SORTED BY READNAME.
# Obtains the number of correctly mapped read pairs and the number of those
# correctly mapped read pairs which are marked as PCR duplicates.
# Note: a read pair only needs one proper pair mapping among its alignments
# to be considered a correctly mapped read pair.
# As long as one correctly mapped read pair alignment is classified as a PCR
# duplicate, the read pair will be considered a PCR duplicate.
#
# Inputs:
# -infile: SAM/BAM file sorted by readname from BWA mem -M.
# -stats_outfile: Outfile with the statistics.
#
# Output:
# -writes printable output with the stats to stats_outfile.
@cluster_runnable
def getCorrectReadPairs(infile, stats_outfile):

    c = collections.Counter()

    # Determine the extension of the file (BAM or SAM), done for SAM/BAM compatibility.
    # At the moment all calls should only pass SAM; if that stops working just use
    # samfile = pysam.AlignmentFile(infile, "r")
    filename, file_extension = os.path.splitext(infile)

    if file_extension.lower() == '.sam':
        samfile = pysam.AlignmentFile(infile, "r")
    elif file_extension.lower() == '.bam':
        samfile = pysam.AlignmentFile(infile, "rb")

    # We need to know when the read name changes
    last_read_name = "first"

    # We need to know the last read when the read name changes, to assign
    # the read_pair details from the last read once we have seen all the
    # alignments of the read pair
    last_read = ""

    # Create a new dictionary template to store the details
    read_pair_dict_template = {
        'read_pair_is_paired': False,                      # Read pair contains R1 and R2
        'read_pair_mapped_in_proper_pair': False,          # Read pair is a proper mapped pair
        'read_pair_is_pcr_duplicate': False,               # Read pair is a PCR duplicate
        'proper_pair_additional_hit_equally_good': False,  # Additional hits with an equally good score found
        'proper_pair_with_chimeric_read': False,           # Chimeric read
    }

    # Create a new copy of the template dictionary to store the details
    read_pair_dict = copy.deepcopy(read_pair_dict_template)

    # Process the reads
    for read in samfile:

        # If it is the first read, assign the name (it will be treated
        # as if the read name hasn't changed)
        if last_read_name == "first":
            last_read_name = read.query_name

        # The read name has changed (except for the first read in the file)
        if last_read_name != read.query_name:

            # We have seen all the reads from a pair, update the counters

            # Store everything, updating the counters
            c = getCorrectReadPairsUpdateCounters(c, read_pair_dict)

            # Create a new copy of the template dictionary to store the details
            read_pair_dict = copy.deepcopy(read_pair_dict_template)

        # Checks for the read
        # For every read pair we check whether it is a proper pair.
        # We only need one mapping from the read pair as a proper pair to
        # consider it a proper pair towards the output
        if read.is_paired and read.is_proper_pair:

            read_pair_dict['read_pair_is_paired'] = read.is_paired
            read_pair_dict['read_pair_mapped_in_proper_pair'] = read.is_proper_pair

            # We only need one proper pair alignment marked as duplicate for a
            # read pair to consider it a duplicate.
            if read.is_duplicate:
                read_pair_dict['read_pair_is_pcr_duplicate'] = read.is_duplicate

            # Check additional hits on the proper pair
            if read.has_tag("XA"):

                # If there are additional hits
                if len(read.get_tag("XA").split(",")) > 1:

                    # If the score of the additional hit (XS) is as good as or
                    # better than the score of the original hit (AS)
                    if read.get_tag("AS") <= read.get_tag("XS"):
                        read_pair_dict['proper_pair_additional_hit_equally_good'] = True

            # Check chimeric hits (split reads)
            if read.has_tag("SA"):

                # If there are chimeric hits
                if len(read.get_tag("SA").split(",")) > 1:
                    read_pair_dict['proper_pair_with_chimeric_read'] = True

        # Last command in the loop: update last_read with the read
        # currently viewed
        last_read = read
        last_read_name = read.query_name

    # Process the last read pair
    # We have seen all the reads from a pair, update the counters

    # Store everything, updating the counters
    c = getCorrectReadPairsUpdateCounters(c, read_pair_dict)

    createOutputStringFromCounters(c, stats_outfile)
# ------------------------------------------------------------------------------
# Creates a statement to exclude from the infile the chrs indicated in
# excluded_chrs and save the processed file to outfile.
#
# Inputs:
# -infile: peaks infile (compressed or uncompressed) to process.
# -excluded_chrs: chrs to exclude, given as an egrep alternation pattern
#  (eg. separated by "|").
# -outfile: processed file without the excluded chr regions.
#
# Outputs:
# -Returns a statement which, when run, writes the processed file to outfile.
def createExcludingChrFromBedStatement(infile, excluded_chrs, outfile):

    statement = ""

    # If no excluded chrs are provided, just copy the file
    if excluded_chrs == "":
        statement = "cp " + infile + " " + outfile

    else:
        # See the extension of the file to determine the cat/zcat command
        _, file_extension = os.path.splitext(infile)

        if file_extension == ".gz":
            statement = "zcat "
        else:
            statement = "cat "

        # Create the statement
        statement += infile
        statement += " | egrep -v '("
        statement += excluded_chrs
        statement += ")' | gzip -c > "
        statement += outfile

    return statement


# ------------------------------------------------------------------------------
# Given a contig_name, gets the corresponding length of the contig by looking
# up contig_name in contig_file. If it isn't found, returns -1.
#
# contig_file format:
# chr1	2500000
# chr2	170000
#
# Inputs:
# -contig_name: Name of the contig
# -contig_file: Name of the contig file
#
# Outputs:
# -contig_length: The length of the contig. If it isn't found, returns -1.
def getContigLength(contig_name, contig_file):

    contig_length = -1

    with open(contig_file, 'r') as reader:
        for line in reader:

            # Remove the newline character from the line
            # (avoids taking it into account downstream)
            line = line.rstrip('\n')

            fields = line.split("\t")

            if fields[0] == contig_name:
                # Get the contig length
                contig_length = fields[1]
                break

    return contig_length
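# Hedged sketch (hypothetical file names) of the shell statement built by
# createExcludingChrFromBedStatement for a gzipped peaks file, excluding
# mitochondrial and unplaced contigs via an egrep alternation.
def _example_createExcludingChrFromBedStatement():
    statement = createExcludingChrFromBedStatement(
        "peaks.bed.gz", "chrM|chrUn", "peaks_filtered.bed.gz")
    # statement is now:
    #   zcat peaks.bed.gz | egrep -v '(chrM|chrUn)' | gzip -c > peaks_filtered.bed.gz
    return statement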
# ------------------------------------------------------------------------------
# Performs a lot faster if the infile has been previously sorted by chromosome.
# First checks that the bed segments don't surpass the start and end of the chr:
# -Regions whose end is before the beginning of the chr are considered empty and disregarded.
# -Regions whose start is after the end of the chr are considered empty and disregarded.
# After those filters, starts which are before the beginning of the chr are set to 0
# and ends which are past the end of the chr are set to the chromosome length.
# Any region whose start and end positions mean the region is empty is disregarded.
#
# Inputs:
# -infile: bed file with strand information after being processed with bedtools slop.
#  In theory any bed file should work since all fields are output; this includes bedgraph files.
#  Tested formats:
#   -bed:      chr1 0 100
#   -bedgraph: chr1 0 100 1.2345
#   -bed6 (with strand): chr1 10213 10256 HSQ-700220:198:C5WA4ACXX:5:1304:17892:7253 7 +
# -contigs: file with contig \t length pairs for the infile
# -outfile: processed file eliminating the problems described above
# -log_file: any deleted or modified regions are logged here
#
# Outputs:
# -Writes the processed contents to the outfile.
# -Raises an exception if a contig is not found in the list of contigs.
@cluster_runnable
def correctSlopChromosomeEdges(infile, contigs, outfile, log_file):

    with IOTools.open_file(outfile, "w") as writer, \
            IOTools.open_file(log_file, "w") as log:

        last_contig = "first"
        contig_length = -1

        # Go through each region
        for bed_region in Bed.iterator(IOTools.open_file(infile, "r")):

            # Create an output bed copying the details
            output_bed = bed_region.copy()

            if bed_region.contig != last_contig:
                # Get the contig length
                contig_length = int(getContigLength(bed_region.contig, contigs))

                last_contig = bed_region.contig

                # If the contig length is not found in the contigs file raise an exception
                if contig_length == -1:
                    raise Exception("Correcting positions, contig " + bed_region.contig +
                                    " not found in the list of contigs: " + contigs)

            # Start >= length of chromosome
            if bed_region.start >= contig_length:
                log.write("Correcting positions, file: " + infile + "." +
                          " Eliminating bed region: " + str(bed_region) +
                          " start >= contig length " + str(contig_length) + "\n")

                # Skip outputting this bed
                continue

            # End <= 0
            if bed_region.end <= 0:
                log.write("Correcting positions, file: " + infile + "." +
                          " Eliminating bed region: " + str(bed_region) +
                          " end <= 0\n")

                # Skip outputting this bed
                continue

            # End surpassing the contig length, correct it
            if bed_region.end > contig_length:
                # The last allowable position is the end of the chromosome
                # (the end coordinate is not included in the region)
                output_bed.end = contig_length

                log.write("Correcting positions, file: " + infile + "." +
                          " End surpassing length of chromosome. " +
                          "Before: " + str(bed_region) +
                          " Now: " + str(output_bed) + "\n")

            # Start before the beginning of the contig, correct it
            if bed_region.start < 0:
                # The first allowable position is the beginning of the chromosome
                output_bed.start = 0

                log.write("Correcting positions, file: " + infile + "." +
                          " Beginning surpassing beginning of chromosome. " +
                          "Before: " + str(bed_region) +
                          " Now: " + str(output_bed) + "\n")

            # At this point no position in the region can exceed the chromosome edges;
            # now verify that the region is not empty (start < end)
            if output_bed.start >= output_bed.end:
                # Log the reason to disregard the region
                log.write("Correcting positions, file: " + infile + "." +
                          " Start equal to or larger than end (empty). " +
                          "Start: " + str(output_bed.start) +
                          " End: " + str(output_bed.end) + "\n")

            # If it gets here, the bed region can be output
            else:
                writer.write(str(output_bed))
                writer.write("\n")
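# Hedged sketch (hypothetical contig and region coordinates) of how
# correctSlopChromosomeEdges treats regions after bedtools slop: with a contig
# "chr1" of length 1000, a region 900-1100 is clipped to 900-1000, a region
# -50-20 is clipped to 0-20, and a region 1200-1300 is dropped and logged.
def _example_correctSlopChromosomeEdges():
    contigs = tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False)
    contigs.write("chr1\t1000\n")
    contigs.close()

    infile = tempfile.NamedTemporaryFile("w", suffix=".bed", delete=False)
    infile.write("chr1\t900\t1100\nchr1\t-50\t20\nchr1\t1200\t1300\n")
    infile.close()

    correctSlopChromosomeEdges(infile.name, contigs.name,
                               "corrected.bed.gz", "corrected.log")
    # corrected.bed.gz should contain only:
    #   chr1	900	1000
    #   chr1	0	20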
# ------------------------------------------------------------------------------
# Creates a statement to exclude from the peaks infile the bed regions
# indicated in excluded_beds and save the processed file to outfile.
#
# Inputs:
# -infile: bed infile (compressed or uncompressed) to process.
# -excluded_beds: bed files with the regions to exclude, either a
#  comma-separated string or a list of paths.
# -outfile: processed file without the excluded regions.
#
# Outputs:
# -Returns a statement which, when run, writes the processed file to outfile.
@cluster_runnable
def createExcludingBedsFromBedStatement(infile, excluded_beds, outfile):

    statement = ""

    # Get the string into a list, separating the elements
    # (empty elements are dropped so that an empty string means "no exclusions")
    if isinstance(excluded_beds, str):
        list_excluded_beds = [bed for bed in excluded_beds.split(",") if bed != ""]
    else:
        list_excluded_beds = excluded_beds

    # If no excluded beds are provided, just copy the file
    if len(list_excluded_beds) == 0:
        statement = "cp " + infile + " " + outfile

    else:
        # Create the statement
        # Because inputting a compressed file to bedtools intersect can
        # sometimes cause problems (https://github.com/arq5x/bedtools2/issues/513)
        # the file is zcat first and piped to bedtools intersect -a stdin
        if infile.endswith(".gz"):
            statement += "zcat " + infile
        else:
            statement += "cat " + infile

        statement += " | bedtools intersect -v -a stdin"
        statement += " -b "

        for excluded_bed in list_excluded_beds:
            statement += excluded_bed
            statement += " "

        statement += " | gzip -c > "
        statement += outfile

    return statement
def gene_to_transcript_map(infile, outfile):
    '''Parses the GTF infile and extracts the mapping of transcripts to genes,
    outputting it as a tsv file.'''

    iterator = GTF.iterator(IOTools.open_file(infile))

    transcript2gene_dict = {}

    for entry in iterator:

        # Check that the same transcript_id is not mapped to multiple gene_ids!
        if entry.transcript_id in transcript2gene_dict:
            if not entry.gene_id == transcript2gene_dict[entry.transcript_id]:
                raise ValueError('''multiple gene_ids associated with
                the same transcript_id %s %s''' % (
                    entry.gene_id,
                    transcript2gene_dict[entry.transcript_id]))
        else:
            transcript2gene_dict[entry.transcript_id] = entry.gene_id

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("transcript_id\tgene_id\n")
        for key, value in sorted(transcript2gene_dict.items()):
            outf.write("%s\t%s\n" % (key, value))


@cluster_runnable
def merge_counts(infiles, outfiles):
    '''Takes the salmon outputs from each sample and produces a matrix of counts.'''

    transcript_infiles = [x[0] for x in infiles]
    gene_infiles = [x[1] for x in infiles]

    transcript_outfile, gene_outfile = outfiles

    def mergeinfiles(infiles, outfile):
        final_df = pd.DataFrame()

        for infile in infiles:
            tmp_df = pd.read_table(infile, sep="\t", index_col=0)
            final_df = final_df.merge(
                tmp_df, how="outer", left_index=True, right_index=True)

        final_df = final_df.round()
        final_df.sort_index(inplace=True)
        final_df.columns = [re.sub("_RNA", "", x) for x in final_df.columns]
        final_df.to_csv(outfile, sep="\t", compression="gzip")

    mergeinfiles(transcript_infiles, transcript_outfile)
    mergeinfiles(gene_infiles, gene_outfile)
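# Hedged sketch (hypothetical sample counts) of the outer join performed inside
# merge_counts: transcripts missing from a sample appear as NaN in that sample's
# column, counts are rounded, and the "_RNA" suffix is stripped from the column
# (sample) names.
def _example_merge_counts_join():
    a = pd.DataFrame({"sample1_RNA": [10.2, 3.7]}, index=["ENST01", "ENST02"])
    b = pd.DataFrame({"sample2_RNA": [5.1]}, index=["ENST02"])
    merged = a.merge(b, how="outer", left_index=True, right_index=True).round()
    merged.columns = [re.sub("_RNA", "", x) for x in merged.columns]
    # merged:
    #          sample1  sample2
    # ENST01      10.0      NaN
    # ENST02       4.0      5.0
    return merged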
@cluster_runnable
def merge_atac_and_rna_de(atac_file, rna_file, outfile, logfile):
    '''Takes a table of differentially accessible peaks (that have been
    annotated with genes) and a table of differentially expressed genes,
    and combines the two.'''

    logf = IOTools.open_file(logfile, "w")

    all_ATAC_RNA_interactions_table = pd.read_table(atac_file, header=0, delimiter="\t")

    all_ATAC_RNA_interactions_table_nas = all_ATAC_RNA_interactions_table[
        all_ATAC_RNA_interactions_table.isnull().any(axis=1)]

    if all_ATAC_RNA_interactions_table_nas.shape[0]:
        logf.write("ATAC RNA interactions rows containing NA\n")

    # Log the dimensions of the read table to make sure the reading is correct
    logf.write("all_ATAC_RNA_interactions_table read: " +
               str(all_ATAC_RNA_interactions_table.shape) + "\n")

    DE_RNA_table = pd.read_table(rna_file, header=0, delimiter="\t")

    DE_RNA_table_nas = DE_RNA_table[DE_RNA_table.isnull().any(axis=1)]

    if DE_RNA_table_nas.shape[0]:
        logf.write("DE RNA rows containing NA\n")

    # Log the dimensions of the read table to make sure the reading is correct
    logf.write("DE_RNA_table read: " + str(DE_RNA_table.shape) + "\n")

    # Rename columns with ATAC and RNA prefixes
    ATAC_RNA_cols = all_ATAC_RNA_interactions_table.columns.tolist()

    logf.write("Original ATAC_RNA table columns\n")
    logf.write("\t".join(ATAC_RNA_cols) + "\n")

    for i in range(len(ATAC_RNA_cols)):
        if (i > 1) and (i < 9):
            ATAC_RNA_cols[i] = "ATAC_" + ATAC_RNA_cols[i]

    all_ATAC_RNA_interactions_table.columns = ATAC_RNA_cols

    logf.write("Renamed columns\n")
    logf.write("\t".join(ATAC_RNA_cols) + "\n")

    DE_RNA_cols = DE_RNA_table.columns.tolist()

    logf.write("Original DE_RNA table columns\n")
    logf.write("\t".join(DE_RNA_cols) + "\n")

    for i in range(len(DE_RNA_cols)):
        if (i > 0) and (i < 8):
            DE_RNA_cols[i] = "RNA_" + DE_RNA_cols[i]

    DE_RNA_table.columns = DE_RNA_cols

    logf.write("Renamed columns\n")
    logf.write("\t".join(DE_RNA_cols) + "\n")

    # Merge in the RNA-seq details
    merged_df = pd.merge(all_ATAC_RNA_interactions_table,
                         DE_RNA_table,
                         left_on=["gene_id"],
                         right_on=["id"],
                         how="inner")

    merged_df_nas = merged_df[merged_df.isnull().any(axis=1)]

    if merged_df_nas.shape[0]:
        logf.write("merged_df rows containing NA\n")

    # Log the dimensions of the merged table to make sure the merge is correct
    logf.write("merged_df read: " + str(merged_df.shape) + "\n")

    # Drop the duplicated "id" column
    merged_df.drop(["id"], axis=1, inplace=True)

    # Output the table (NA -> "")
    merged_df.to_csv(outfile,
                     sep="\t",
                     header=True,
                     compression="gzip",
                     mode="w",
                     index_label=None,
                     index=False,
                     na_rep="",
                     line_terminator="\n")

    logf.close()
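# Hedged sketch (hypothetical column names) of the positional column renaming in
# merge_atac_and_rna_de: ATAC columns at positions 2-8 get an "ATAC_" prefix and
# RNA columns at positions 1-7 get an "RNA_" prefix, so the two tables can be
# merged without column name clashes.
def _example_prefix_columns():
    atac_cols = ["peak_id", "gene_id", "logFC", "FDR"]
    for i in range(len(atac_cols)):
        if (i > 1) and (i < 9):
            atac_cols[i] = "ATAC_" + atac_cols[i]
    # atac_cols is now ["peak_id", "gene_id", "ATAC_logFC", "ATAC_FDR"]
    return atac_cols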