diff --git a/hicexplorer/chicAggregateStatistic.py b/hicexplorer/chicAggregateStatistic.py index 674cfadf..0bce42bc 100644 --- a/hicexplorer/chicAggregateStatistic.py +++ b/hicexplorer/chicAggregateStatistic.py @@ -63,13 +63,13 @@ def parse_arguments(args=None): return parser -def filter_scores_target_list(pScoresDictionary, pTargetList=None, pTargetIntervalTree=None, pTargetFile=None): +def filter_scores_target_list(pScoresDictionary, pTargetFType, pTargetPosDict, pTargetList=None, pTargetIntervalTree=None, pTargetFile=None): accepted_scores = {} same_target_dict = {} target_regions_intervaltree = None - if pTargetList is not None: - + # newly added + if pTargetFType == 'hdf5': # read hdf content for this specific combination targetFileHDF5Object = h5py.File(pTargetFile, 'r') target_object = targetFileHDF5Object['/'.join(pTargetList)] @@ -77,6 +77,17 @@ def filter_scores_target_list(pScoresDictionary, pTargetList=None, pTargetInterv start_list = list(target_object['start_list'][:]) end_list = list(target_object['end_list'][:]) targetFileHDF5Object.close() + elif pTargetFType == 'bed4': + chromosome = pTargetPosDict[pTargetList[-1]]['chromosome'] + start_list = pTargetPosDict[pTargetList[-1]]['start_list'] + end_list = pTargetPosDict[pTargetList[-1]]['end_list'] + elif pTargetFType == 'bed3': + target_regions_intervaltree = pTargetIntervalTree + else: + log.error('No target list given.') + raise Exception('No target list given.') + + if pTargetList is not None: chromosome = [chromosome] * len(start_list) target_regions = list(zip(chromosome, start_list, end_list)) @@ -85,12 +96,6 @@ def filter_scores_target_list(pScoresDictionary, pTargetList=None, pTargetInterv hicmatrix = hm.hiCMatrix() target_regions_intervaltree = hicmatrix.intervalListToIntervalTree(target_regions)[0] - elif pTargetIntervalTree is not None: - target_regions_intervaltree = pTargetIntervalTree - - else: - log.error('No target list given.') - raise Exception('No target list given.') for key in pScoresDictionary: chromosome = pScoresDictionary[key][0] @@ -193,12 +198,12 @@ def writeAggregateHDF(pOutFileName, pOutfileNamesList, pAcceptedScoresList, pArg aggregateFileH5Object.close() -def run_target_list_compilation(pInteractionFilesList, pTargetList, pArgs, pViewpointObj, pQueue=None, pOneTarget=False): +def run_target_list_compilation(pInteractionFilesList, pTargetList, pTargetFType, pTargetPosDict, pArgs, pViewpointObj, pQueue=None, pOneTarget=False): outfile_names_list = [] accepted_scores_list = [] target_regions_intervaltree = None try: - if pOneTarget == True: + if pTargetFType == 'bed3': try: target_regions = utilities.readBed(pTargetList) except Exception as exp: @@ -211,14 +216,13 @@ def run_target_list_compilation(pInteractionFilesList, pTargetList, pArgs, pView outfile_names_list_intern = [] accepted_scores_list_intern = [] for sample in interactionFile: - interaction_data, interaction_file_data, _ = pViewpointObj.readInteractionFile(pArgs.interactionFile, sample) if pOneTarget == True: target_file = None else: target_file = pTargetList[i] - accepted_scores = filter_scores_target_list(interaction_file_data, pTargetList=target_file, pTargetIntervalTree=target_regions_intervaltree, pTargetFile=pArgs.targetFile) + accepted_scores = filter_scores_target_list(interaction_file_data, pTargetFType, pTargetPosDict, pTargetList=target_file, pTargetIntervalTree=target_regions_intervaltree, pTargetFile=pArgs.targetFile) outfile_names_list_intern.append(sample) accepted_scores_list_intern.append(accepted_scores) @@ -238,7 +242,7 @@ def run_target_list_compilation(pInteractionFilesList, pTargetList, pArgs, pView return -def call_multi_core(pInteractionFilesList, pTargetFileList, pFunctionName, pArgs, pViewpointObj): +def call_multi_core(pInteractionFilesList, pTargetFileList, pTargetFType, pTargetPosDict, pFunctionName, pArgs, pViewpointObj): if len(pInteractionFilesList) < pArgs.threads: pArgs.threads = len(pInteractionFilesList) outfile_names_list = [None] * pArgs.threads @@ -272,6 +276,8 @@ def call_multi_core(pInteractionFilesList, pTargetFileList, pFunctionName, pArgs process[i] = Process(target=pFunctionName, kwargs=dict( pInteractionFilesList=interactionFileListThread, pTargetList=targetFileListThread, + pTargetFType=pTargetFType, + pTargetPosDict=pTargetPosDict, pArgs=pArgs, pViewpointObj=pViewpointObj, pQueue=queue[i], @@ -318,16 +324,32 @@ def main(args=None): targetList = [] present_genes = {} + target_ftype = '' + targetPosDict = None # read hdf file interactionFileHDF5Object = h5py.File(args.interactionFile, 'r') keys_interactionFile = list(interactionFileHDF5Object.keys()) if h5py.is_hdf5(args.targetFile): - targetDict, present_genes = viewpointObj.readTargetHDFFile(args.targetFile) + target_ftype = 'hdf5' else: - targetList = [args.targetFile] + with open(args.targetFile) as file: + for line in file.readlines(): + if line.startswith('#'): + continue + _line = line.strip().split('\t') + break + if len(_line) == 4: + log.info('Targets BED contains 4 columns, aggregating on column 4') + target_ftype = 'bed4' + present_genes, targetDict, targetPosDict = utilities.readTargetBed(args.targetFile) + elif len(_line) == 3: + targetList = [args.targetFile] + target_ftype = 'bed3' + else: + log.error('BED of targets list must have 3 or 4 columns') if len(keys_interactionFile) > 1: @@ -346,7 +368,7 @@ def main(args=None): geneList2 = sorted(list(matrix_obj2[chromosome2].keys())) for gene1, gene2 in zip(geneList1, geneList2): - if h5py.is_hdf5(args.targetFile): + if target_ftype != 'bed3': if gene1 in present_genes[sample][sample2]: interactionDict[gene1] = [[sample, chromosome1, gene1], [sample2, chromosome2, gene2]] else: @@ -356,7 +378,7 @@ def main(args=None): interactionFileHDF5Object.close() - if h5py.is_hdf5(args.targetFile): + if target_ftype != 'bed3': key_outer_matrix = present_genes.keys() for keys_outer in key_outer_matrix: keys_inner_matrix = present_genes[keys_outer].keys() @@ -365,5 +387,5 @@ def main(args=None): interactionList.append(interactionDict[gene]) targetList.append(targetDict[gene]) - outfile_names_list, accepted_scores_list = call_multi_core(interactionList, targetList, run_target_list_compilation, args, viewpointObj) + outfile_names_list, accepted_scores_list = call_multi_core(interactionList, targetList, target_ftype, targetPosDict, run_target_list_compilation, args, viewpointObj) writeAggregateHDF(args.outFileName, outfile_names_list, accepted_scores_list, args) diff --git a/hicexplorer/hicCorrectMatrix.py b/hicexplorer/hicCorrectMatrix.py index a3221368..71cd58b7 100644 --- a/hicexplorer/hicCorrectMatrix.py +++ b/hicexplorer/hicCorrectMatrix.py @@ -235,6 +235,9 @@ def correct_subparser(): 'of chromosomes and/or translocations.', action='store_true') + parserOpt.add_argument('--filteredBed', + help='Print bins filtered our by --filterThreshold to this file') + parserOpt.add_argument('--verbose', help='Print processing status.', action='store_true') @@ -548,6 +551,7 @@ def filter_by_zscore(hic_ma, lower_threshold, upper_threshold, perchr=False): to avoid introducing bias due to different chromosome numbers """ + log.info("filtering by z-score") to_remove = [] if perchr: for chrname in list(hic_ma.interval_trees): @@ -575,6 +579,7 @@ def filter_by_zscore(hic_ma, lower_threshold, upper_threshold, perchr=False): "\n".format(chrname, lower_threshold, upper_threshold)) to_remove.extend(problematic) + else: row_sum = np.asarray(hic_ma.matrix.sum(axis=1)).flatten() # subtract from row sum, the diagonal @@ -584,7 +589,6 @@ def filter_by_zscore(hic_ma, lower_threshold, upper_threshold, perchr=False): mad = MAD(row_sum) to_remove = np.flatnonzero(mad.is_outlier( lower_threshold, upper_threshold)) - return sorted(to_remove) @@ -658,6 +662,12 @@ def main(args=None): restore_masked_bins=False) assert matrix_shape == ma.matrix.shape + + if args.filteredBed: + with open(args.filteredBed, 'w') as f: + for outlier_region in set(outlier_regions): + interval = ma.cut_intervals[outlier_region] + f.write(f"{interval[0]}\t{interval[1]}\t{interval[2]}\t.\t{interval[3]}\t.\n") # mask filtered regions ma.maskBins(outlier_regions) total_filtered_out = set(outlier_regions) diff --git a/hicexplorer/test/general/test_chicAggregateStatistic.py b/hicexplorer/test/general/test_chicAggregateStatistic.py index ef8052ee..9282422f 100644 --- a/hicexplorer/test/general/test_chicAggregateStatistic.py +++ b/hicexplorer/test/general/test_chicAggregateStatistic.py @@ -113,41 +113,81 @@ def test_regular_mode_threads(): aggregateFileH5Object.close() -def test_target_list(): +def test_target_list_bed3(): outfile_aggregate = NamedTemporaryFile(suffix='.hdf5', delete=False) outfile_aggregate.close() args = "--interactionFile {} --targetFile {} --outFileName {} \ - -t {}".format(ROOT + 'chicViewpoint/two_matrices.hdf5', - ROOT + 'chicAggregateStatistic/target_list.bed', + -t {}".format(ROOT + 'chicViewpoint/two_matrices_custom_keys.hdf5', + ROOT + 'chicAggregateStatistic/target_list_3col.bed', outfile_aggregate.name, 10).split() chicAggregateStatistic.main(args) aggregateFileH5Object = h5py.File(outfile_aggregate.name, 'r') - assert 'FL-E13-5_chr1_MB-E10-5_chr1' in aggregateFileH5Object - assert 'FL-E13-5_chr1' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1'] - assert 'MB-E10-5_chr1' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1'] + assert 'c_adj_norm_t_adj_norm' in aggregateFileH5Object + assert 'c_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm'] + assert 't_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm'] - assert 'genes' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'] - assert 'genes' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'] + assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'] + assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'] assert len(aggregateFileH5Object) == 1 assert aggregateFileH5Object.attrs['type'] == 'aggregate' - for chromosome in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1']: + for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm']: - assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome]) == 3 + assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]) == 3 - for gene in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome]: - assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome][gene]) == 7 - for data in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome][gene]: + for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]: + assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]) == 7 + for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]: assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions'] - for chromosome in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1']: + for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm']: - assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome]) == 3 + assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]) == 3 - for gene in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome]: - assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome][gene]) == 7 - for data in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome][gene]: + for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]: + assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]) == 7 + for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]: + assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions'] + + aggregateFileH5Object.close() + + +def test_target_list_bed4(): + outfile_aggregate = NamedTemporaryFile(suffix='.hdf5', delete=False) + outfile_aggregate.close() + args = "--interactionFile {} --targetFile {} --outFileName {} \ + -t {}".format(ROOT + 'chicViewpoint/two_matrices_custom_keys.hdf5', + ROOT + 'chicAggregateStatistic/target_list_4col.bed', + outfile_aggregate.name, 10).split() + chicAggregateStatistic.main(args) + + aggregateFileH5Object = h5py.File(outfile_aggregate.name, 'r') + assert 'c_adj_norm_t_adj_norm' in aggregateFileH5Object + assert 'c_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm'] + assert 't_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm'] + + assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'] + assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'] + assert len(aggregateFileH5Object) == 1 + assert aggregateFileH5Object.attrs['type'] == 'aggregate' + + for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm']: + + assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]) == 3 + + for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]: + assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]) == 7 + for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]: + assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions'] + + for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm']: + + assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]) == 3 + + for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]: + assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]) == 7 + for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]: assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions'] aggregateFileH5Object.close() diff --git a/hicexplorer/test/general/test_hicCorrectMatrix.py b/hicexplorer/test/general/test_hicCorrectMatrix.py index e6324847..10cbaa34 100644 --- a/hicexplorer/test/general/test_hicCorrectMatrix.py +++ b/hicexplorer/test/general/test_hicCorrectMatrix.py @@ -16,14 +16,27 @@ os.path.dirname(os.path.abspath(__file__))), "test_data/") +def are_files_equal(file1, file2, pDifference=1): + with open(file1) as textfile1, open(file2) as textfile2: + for x, y in zip(textfile1, textfile2): + if x != y: + count = sum(1 for a, b in zip(x, y) if a != b) + if count > pDifference: + return False + return True + + def test_correct_matrix_ICE(): outfile = NamedTemporaryFile(suffix='.ICE.h5', delete=False) outfile.close() + outfile_filtered = NamedTemporaryFile(suffix='.bed', delete=True) + args = "correct --matrix {} --correctionMethod ICE --chromosomes "\ - "chrUextra chr3LHet --iterNum 500 --outFileName {} "\ + "chrUextra chr3LHet --iterNum 500 --outFileName {} --filteredBed {} "\ "--filterThreshold -1.5 5.0".format(ROOT + "small_test_matrix.h5", - outfile.name).split() + outfile.name, + outfile_filtered.name).split() # hicCorrectMatrix.main(args) compute(hicCorrectMatrix.main, args, 5) test = hm.hiCMatrix( @@ -31,6 +44,7 @@ def test_correct_matrix_ICE(): new = hm.hiCMatrix(outfile.name) nt.assert_equal(test.matrix.data, new.matrix.data) nt.assert_equal(test.cut_intervals, new.cut_intervals) + assert are_files_equal(outfile_filtered.name, ROOT + 'hicCorrectMatrix/filtered.bed') os.unlink(outfile.name) diff --git a/hicexplorer/test/test_data/cHi-C/chicAggregateStatistic/aggregate_target.hdf b/hicexplorer/test/test_data/cHi-C/chicAggregateStatistic/aggregate_target.hdf new file mode 100644 index 00000000..a4b4188b Binary files /dev/null and b/hicexplorer/test/test_data/cHi-C/chicAggregateStatistic/aggregate_target.hdf differ diff --git a/hicexplorer/test/test_data/cHi-C/chicAggregateStatistic/target_list.bed b/hicexplorer/test/test_data/cHi-C/chicAggregateStatistic/target_list.bed deleted file mode 100644 index a3dfba94..00000000 --- a/hicexplorer/test/test_data/cHi-C/chicAggregateStatistic/target_list.bed +++ /dev/null @@ -1,8 +0,0 @@ -chr1 4480000 4549000 -chr1 4555000 4688000 -chr1 14274000 14279000 -chr1 14290000 14439000 -chr1 14444000 14467000 -chr1 14476000 14501000 -chr1 19077000 19118000 -chr1 19120000 19274000 \ No newline at end of file diff --git a/hicexplorer/test/test_data/cHi-C/chicAggregateStatistic/target_list_3col.bed b/hicexplorer/test/test_data/cHi-C/chicAggregateStatistic/target_list_3col.bed new file mode 100644 index 00000000..6b510fc2 --- /dev/null +++ b/hicexplorer/test/test_data/cHi-C/chicAggregateStatistic/target_list_3col.bed @@ -0,0 +1,8 @@ +chr1 4480000 4549000 +chr1 4555000 4688000 +chr1 14274000 14279000 +chr1 14290000 14439000 +chr1 14444000 14467000 +chr1 14476000 14501000 +chr1 19077000 19118000 +chr1 19120000 19274000 \ No newline at end of file diff --git a/hicexplorer/test/test_data/cHi-C/chicAggregateStatistic/target_list_4col.bed b/hicexplorer/test/test_data/cHi-C/chicAggregateStatistic/target_list_4col.bed new file mode 100644 index 00000000..449c4d5c --- /dev/null +++ b/hicexplorer/test/test_data/cHi-C/chicAggregateStatistic/target_list_4col.bed @@ -0,0 +1,3 @@ +chr1 4487335 4487535 Sox17 +chr1 14300180 14300380 Eya1 +chr1 19093003 19093203 Tfap2d \ No newline at end of file diff --git a/hicexplorer/test/test_data/cHi-C/chicViewpoint/two_matrices_custom_keys.hdf5 b/hicexplorer/test/test_data/cHi-C/chicViewpoint/two_matrices_custom_keys.hdf5 new file mode 100644 index 00000000..a5174b11 Binary files /dev/null and b/hicexplorer/test/test_data/cHi-C/chicViewpoint/two_matrices_custom_keys.hdf5 differ diff --git a/hicexplorer/test/test_data/hicCorrectMatrix/filtered.bed b/hicexplorer/test/test_data/hicCorrectMatrix/filtered.bed new file mode 100644 index 00000000..303315e3 --- /dev/null +++ b/hicexplorer/test/test_data/hicCorrectMatrix/filtered.bed @@ -0,0 +1,64 @@ +chr3LHet 1690000 1695000 . 1.0 . +chrUextra 70000 75000 . 1.0 . +chr3LHet 1815000 1820000 . nan . +chrUextra 380000 385000 . 4.0 . +chr3LHet 1885000 1890000 . 2.0 . +chr3LHet 2000000 2005000 . 2.0 . +chr3LHet 210000 215000 . 1.0 . +chr3LHet 2025000 2030000 . 2.0 . +chr3LHet 245000 250000 . 1.0 . +chr3LHet 250000 255000 . 1.0 . +chr3LHet 265000 270000 . 2.0 . +chr3LHet 2085000 2090000 . 1.0 . +chr3LHet 2115000 2120000 . 1.0 . +chr3LHet 2130000 2135000 . 2.0 . +chr3LHet 2155000 2160000 . 2.0 . +chr3LHet 2285000 2290000 . 1.0 . +chr3LHet 385000 390000 . 1.0 . +chr3LHet 2400000 2405000 . 1.0 . +chr3LHet 395000 400000 . 1.0 . +chr3LHet 2405000 2410000 . 1.0 . +chr3LHet 415000 420000 . 1.0 . +chr3LHet 2410000 2415000 . 1.0 . +chr3LHet 455000 460000 . 1.0 . +chr3LHet 485000 490000 . 1.0 . +chr3LHet 2420000 2425000 . 1.0 . +chr3LHet 540000 545000 . 2.0 . +chr3LHet 550000 555000 . 2.0 . +chr3LHet 2460000 2465000 . 2.0 . +chr3LHet 2480000 2485000 . 1.0 . +chr3LHet 575000 580000 . 1.0 . +chr3LHet 2500000 2505000 . 1.0 . +chr3LHet 610000 615000 . 1.0 . +chr3LHet 630000 635000 . 2.0 . +chr3LHet 645000 650000 . 1.0 . +chr3LHet 685000 690000 . 1.0 . +chr3LHet 715000 720000 . 1.0 . +chr3LHet 720000 725000 . nan . +chr3LHet 740000 745000 . 1.0 . +chr3LHet 745000 750000 . 2.0 . +chr3LHet 805000 810000 . 2.0 . +chr3LHet 815000 820000 . 1.0 . +chr3LHet 910000 915000 . 1.0 . +chr3LHet 920000 925000 . 1.0 . +chr3LHet 945000 950000 . 1.0 . +chr3LHet 985000 990000 . 1.0 . +chr3LHet 1045000 1050000 . 2.0 . +chr3LHet 1055000 1060000 . 1.0 . +chr3LHet 1070000 1075000 . nan . +chr3LHet 1075000 1080000 . 2.0 . +chr3LHet 1090000 1095000 . 1.0 . +chr3LHet 1135000 1140000 . 1.0 . +chr3LHet 1155000 1160000 . 1.0 . +chr3LHet 1175000 1180000 . 2.0 . +chr3LHet 1195000 1200000 . 2.0 . +chr3LHet 1200000 1205000 . 1.0 . +chr3LHet 1230000 1235000 . 1.0 . +chr3LHet 1265000 1270000 . 2.0 . +chr3LHet 1270000 1275000 . 1.0 . +chr3LHet 1275000 1280000 . 1.0 . +chr3LHet 1330000 1335000 . 1.0 . +chr3LHet 1335000 1340000 . 1.0 . +chr3LHet 1365000 1370000 . 2.0 . +chr3LHet 1395000 1400000 . 2.0 . +chr3LHet 1405000 1410000 . 1.0 . diff --git a/hicexplorer/utilities.py b/hicexplorer/utilities.py index 16e62d15..88fb6c0d 100644 --- a/hicexplorer/utilities.py +++ b/hicexplorer/utilities.py @@ -35,6 +35,45 @@ def readBed(pBedFile): return viewpoints +def readTargetBed(pBedFile): + present_genes = {} + targetDict = {} + targetPosDict = {} + # The keys are hardcoded here. Usually, for hdf input file, read as h5py.File().keys() + # These keys should match the views.hdf + # TODO: need to find a encode these in the BED file + outer_matrix = 'c_adj_norm' + inner_matrix = 't_adj_norm' + present_genes[outer_matrix] = {} + present_genes[outer_matrix][inner_matrix] = [] + with open(pBedFile, 'r') as file: + for line in file.readlines(): + if line.startswith('#'): + continue + _line = line.strip().split('\t') + if len(line) == 0: + continue + try: + chrom, start, end = _line[:4] + except ValueError: + # in case the BED is space seprated instead of tabs, split it by whitespaces + _line = line.strip().split() + chrom, start, end, gene = _line[:4] + if gene not in present_genes[outer_matrix][inner_matrix]: + present_genes[outer_matrix][inner_matrix].append(gene) + targetDict[gene] = [outer_matrix, inner_matrix, 'genes', gene] + + if gene not in targetPosDict: + targetPosDict[gene] = {} + targetPosDict[gene]['chromosome'] = {} + targetPosDict[gene]['start_list'] = [] + targetPosDict[gene]['end_list'] = [] + targetPosDict[gene]['chromosome'] = chrom + targetPosDict[gene]['start_list'].append(start) + targetPosDict[gene]['end_list'].append(end) + return present_genes, targetDict, targetPosDict + + def writableFile(string): try: open(string, 'w').close() diff --git a/requirements.txt b/requirements.txt index 016f53e8..4e8e3511 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,6 +23,6 @@ future >= 0.18 tqdm >= 4.66 hyperopt >= 0.2.7 python-graphviz >= 0.20 -scikit-learn >= 1.3.1 +scikit-learn >= 1.3,<1.4 imbalanced-learn >= 0.11 cleanlab >= 2.5 diff --git a/setup.py b/setup.py index 87304cd4..841292a0 100755 --- a/setup.py +++ b/setup.py @@ -116,7 +116,7 @@ def checkProgramIsInstalled(self, program, args, where_to_download, "tqdm >= 4.66", "hyperopt >= 0.2.7", "graphviz >= 0.20", - "scikit-learn >= 1.3.1", + "scikit-learn >= 1.3,<1.4", "imbalanced-learn >= 0.11", "cleanlab >= 2.5" ]