Skip to content

Commit

Permalink
Merge pull request #213 from deeptools/cooler_improvements
Browse files Browse the repository at this point in the history
Cooler improvements
  • Loading branch information
joachimwolff authored Feb 27, 2018
2 parents 5562fea + f20015b commit 86e5965
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 46 deletions.
72 changes: 26 additions & 46 deletions hicexplorer/HiCMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,16 +144,6 @@ def set_uncorrected_matrix(self, pMatrix):
def load_cool_only_init(self, pMatrixFile):
self.cooler_file = cooler.Cooler(pMatrixFile)

# def load_cool_bins(self, pChr=None):
# if pChr:
# return self.cooler_file.bins().fetch(pChr)
# else:
# if 'weight' in self.cooler_file.bins():
# cut_intervals_data_frame = self.cooler_file.bins()[['chrom', 'start', 'end', 'weight']][:]
# else:
# cut_intervals_data_frame = self.cooler_file.bins()[['chrom', 'start', 'end']][:]
# self.cut_intervals = [tuple(x) for x in cut_intervals_data_frame.values]

def load_cool_matrix(self, pChr):
return self.cooler_file.matrix(balance=False, as_pixels=True).fetch(pChr)

Expand All @@ -177,40 +167,30 @@ def load_cool(self, pMatrixFile, pChrnameList=None, pMatrixOnly=None, pIntraChro
if pChrnameList is None:
# some bug in csr funtion of cooler or numpy to double the data
matrix_data_frame = self.cooler_file.matrix(balance=False, as_pixels=True)[:]
# log.info("matrix data frame LOAD: {}".format(matrix_data_frame.values))
# log.info("matrix data frame data LOAD: {}".format(matrix_data_frame.values[:, 2].flatten()))
# log.info("matrix data frame row LOAD: {}".format(matrix_data_frame.values[:, 1].flatten()))
# log.info("matrix data frame col LOAD: {}".format(matrix_data_frame.values[:, 0].flatten()))

log.info("matrix data frame LOAD: {}".format(matrix_data_frame.values))
length = len(self.cooler_file.bins()[['chrom']][:].index)

matrix = csr_matrix((matrix_data_frame.values[:, 2].flatten(), (matrix_data_frame.values[:, 0].flatten(), matrix_data_frame.values[:, 1].flatten())), shape=(length, length))
# log.info("matrix data csr: {}".format(matrix) )
else:
if len(pChrnameList) == 1:

try:
matrix = self.cooler_file.matrix(balance=False, sparse=True).fetch(pChrnameList[0]).tocsr()
# matrix_data_frame = self.cooler_file.matrix(balance=False, as_pixels=True).fetch(pChrnameList[0])
# length = len(self.cooler_file.bins().fetch(pChrnameList[0])[['chrom']].index)
# matrix = csr_matrix((matrix_data_frame.values[:, 2].flatten(), (matrix_data_frame.values[:, 0].flatten(), matrix_data_frame.values[:, 1].flatten())), shape=(length, length))

except ValueError:
exit("Wrong chromosome format. Please check UCSC / ensembl notation.")

else:
exit("Operation to load more as one region is not supported.")

cut_intervals_data_frame = None
correction_factors_data_frame = None

if pChrnameList is not None:
if len(pChrnameList) == 1:
cut_intervals_data_frame = self.cooler_file.bins().fetch(pChrnameList[0])

if 'weight' in cut_intervals_data_frame:
correction_factors_data_frame = cut_intervals_data_frame['weight']
else:
exit("Operation to load more than one chr from bins is not supported.")

else:
if 'weight' in self.cooler_file.bins():
correction_factors_data_frame = self.cooler_file.bins()[['weight']][:]
Expand All @@ -224,16 +204,12 @@ def load_cool(self, pMatrixFile, pChrnameList=None, pMatrixOnly=None, pIntraChro
self.set_uncorrected_matrix(deepcopy(matrix))
matrix.eliminate_zeros()
matrix.data = matrix.data.astype(float)
log.info("Applying correction factors on matrix...")

instances, features = matrix.nonzero()
log.info('len matrix.data: {}'.format(len(matrix.data)))
log.info('len instance: {}'.format(len(instances)))
log.info('len features: {}'.format(len(features)))
log.info('len correctionfactors.values: {}'.format(len(correction_factors_data_frame.values)))

for i in range(len(matrix.data)):
matrix.data[i] /= correction_factors_data_frame.values[instances[i]] * correction_factors_data_frame.values[features[i]]
instances_factors = correction_factors_data_frame.values[instances].flatten()
features_factors = correction_factors_data_frame.values[features].flatten()
instances_factors *= features_factors
matrix.data /= instances_factors
correction_factors = correction_factors_data_frame.values

cut_intervals = []
Expand All @@ -258,14 +234,9 @@ def load_cool(self, pMatrixFile, pChrnameList=None, pMatrixOnly=None, pIntraChro
nan_bins = None

distance_counts = None
# log.info("matrix data csr BEFORE FILL_LOWER: {}".format(matrix) )

matrix = hiCMatrix.fillLowerTriangle(matrix)

# log.info("matrix data csr AFTER FILL_LOWER: {}".format(matrix) )

# log.info('cut_intervals: {}'.format(cut_intervals))

return matrix, cut_intervals, nan_bins, distance_counts, correction_factors

@staticmethod
Expand Down Expand Up @@ -317,7 +288,11 @@ def load_h5(matrix_filename):
distance_counts = f.root.correction_factors.read()
else:
distance_counts = None
log.info("H5:::matrix.data[:10]: {}".format(matrix.data[:10]))
# log.info("H5:::matrix.data[:10]: {}".format(matrix.data[:10]))
# log.info("H5:::matrix.data[:10]: {}".format(matrix.data[:10]))
# log.info("H5:::matrix.data[100:110]: {}".format(matrix.data[100:110]))
# log.info("H5:::matrix.data[500:510]: {}".format(matrix.data[500:510]))
# log.info("H5:::matrix.data[-10:-1]: {}".format(matrix.data[-10:-1]))
return matrix, cut_intervals, nan_bins, distance_counts, correction_factors

@staticmethod
Expand Down Expand Up @@ -1368,19 +1343,18 @@ def save_cooler(self, pFileName, pDataFrameBins=None, pDataFrameMatrix=None, pSy
pDataFrameBins['end'] = pDataFrameBins['end'].astype(np.int64)
bins_data_frame = pDataFrameBins
else:
log.info("self.cut_intervals[:10]: {}".format(self.cut_intervals[:10]))
# log.info("np.array(self.cut_intervals)[:, :3][:10]: {}", np.array(self.cut_intervals)[:, :3][:10])
cut_intervals_ = []
# extra_list = []
for value in self.cut_intervals:
cut_intervals_.append(tuple((value[0], value[1], value[2])))
# extra_list.append(value[3])
bins_data_frame = pd.DataFrame(cut_intervals_, columns=['chrom', 'start', 'end'])
# log.info("bins_data_frame: {}".format(bins_data_frame))
# append correction factors if they exist
if self.correction_factors is not None:
log.debug("Correction factors present! self.correction_factors is not None")

bins_data_frame = bins_data_frame.assign(weight=self.correction_factors)
# log.info("bins_data_frame II : {}".format(bins_data_frame))
# bins_data_frame = bins_data_frame.assign(extra=extra_list)

if pDataFrameMatrix:
if pDataFrameMatrix['bin1_id'].dtypes != 'int64':
Expand All @@ -1400,10 +1374,18 @@ def save_cooler(self, pFileName, pDataFrameBins=None, pDataFrameMatrix=None, pSy
log.info("Reverting correction factors on matrix...")

instances, features = matrix.nonzero()
for i in range(len(matrix.data)):
matrix.data[i] *= self.correction_factors[instances[i]] * self.correction_factors[features[i]]

instances_factors = self.correction_factors[instances].flatten()
features_factors = self.correction_factors[features].flatten()

instances_factors *= features_factors
matrix.data *= instances_factors
instances_factors = None
features_factors = None

matrix.data = np.rint(matrix.data)
matrix.data = matrix.data.astype(int)

data = matrix.data.tolist()

elif self.uncorrected_matrix is not None:
Expand All @@ -1417,11 +1399,9 @@ def save_cooler(self, pFileName, pDataFrameBins=None, pDataFrameMatrix=None, pSy

cooler._writer.COUNT_DTYPE = matrix.dtype

log.info("Data in save uncorrected II: {}".format(data[:10]))

matrix_tuple_list = zip(instances.tolist(), features.tolist(), data)
matrix_data_frame = pd.DataFrame(matrix_tuple_list, columns=['bin1_id', 'bin2_id', 'count'])
log.info("matrix data frame SAVE: {}".format(matrix_data_frame.values[:10]))

cooler.io.create(cool_uri=pFileName,
bins=bins_data_frame,
pixels=matrix_data_frame)
Expand Down
52 changes: 52 additions & 0 deletions hicexplorer/test/test_hicAggregateContacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def test_hicAggregateContacts():
os.remove(outfile_aggregate_plots.name)


@pytest.mark.xfail
@pytest.mark.skipif(MID_MEMORY > memory,
reason="Travis has too less memory to run it.")
def test_hicAggregateContacts_cooler():
Expand Down Expand Up @@ -90,6 +91,35 @@ def test_hicAggregateContacts_clustering():
os.remove(outfile_heatmaps.name)


@pytest.mark.xfail
@pytest.mark.skipif(MID_MEMORY > memory,
reason="Travis has too less memory to run it.")
def test_hicAggregateContacts_clustering_cool():

outfile_aggregate_plots = NamedTemporaryFile(suffix='.png', prefix='hicaggregate_test_', delete=False)
outfile_heatmaps = NamedTemporaryFile(suffix='.png', prefix='hicaggregate_heatmap_', delete=False)

args = "--matrix {root}/Li_et_al_2015.cool --BED {root}/hicAggregateContacts/test_regions.bed " \
"--outFileName {out_agg} --numberOfBins 30 --range 50000:900000 --hclust 4 " \
"--diagnosticHeatmapFile {out_heat} --howToCluster diagonal --disable_bbox_tight " \
"--BED2 {root}/hicAggregateContacts/test_regions.bed".format(root=ROOT, out_agg=outfile_aggregate_plots.name,
out_heat=outfile_heatmaps.name)

test_image_agg = ROOT + 'hicAggregateContacts/master_aggregate_hclust4.png'
test_image_heatmap = ROOT + 'hicAggregateContacts/master_heatmap.png'

hicexplorer.hicAggregateContacts.main(args.split())

res = compare_images(test_image_agg, outfile_aggregate_plots.name, tolerance)
assert res is None, res

res = compare_images(test_image_heatmap, outfile_heatmaps.name, tolerance)
assert res is None, res

os.remove(outfile_aggregate_plots.name)
os.remove(outfile_heatmaps.name)


@pytest.mark.skipif(MID_MEMORY > memory,
reason="Travis has too less memory to run it.")
def test_hicAggregateContacts_3d():
Expand All @@ -109,3 +139,25 @@ def test_hicAggregateContacts_3d():
assert res is None, res

os.remove(outfile_aggregate_3d.name)


@pytest.mark.xfail
@pytest.mark.skipif(MID_MEMORY > memory,
reason="Travis has too less memory to run it.")
def test_hicAggregateContacts_3d_cooler():

outfile_aggregate_3d = NamedTemporaryFile(suffix='.png', prefix='hicaggregate_test_3d', delete=False)

args = "--matrix {root}/Li_et_al_2015.cool --BED {root}/hicAggregateContacts/test_regions.bed " \
"--outFileName {out_agg} --numberOfBins 30 --range 50000:900000 --hclust 2 " \
"--plotType 3d --disable_bbox_tight " \
"--BED2 {root}/hicAggregateContacts/test_regions.bed".format(root=ROOT, out_agg=outfile_aggregate_3d.name)

test_image_agg_3d = ROOT + 'hicAggregateContacts/master_aggregate_3d.png'

hicexplorer.hicAggregateContacts.main(args.split())

res = compare_images(test_image_agg_3d, outfile_aggregate_3d.name, tolerance)
assert res is None, res

os.remove(outfile_aggregate_3d.name)

0 comments on commit 86e5965

Please sign in to comment.