Skip to content

Commit 59f3438

Browse files
authored
Merge pull request #14 from deeptools/develop
Develop
2 parents 5097284 + 84d4863 commit 59f3438

File tree

5 files changed

+97
-24
lines changed

5 files changed

+97
-24
lines changed

hicmatrix/HiCMatrix.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import tables
2828
from intervaltree import IntervalTree, Interval
2929
import cooler
30+
import time
3031

3132
from .utilities import toBytes
3233
from .utilities import toString
@@ -57,29 +58,48 @@ def __init__(self, pMatrixFile=None, pChrnameList=None):
5758
self.orig_bin_ids = []
5859
self.orig_cut_intervals = [] # similar to orig_bin_ids. Used to identify the position of masked nan bins
5960
self.matrixFileHandler = None
60-
61+
start_time = time.time()
6162
if pMatrixFile is not None:
6263
log.debug('Load self.matrixFileHandler')
6364
fileType = 'cool'
6465
if pMatrixFile.endswith('.h5'):
6566
fileType = 'h5'
6667
self.matrixFileHandler = MatrixFileHandler(pFileType=fileType, pMatrixFile=pMatrixFile, pChrnameList=pChrnameList)
68+
log.debug('init time: {}'.format(time.time() - start_time))
6769
self.matrix, self.cut_intervals, self.nan_bins, \
6870
self.correction_factors, self.distance_counts = self.matrixFileHandler.load()
71+
log.debug('load time: {}'.format(time.time() - start_time))
72+
start_time = time.time()
6973

74+
log.debug('data loaded from file handler')
7075
if self.nan_bins is None:
7176
self.nan_bins = np.array([])
7277

7378
self.fillLowerTriangle()
79+
log.debug('triangle time: {}'.format(time.time() - start_time))
80+
start_time = time.time()
81+
82+
log.debug('fillLowerTriangle')
7483

7584
self.restoreMaskedBins()
85+
log.debug('restoreMaskedBins: {}'.format(time.time() - start_time))
86+
start_time = time.time()
87+
88+
log.debug('restoreMaskedBins')
89+
7690
self.interval_trees, self.chrBinBoundaries = \
7791
self.intervalListToIntervalTree(self.cut_intervals)
92+
log.debug('intervalListToIntervalTree: {}'.format(time.time() - start_time))
93+
start_time = time.time()
94+
95+
log.debug('intervalListToIntervalTree')
96+
7897
elif pMatrixFile is None:
7998
log.debug('Only init object, no matrix given.')
8099
else:
81100
log.error('matrix file not given')
82101
sys.exit(1)
102+
log.debug('data loaded!')
83103

84104
def save(self, pMatrixName, pSymmetric=True, pApplyCorrection=False, pHiCInfo=None):
85105
""" As an output format cooler and mcooler are supported.

hicmatrix/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
__version__ = '9'
1+
__version__ = '10'
22
# Version number differs from HiCExplorer!

hicmatrix/lib/cool.py

Lines changed: 61 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@
1313
from past.builtins import zip
1414
from builtins import super
1515
from .matrixFile import MatrixFile
16+
import math
17+
import time
1618

17-
from hicmatrix.utilities import toString
19+
from hicmatrix.utilities import toString, toBytes
1820
from hicmatrix.utilities import convertNansToOnes
1921
from hicmatrix._version import __version__
2022

@@ -36,12 +38,15 @@ def __init__(self, pMatrixFile=None):
3638

3739
self.hic2cool_version = None
3840
self.hicmatrix_version = None
41+
self.scaleToOriginalRange = None
3942

4043
def getInformationCoolerBinNames(self):
4144
return cooler.Cooler(self.matrixFileName).bins().columns.values
4245

4346
def load(self):
4447
log.debug('Load in cool format')
48+
self.minValue = None
49+
self.maxValue = None
4550
if self.matrixFileName is None:
4651
log.info('No matrix is initialized')
4752

@@ -58,7 +63,7 @@ def load(self):
5863
log.info('The following file was tried to open: {}'.format(self.matrixFileName))
5964
log.info("The following nodes are available: {}".format(cooler.fileops.list_coolers(self.matrixFileName.split("::")[0])))
6065
exit()
61-
66+
log.debug('self.chrnameList {}'.format(self.chrnameList))
6267
if self.chrnameList is None:
6368
matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True)
6469
used_dtype = np.int32
@@ -84,6 +89,9 @@ def load(self):
8489
features[start_pos:start_pos + len(_features)] = _features
8590
start_pos += len(_features)
8691
i += size
92+
del _data
93+
del _instances
94+
del _features
8795

8896
# log.debug('max feature {}'.format(np.max(features)))
8997
# log.debug('max instance {}'.format(np.max(instances)))
@@ -92,13 +100,19 @@ def load(self):
92100
# log.debug('cooler_file.info[\'nbins\'] {}'.format(type(cooler_file.info['nbins'])))
93101

94102
matrix = csr_matrix((data, (instances, features)), shape=(np.int(cooler_file.info['nbins']), np.int(cooler_file.info['nbins'])), dtype=count_dtype)
95-
# del data
96-
# del instances
97-
# del features
103+
self.minValue = data.min()
104+
self.maxValue = data.max()
105+
106+
del data
107+
del instances
108+
del features
98109
else:
99110
if len(self.chrnameList) == 1:
100111
try:
112+
# self.chrnameList[0]
101113
matrix = cooler_file.matrix(balance=False, sparse=True).fetch(self.chrnameList[0]).tocsr()
114+
self.minValue = matrix.data.min()
115+
self.maxValue = matrix.data.max()
102116
except ValueError:
103117
exit("Wrong chromosome format. Please check UCSC / ensembl notation.")
104118
else:
@@ -175,11 +189,35 @@ def load(self):
175189
elif self.correctionOperator == '/':
176190
matrix.data /= instances_factors
177191

178-
cut_intervals = []
192+
# if self.scaleToOriginalRange is not None:
193+
min_value = matrix.data.min()
194+
max_value = matrix.data.max()
195+
# check if max smaller one or if not same mangnitude
196+
if max_value < 1 or (np.absolute(int(math.log10(max_value)) - int(math.log10(self.maxValue))) > 1):
197+
desired_range_difference = self.maxValue - self.minValue
198+
199+
min_value = matrix.data.min()
200+
max_value = matrix.data.max()
201+
202+
matrix.data = (matrix.data - min_value)
203+
matrix.data /= (max_value - min_value)
204+
matrix.data *= desired_range_difference
205+
matrix.data += self.minValue
206+
self.scaleToOriginalRange = True
207+
# diff_scale_factor = matrix.data.max() / max_value
208+
# if self.correctionOperator == '*':
209+
# correction_factors *= diff_scale_factor
210+
# if self.correctionOperator == '/':
211+
# correction_factors /= diff_scale_factor
179212

213+
cut_intervals = []
214+
time_start = time.time()
215+
log.debug('Creating cut_intervals {}'.format(time_start))
180216
for values in cut_intervals_data_frame.values:
181217
cut_intervals.append(tuple([toString(values[0]), values[1], values[2], 1.0]))
182-
218+
log.debug('Creating cut_intervals {} DONE'.format(time.time() - time_start))
219+
del cut_intervals_data_frame
220+
del correction_factors_data_frame
183221
# try to restore nan_bins.
184222
try:
185223
shape = matrix.shape[0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1]
@@ -266,6 +304,16 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
266304
log.debug('self.correctionOperator: {}'.format(self.correctionOperator))
267305
log.debug('self.fileWasH5: {}'.format(self.fileWasH5))
268306

307+
if self.scaleToOriginalRange:
308+
min_value = self.matrix.data.min()
309+
max_value = self.matrix.data.max()
310+
desired_range_difference = max_value - min_value
311+
312+
self.matrix.data = (self.matrix.data - self.minValue)
313+
self.matrix.data /= (self.maxValue - self.minValue)
314+
self.matrix.data *= desired_range_difference
315+
self.matrix.data += min_value
316+
269317
if self.correctionOperator == '*' or self.correctionOperator is None:
270318
self.matrix.data /= instances_factors
271319
elif self.correctionOperator == '/' or self.fileWasH5:
@@ -276,6 +324,11 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
276324

277325
self.matrix.eliminate_zeros()
278326

327+
if self.correction_factors is not None and pApplyCorrection is False:
328+
dtype_pixel['weight'] = np.float32
329+
weight = convertNansToOnes(np.array(self.correction_factors).flatten())
330+
bins_data_frame = bins_data_frame.assign(weight=weight)
331+
279332
instances, features = self.matrix.nonzero()
280333

281334
matrix_data_frame = pd.DataFrame(instances, columns=['bin1_id'], dtype=np.int32)
@@ -348,7 +401,7 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
348401
metadata=self.hic_metadata,
349402
temp_dir=local_temp_dir)
350403

351-
log.debug('info {}'.format(info))
404+
# log.debug('info {}'.format(info))
352405
if self.appendData == 'w':
353406
fileName = pFileName.split('::')[0]
354407
with h5py.File(fileName, 'r+') as h5file:

requirements.txt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
numpy >= 1.13.*
2-
scipy >= 1.1.*
3-
pandas >= 0.23.*
4-
pytables >= 3.4.*
5-
future = 0.16.*
6-
cooler = 0.8.3
7-
intervaltree = 2.1.*
1+
numpy >= 1.16.*
2+
scipy >= 1.2.*
3+
pandas >= 0.24.*
4+
pytables >= 3.5.*
5+
future = 0.17.*
6+
cooler = 0.8.5
7+
intervaltree = 3.0.*

setup.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,13 @@ def checkProgramIsInstalled(self, program, args, where_to_download,
9393
sys.stderr.write("Error: {}".format(e))
9494

9595

96-
install_requires_py = ["numpy >= 1.13.*",
97-
"scipy >= 1.1.*",
98-
"tables >= 3.4.*",
99-
"pandas >= 0.23.*",
100-
"future >= 0.16.*",
101-
"cooler == 0.8.3",
102-
"intervaltree == 2.1.*"
96+
install_requires_py = ["numpy >= 1.16.*",
97+
"scipy >= 1.2.*",
98+
"tables >= 3.5.*",
99+
"pandas >= 0.24.*",
100+
"future >= 0.17.*",
101+
"cooler == 0.8.5",
102+
"intervaltree == 3.0.*"
103103
]
104104

105105
setup(

0 commit comments

Comments
 (0)