From 37d7e90f0b9e88fc77c11147c1590a720aa3f8b4 Mon Sep 17 00:00:00 2001 From: gfudenberg Date: Thu, 3 May 2018 11:33:48 -0700 Subject: [PATCH 1/4] added the option of a hard specification to convert_loops_to_sites as well as throwing an error if the size is 2x2 because otherwise autodetect silently fails --- looplib/looptools.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/looplib/looptools.py b/looplib/looptools.py index 6bfbfcf..64be8ec 100644 --- a/looplib/looptools.py +++ b/looplib/looptools.py @@ -7,17 +7,24 @@ # reload_support=True) from .looptools_c import get_parent_loops, get_stationary_loops -def convert_loops_to_sites(loops, r_sites=None): +def convert_loops_to_sites(loops, r_sites=None, specifyVerticalArray=False): """ Convert a list of loops defined by tuples (left_site, right_side) into - two np.arrays left_sites, right_sites. + two np.arrays left_sites, right_sites. + Vertical numpy array format can be specified, otherwise shape will be autodetected. """ + if specifyVerticalArray==True: + if loops.shape[1] != 2: + raise Exception('Shape specification Error') + return loops[:,0],loops[:,1] + if r_sites is None: - if (issubclass(type(loops), list) - and issubclass(type(loops[0]), tuple)): + if (issubclass(type(loops), list) and issubclass(type(loops[0]), tuple)): l_sites, r_sites = np.array([i for i,j in loops]), np.array([j for i,j in loops]) elif (issubclass(type(loops), np.ndarray) and (loops.ndim == 2)): - if (loops.shape[0] == 2): + if (loops.shape[0]==loops.shape[1]): + raise Exception('Loop array dimensions are ambiguous') + elif (loops.shape[0] == 2): return loops[0], loops[1] elif (loops.shape[1] == 2): return loops[:,0], loops[:,1] From ae60f25641b7b49e9117b6f45b9e684693203415 Mon Sep 17 00:00:00 2001 From: gfudenberg Date: Fri, 7 Sep 2018 18:09:15 -0700 Subject: [PATCH 2/4] loopstats alpha version --- looplib/loopstats.py | 78 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 looplib/loopstats.py diff --git a/looplib/loopstats.py b/looplib/loopstats.py new file mode 100644 index 0000000..95bd588 --- /dev/null +++ b/looplib/loopstats.py @@ -0,0 +1,78 @@ +import numpy as np + +#### utilities #### + +def load_pickle(filename): + return pickle.load(open( filename,'rb' ) ) + + +#### boundary_list independent statistics ##### +# requires: +# LEF_array (Mx2 numpy array of monomer positions), +# polymer_length (integer) + +def calc_avg_coverage_by_LEFs(filelist, polymer_length, loadFunction=load_pickle): + ''' + calculates the average coverage (fraction of polymer covered by at least one loop) + averages over files in filelist + ''' + avgCov = [] + for f in filelist: + try: + LEF_array = np.array(loadFunction(f),dtype=int) + loopCoverage = calc_coverage_by_LEFs(LEF_array, polymer_length) + avgCov.append( loopCoverage) + except: + print('bad file', f) + continue + return np.mean(avgCov) + +def calc_coverage_by_LEFs(LEF_array, polymer_length): + ''' + calculates the average coverage (fraction of polymer covered by at least one loop) + ''' + loopCoverage = np.zeros((polymer_length, )) + for p in LEF_array: loopCoverage[p[0]:p[1]+1] += 1 + return np.sum(loopCoverage>0) / (0.0+polymer_length) + + +#### boundary_list statistics ##### +# requires: +# LEF_array (Mx2 numpy array of monomer positions), +# polymer_length (integer), +# boundary_list (Kx1 array of monomer positions) + + +def calc_average_LEF_stalling(filelist, polymer_length, boundary_list, loadFunction=load_pickle): + ''' + calculates the average fraction of LEFs with two, one, or no legs stalled at boundaries. + averages over files in filelist + ''' + both_one_none = [] + for f in filelist: + try: + LEF_array = loadFunction(f) + both_one_none.append(calc_LEF_stalling_by_leg(LEF_array, polymer_length, boundary_list)) + except: + print('bad file', f) + continue + return np.mean(both_one_none,axis=0) + +def calc_LEF_stalling_by_leg(LEF_array, polymer_length, boundary_list): + ''' + calculates the fraction of LEFs with two, one, or no legs stalled at boundaries. + returns the fraction with both legs, one leg, or neither leg overlapping boundary_list + ''' + if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs') + boundary_list = np.array(boundary_list,dtype=int).flatten() + isBoundary = np.histogram(boundary_list, np.arange(0,polymer_length+1))[0] + LEF_arm_status = np.sum( isBoundary[LEF_array] , 1) + return [ (np.sum(LEF_arm_status==2)/len(LEF_array) ), (np.sum(LEF_arm_status==1)/len(LEF_array) ) ,(np.sum(LEF_arm_status==0)/len(LEF_array) ) ] + + + + + + + + From 247483d00a5b67b6bd1cffa11d8b1ff26eb8651b Mon Sep 17 00:00:00 2001 From: gfudenberg Date: Mon, 10 Sep 2018 18:56:58 -0700 Subject: [PATCH 3/4] now with _create_loopstats_report --- looplib/loopstats.py | 111 ++++++++++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 39 deletions(-) diff --git a/looplib/loopstats.py b/looplib/loopstats.py index 95bd588..a0278d3 100644 --- a/looplib/loopstats.py +++ b/looplib/loopstats.py @@ -1,40 +1,29 @@ import numpy as np +from inspect import getfullargspec +import pandas as pd #### utilities #### def load_pickle(filename): return pickle.load(open( filename,'rb' ) ) +stat_output_dict = {} # create a dictionary of statistic outputs, to pass to the report #### boundary_list independent statistics ##### # requires: # LEF_array (Mx2 numpy array of monomer positions), # polymer_length (integer) -def calc_avg_coverage_by_LEFs(filelist, polymer_length, loadFunction=load_pickle): - ''' - calculates the average coverage (fraction of polymer covered by at least one loop) - averages over files in filelist - ''' - avgCov = [] - for f in filelist: - try: - LEF_array = np.array(loadFunction(f),dtype=int) - loopCoverage = calc_coverage_by_LEFs(LEF_array, polymer_length) - avgCov.append( loopCoverage) - except: - print('bad file', f) - continue - return np.mean(avgCov) - def calc_coverage_by_LEFs(LEF_array, polymer_length): - ''' - calculates the average coverage (fraction of polymer covered by at least one loop) - ''' + ''' calculates the average coverage (fraction of polymer covered by at least one loop) ''' loopCoverage = np.zeros((polymer_length, )) for p in LEF_array: loopCoverage[p[0]:p[1]+1] += 1 return np.sum(loopCoverage>0) / (0.0+polymer_length) +stat_output_dict[calc_coverage_by_LEFs] = 'coverage' +def calc_loop_size(LEF_array,polymer_length): + return np.mean( LEF_array[:,1]-LEF_array[:,0] ) +stat_output_dict[calc_loop_size] = 'loop_size' #### boundary_list statistics ##### # requires: @@ -43,34 +32,78 @@ def calc_coverage_by_LEFs(LEF_array, polymer_length): # boundary_list (Kx1 array of monomer positions) -def calc_average_LEF_stalling(filelist, polymer_length, boundary_list, loadFunction=load_pickle): - ''' - calculates the average fraction of LEFs with two, one, or no legs stalled at boundaries. - averages over files in filelist - ''' - both_one_none = [] - for f in filelist: - try: - LEF_array = loadFunction(f) - both_one_none.append(calc_LEF_stalling_by_leg(LEF_array, polymer_length, boundary_list)) - except: - print('bad file', f) - continue - return np.mean(both_one_none,axis=0) - def calc_LEF_stalling_by_leg(LEF_array, polymer_length, boundary_list): - ''' - calculates the fraction of LEFs with two, one, or no legs stalled at boundaries. - returns the fraction with both legs, one leg, or neither leg overlapping boundary_list - ''' + ''' calculates the fraction of LEFs with two, one, or no legs stalled at boundaries. + returns the fraction with both legs, one leg, or neither leg overlapping boundary_list ''' if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs') boundary_list = np.array(boundary_list,dtype=int).flatten() isBoundary = np.histogram(boundary_list, np.arange(0,polymer_length+1))[0] LEF_arm_status = np.sum( isBoundary[LEF_array] , 1) - return [ (np.sum(LEF_arm_status==2)/len(LEF_array) ), (np.sum(LEF_arm_status==1)/len(LEF_array) ) ,(np.sum(LEF_arm_status==0)/len(LEF_array) ) ] + return [(np.sum(LEF_arm_status==2)/len(LEF_array) ), (np.sum(LEF_arm_status==1)/len(LEF_array) ) ,(np.sum(LEF_arm_status==0)/len(LEF_array) )] +stat_output_dict[ calc_LEF_stalling_by_leg] = ['both_stalled','one_stalled','none_stalled'] + +def calc_boundary_occupancy(LEF_array, polymer_length, boundary_list): + ''' calculates the fraction of LEFs with two, one, or no legs stalled at boundaries. + returns the fraction with both legs, one leg, or neither leg overlapping boundary_list ''' + if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs') + boundary_list = np.array(boundary_list,dtype=int).flatten() + bb = np.arange(0,polymer_length+1, 1); bb_mids = .5*(bb[:-1]+bb[1:]) + extruderHistogram, b = np.histogram( LEF_array ,bb) + boundary_occupancy = np.mean( extruderHistogram[boundary_list ] ) + non_boundary_list = np.setdiff1d(np.arange(1,polymer_length-1), boundary_list) + non_occupancy = np.mean( extruderHistogram[non_boundary_list ] ) + return [boundary_occupancy, non_occupancy] +stat_output_dict[ calc_boundary_occupancy] = ['boundary_occupancy','non_occupancy'] + +##### averaging and summarizing #### +def flatten(l): return flatten(l[0]) + (flatten(l[1:]) if len(l) > 1 else []) if type(l) is list else [l] # since list(itertools.chain.from_iterable(newlist)) doesn't quite cut it... + +def _calc_stats(filelist, stat_function_list=[calc_coverage_by_LEFs], load_function=load_pickle, **kwargs): + ''' takes a list of filenames returns loop statistics over these files. can be parallelized in the future...''' + stat_list = [] + for f in filelist: + try: + LEF_array = np.array(load_function(f),dtype=int) + filestats = [] + for stat_function in stat_function_list: + numInputs = len( getfullargspec(stat_function)[0]) + if numInputs == 2: + filestats.append( stat_function(LEF_array, kwargs['polymer_length'])) + elif numInputs == 3: + filestats.append( stat_function(LEF_array, kwargs['polymer_length'], kwargs['boundary_list'])) + stat_list.append(flatten(filestats)) + except: + print('bad file', f); continue + return stat_list + + +def _create_loopstats_report(filelist_tuples, stat_function_list=[calc_coverage_by_LEFs], + load_function=load_pickle,roundDecimals=2, **kwargs): + ''' averages the stat functions over each group of filelists; kwargs needed depend on functions called + usage: _create_loopstats_report([(filelist1,'smclife1'), (filelist2,'smclife2') ], + stat_function_list=[calc_loop_size,calc_coverage_by_LEFs, calc_LEF_stalling_by_leg, calc_boundary_occupancy], + load_function= load_pickle, roundDecimals=2, + **{'polymer_length': polymer_length , 'boundary_list':boundaries_all} ) ''' + loopstats_report = [] + loopstats_indicies =[] + for filelist, filename in filelist_tuples: + loopstats_indicies.append(filename) + loopstats_report.append( np.mean( _calc_stats(filelist, stat_function_list, load_function, **kwargs),axis = 0) ) + loopstats_report = np.array(loopstats_report) + if roundDecimals != None: loopstats_report = np.round(loopstats_report,roundDecimals) + + column_names = [] + for stat_function in stat_function_list: + column_names.append( stat_output_dict[stat_function]) + column_names = flatten(column_names) + + loopstats_report = pd.DataFrame( loopstats_report, columns = column_names)#, index=loopstats_indicies) + loopstats_report['row_names'] = loopstats_indicies + loopstats_report.set_index('row_names',inplace=True, drop=True) + return loopstats_report From d9dedda490c4100c9597b2b6e7d7f5fbac5e9d16 Mon Sep 17 00:00:00 2001 From: gfudenberg Date: Mon, 8 Oct 2018 00:31:31 -0700 Subject: [PATCH 4/4] a couple more stats --- looplib/loopstats.py | 69 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git a/looplib/loopstats.py b/looplib/loopstats.py index a0278d3..11cb247 100644 --- a/looplib/loopstats.py +++ b/looplib/loopstats.py @@ -2,11 +2,24 @@ from inspect import getfullargspec import pandas as pd +import bioframe +from bioframe.tools import tsv, bedtools +def bedtools_intersect_counts(left, right, **kwargs): + with tsv(left) as a, tsv(right) as b: + out = bedtools.intersect(a=a.name, b=b.name,c=True) + out.columns = list(left.columns) + ['counts'] + return out + #### utilities #### def load_pickle(filename): return pickle.load(open( filename,'rb' ) ) + +def load_joblib_data(filename): + return joblib.load(filename.replace('SMC','block'))['data'] + + stat_output_dict = {} # create a dictionary of statistic outputs, to pass to the report #### boundary_list independent statistics ##### @@ -35,7 +48,6 @@ def calc_loop_size(LEF_array,polymer_length): def calc_LEF_stalling_by_leg(LEF_array, polymer_length, boundary_list): ''' calculates the fraction of LEFs with two, one, or no legs stalled at boundaries. returns the fraction with both legs, one leg, or neither leg overlapping boundary_list ''' - if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs') boundary_list = np.array(boundary_list,dtype=int).flatten() isBoundary = np.histogram(boundary_list, np.arange(0,polymer_length+1))[0] LEF_arm_status = np.sum( isBoundary[LEF_array] , 1) @@ -45,7 +57,6 @@ def calc_LEF_stalling_by_leg(LEF_array, polymer_length, boundary_list): def calc_boundary_occupancy(LEF_array, polymer_length, boundary_list): ''' calculates the fraction of LEFs with two, one, or no legs stalled at boundaries. returns the fraction with both legs, one leg, or neither leg overlapping boundary_list ''' - if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs') boundary_list = np.array(boundary_list,dtype=int).flatten() bb = np.arange(0,polymer_length+1, 1); bb_mids = .5*(bb[:-1]+bb[1:]) extruderHistogram, b = np.histogram( LEF_array ,bb) @@ -56,17 +67,63 @@ def calc_boundary_occupancy(LEF_array, polymer_length, boundary_list): stat_output_dict[ calc_boundary_occupancy] = ['boundary_occupancy','non_occupancy'] +def calc_boundary_crossing_percent(LEF_array, polymer_length, boundary_list): + ''' calculates the fraction of LEFs with two, one, or no legs stalled at boundaries. + returns the fraction with both legs, one leg, or neither leg overlapping boundary_list ''' + boundary_list = np.array(boundary_list,dtype=int).flatten() + allLocs_df = pd.DataFrame(boundary_list, columns=['start']) + allLocs_df['end'] = boundary_list + allLocs_df['chr'] = 13 + allLocs_df = allLocs_df[['chr','start','end']] + + LEF_array_df = pd.DataFrame(LEF_array,columns=['start','end']) + LEF_array_df['chr'] = 13 + LEF_array_df= LEF_array_df[['chr','start','end']] + LEF_array_df.sort_values(['start'], inplace=True) + LEF_array_shrink_df = LEF_array_df.copy() + LEF_array_shrink_df['start'] = LEF_array_shrink_df['start'].values+1 + LEF_array_shrink_df['end'] = np.maximum( LEF_array_shrink_df['end'].values-1, LEF_array_shrink_df['start']) + + percent_crossing = np.sum((bedtools_intersect_counts( LEF_array_shrink_df, allLocs_df)['counts'] > 0) + / len(LEF_array) ) + return [percent_crossing] +stat_output_dict[ calc_boundary_crossing_percent] = ['percent_crossing'] + +def calc_numLoops_given_boundaryPair_contact(LEF_array, polymer_length, boundary_pair_array, data): + LEF_array_df = pd.DataFrame(LEF_array,columns=['start','end']) + LEF_array_df['chr'] = 13 + LEF_array_df= LEF_array_df[['chr','start','end']] + + sep2_dists = np.sum( (data[boundary_pair_array[:,0],:] + - data[boundary_pair_array[:,1],:])**2.,axis=1 )**.5 + sep2_contacts = boundary_pair_array[ sep2_dists < 3 ,:] + + sep2_contacts_df = pd.DataFrame(sep2_contacts,columns=['start','end']) + sep2_contacts_df['chr'] = 13 + sep2_contacts_df= sep2_contacts_df[['chr','start','end']] + sep2_contacts_df.sort_values(['start'], inplace=True) + + cbins = np.arange(0,len(LEF_array),1) + a,b = np.histogram( bedtools_intersect_counts(sep2_contacts_df, LEF_array_df)['counts'].values , cbins) + a = a/np.sum(a) + no_loops, single_loop, multiple_loops = ( a[0], a[1], np.sum(a[2:]) ) + return [no_loops, single_loop, multiple_loops] +stat_output_dict[calc_numLoops_given_boundaryPair_contact]= ['no_loops','one_loop','two_plus_loops'] + ##### averaging and summarizing #### def flatten(l): return flatten(l[0]) + (flatten(l[1:]) if len(l) > 1 else []) if type(l) is list else [l] # since list(itertools.chain.from_iterable(newlist)) doesn't quite cut it... -def _calc_stats(filelist, stat_function_list=[calc_coverage_by_LEFs], load_function=load_pickle, **kwargs): +def _calc_stats(filelist, stat_function_list=[calc_coverage_by_LEFs], load_function=load_pickle, load_function_data=None, **kwargs): ''' takes a list of filenames returns loop statistics over these files. can be parallelized in the future...''' stat_list = [] for f in filelist: try: LEF_array = np.array(load_function(f),dtype=int) + if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs') + if load_function_data != None: + data = np.array(load_function_data(f)) filestats = [] for stat_function in stat_function_list: numInputs = len( getfullargspec(stat_function)[0]) @@ -74,6 +131,8 @@ def _calc_stats(filelist, stat_function_list=[calc_coverage_by_LEFs], load_funct filestats.append( stat_function(LEF_array, kwargs['polymer_length'])) elif numInputs == 3: filestats.append( stat_function(LEF_array, kwargs['polymer_length'], kwargs['boundary_list'])) + elif numInputs == 4: + filestats.append( stat_function(LEF_array, kwargs['polymer_length'], kwargs['boundary_pair_array'], data )) stat_list.append(flatten(filestats)) except: print('bad file', f); continue @@ -81,7 +140,7 @@ def _calc_stats(filelist, stat_function_list=[calc_coverage_by_LEFs], load_funct def _create_loopstats_report(filelist_tuples, stat_function_list=[calc_coverage_by_LEFs], - load_function=load_pickle,roundDecimals=2, **kwargs): + load_function=load_pickle, load_function_data=None,roundDecimals=2, **kwargs): ''' averages the stat functions over each group of filelists; kwargs needed depend on functions called usage: _create_loopstats_report([(filelist1,'smclife1'), (filelist2,'smclife2') ], stat_function_list=[calc_loop_size,calc_coverage_by_LEFs, calc_LEF_stalling_by_leg, calc_boundary_occupancy], @@ -91,7 +150,7 @@ def _create_loopstats_report(filelist_tuples, stat_function_list=[calc_coverage_ loopstats_indicies =[] for filelist, filename in filelist_tuples: loopstats_indicies.append(filename) - loopstats_report.append( np.mean( _calc_stats(filelist, stat_function_list, load_function, **kwargs),axis = 0) ) + loopstats_report.append( np.mean( _calc_stats(filelist, stat_function_list, load_function, load_function_data, **kwargs),axis = 0) ) loopstats_report = np.array(loopstats_report) if roundDecimals != None: loopstats_report = np.round(loopstats_report,roundDecimals)