From 37d7e90f0b9e88fc77c11147c1590a720aa3f8b4 Mon Sep 17 00:00:00 2001
From: gfudenberg <geoff.fudenberg@gmail.com>
Date: Thu, 3 May 2018 11:33:48 -0700
Subject: [PATCH 1/4] added the option of a hard specification to
 convert_loops_to_sites as well as throwing an error if the size is 2x2
 because otherwise autodetect silently fails

---
 looplib/looptools.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/looplib/looptools.py b/looplib/looptools.py
index 6bfbfcf..64be8ec 100644
--- a/looplib/looptools.py
+++ b/looplib/looptools.py
@@ -7,17 +7,24 @@
 #    reload_support=True)
 from .looptools_c import get_parent_loops, get_stationary_loops
 
-def convert_loops_to_sites(loops, r_sites=None):
+def convert_loops_to_sites(loops, r_sites=None, specifyVerticalArray=False):
     """
     Convert a list of loops defined by tuples (left_site, right_side) into
-    two np.arrays left_sites, right_sites.
+    two np.arrays left_sites, right_sites.  
+    Vertical numpy array format can be specified, otherwise shape will be autodetected. 
     """
+    if specifyVerticalArray==True:
+        if loops.shape[1] != 2:
+            raise Exception('Shape specification Error')
+        return loops[:,0],loops[:,1]
+
     if r_sites is None:
-        if (issubclass(type(loops), list)
-            and issubclass(type(loops[0]), tuple)):
+        if (issubclass(type(loops), list) and issubclass(type(loops[0]), tuple)):
             l_sites, r_sites = np.array([i for i,j in loops]), np.array([j for i,j in loops])
         elif (issubclass(type(loops), np.ndarray) and (loops.ndim == 2)):
-            if (loops.shape[0] == 2):
+            if (loops.shape[0]==loops.shape[1]):
+                raise Exception('Loop array dimensions are ambiguous')
+            elif (loops.shape[0] == 2):
                 return loops[0], loops[1]
             elif (loops.shape[1] == 2):
                 return loops[:,0], loops[:,1]

From ae60f25641b7b49e9117b6f45b9e684693203415 Mon Sep 17 00:00:00 2001
From: gfudenberg <geoff.fudenberg@gmail.com>
Date: Fri, 7 Sep 2018 18:09:15 -0700
Subject: [PATCH 2/4] loopstats alpha version

---
 looplib/loopstats.py | 78 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 looplib/loopstats.py

diff --git a/looplib/loopstats.py b/looplib/loopstats.py
new file mode 100644
index 0000000..95bd588
--- /dev/null
+++ b/looplib/loopstats.py
@@ -0,0 +1,78 @@
+import numpy as np
+
+#### utilities ####
+
+def load_pickle(filename): 
+    return pickle.load(open( filename,'rb'  ) )
+
+
+#### boundary_list independent statistics #####
+#        requires:
+#            LEF_array (Mx2 numpy array of monomer positions), 
+#            polymer_length (integer)
+
+def calc_avg_coverage_by_LEFs(filelist, polymer_length, loadFunction=load_pickle):
+    '''
+        calculates the average coverage (fraction of polymer covered by at least one loop)
+        averages over files in filelist
+    '''
+    avgCov = []
+    for f in filelist:
+        try:
+            LEF_array  = np.array(loadFunction(f),dtype=int)
+            loopCoverage = calc_coverage_by_LEFs(LEF_array, polymer_length)
+            avgCov.append( loopCoverage)
+        except:
+            print('bad file', f)
+            continue
+    return np.mean(avgCov)
+
+def calc_coverage_by_LEFs(LEF_array, polymer_length):
+    '''
+        calculates the average coverage (fraction of polymer covered by at least one loop)
+    '''
+    loopCoverage = np.zeros((polymer_length, ))
+    for p in LEF_array: loopCoverage[p[0]:p[1]+1] += 1
+    return  np.sum(loopCoverage>0) / (0.0+polymer_length)
+
+
+#### boundary_list statistics #####
+#        requires:
+#            LEF_array (Mx2 numpy array of monomer positions), 
+#            polymer_length (integer), 
+#            boundary_list (Kx1 array of monomer positions)
+
+
+def calc_average_LEF_stalling(filelist, polymer_length, boundary_list, loadFunction=load_pickle):
+    '''
+        calculates the average fraction of LEFs with two, one, or no legs stalled at boundaries.
+        averages over files in filelist
+    '''
+    both_one_none = []
+    for f in filelist:
+        try:
+            LEF_array = loadFunction(f)
+            both_one_none.append(calc_LEF_stalling_by_leg(LEF_array, polymer_length, boundary_list))        
+        except:
+            print('bad file', f)
+            continue
+    return np.mean(both_one_none,axis=0)
+
+def calc_LEF_stalling_by_leg(LEF_array, polymer_length, boundary_list):
+    '''
+        calculates the fraction of LEFs with two, one, or no legs stalled at boundaries.
+        returns the fraction with both legs, one leg, or neither leg overlapping boundary_list
+    '''
+    if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs')
+    boundary_list = np.array(boundary_list,dtype=int).flatten()
+    isBoundary =  np.histogram(boundary_list,  np.arange(0,polymer_length+1))[0]
+    LEF_arm_status = np.sum( isBoundary[LEF_array] , 1)
+    return  [  (np.sum(LEF_arm_status==2)/len(LEF_array) ),  (np.sum(LEF_arm_status==1)/len(LEF_array) ) ,(np.sum(LEF_arm_status==0)/len(LEF_array) )  ] 
+
+
+
+
+
+
+
+

From 247483d00a5b67b6bd1cffa11d8b1ff26eb8651b Mon Sep 17 00:00:00 2001
From: gfudenberg <geoff.fudenberg@gmail.com>
Date: Mon, 10 Sep 2018 18:56:58 -0700
Subject: [PATCH 3/4] now with _create_loopstats_report

---
 looplib/loopstats.py | 111 ++++++++++++++++++++++++++++---------------
 1 file changed, 72 insertions(+), 39 deletions(-)

diff --git a/looplib/loopstats.py b/looplib/loopstats.py
index 95bd588..a0278d3 100644
--- a/looplib/loopstats.py
+++ b/looplib/loopstats.py
@@ -1,40 +1,29 @@
 import numpy as np
+from inspect import getfullargspec
+import pandas as pd
 
 #### utilities ####
 
 def load_pickle(filename): 
     return pickle.load(open( filename,'rb'  ) )
 
+stat_output_dict = {} # create a dictionary of statistic outputs, to pass to the report
 
 #### boundary_list independent statistics #####
 #        requires:
 #            LEF_array (Mx2 numpy array of monomer positions), 
 #            polymer_length (integer)
 
-def calc_avg_coverage_by_LEFs(filelist, polymer_length, loadFunction=load_pickle):
-    '''
-        calculates the average coverage (fraction of polymer covered by at least one loop)
-        averages over files in filelist
-    '''
-    avgCov = []
-    for f in filelist:
-        try:
-            LEF_array  = np.array(loadFunction(f),dtype=int)
-            loopCoverage = calc_coverage_by_LEFs(LEF_array, polymer_length)
-            avgCov.append( loopCoverage)
-        except:
-            print('bad file', f)
-            continue
-    return np.mean(avgCov)
-
 def calc_coverage_by_LEFs(LEF_array, polymer_length):
-    '''
-        calculates the average coverage (fraction of polymer covered by at least one loop)
-    '''
+    ''' calculates the average coverage (fraction of polymer covered by at least one loop)    '''
     loopCoverage = np.zeros((polymer_length, ))
     for p in LEF_array: loopCoverage[p[0]:p[1]+1] += 1
     return  np.sum(loopCoverage>0) / (0.0+polymer_length)
+stat_output_dict[calc_coverage_by_LEFs] = 'coverage'
 
+def calc_loop_size(LEF_array,polymer_length):
+    return np.mean( LEF_array[:,1]-LEF_array[:,0] )
+stat_output_dict[calc_loop_size] = 'loop_size'
 
 #### boundary_list statistics #####
 #        requires:
@@ -43,34 +32,78 @@ def calc_coverage_by_LEFs(LEF_array, polymer_length):
 #            boundary_list (Kx1 array of monomer positions)
 
 
-def calc_average_LEF_stalling(filelist, polymer_length, boundary_list, loadFunction=load_pickle):
-    '''
-        calculates the average fraction of LEFs with two, one, or no legs stalled at boundaries.
-        averages over files in filelist
-    '''
-    both_one_none = []
-    for f in filelist:
-        try:
-            LEF_array = loadFunction(f)
-            both_one_none.append(calc_LEF_stalling_by_leg(LEF_array, polymer_length, boundary_list))        
-        except:
-            print('bad file', f)
-            continue
-    return np.mean(both_one_none,axis=0)
-
 def calc_LEF_stalling_by_leg(LEF_array, polymer_length, boundary_list):
-    '''
-        calculates the fraction of LEFs with two, one, or no legs stalled at boundaries.
-        returns the fraction with both legs, one leg, or neither leg overlapping boundary_list
-    '''
+    ''' calculates the fraction of LEFs with two, one, or no legs stalled at boundaries.
+        returns the fraction with both legs, one leg, or neither leg overlapping boundary_list    '''
     if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs')
     boundary_list = np.array(boundary_list,dtype=int).flatten()
     isBoundary =  np.histogram(boundary_list,  np.arange(0,polymer_length+1))[0]
     LEF_arm_status = np.sum( isBoundary[LEF_array] , 1)
-    return  [  (np.sum(LEF_arm_status==2)/len(LEF_array) ),  (np.sum(LEF_arm_status==1)/len(LEF_array) ) ,(np.sum(LEF_arm_status==0)/len(LEF_array) )  ] 
+    return  [(np.sum(LEF_arm_status==2)/len(LEF_array) ),  (np.sum(LEF_arm_status==1)/len(LEF_array) ) ,(np.sum(LEF_arm_status==0)/len(LEF_array)  )]
+stat_output_dict[ calc_LEF_stalling_by_leg] = ['both_stalled','one_stalled','none_stalled']
+
+def calc_boundary_occupancy(LEF_array, polymer_length, boundary_list):
+    ''' calculates the fraction of LEFs with two, one, or no legs stalled at boundaries.
+        returns the fraction with both legs, one leg, or neither leg overlapping boundary_list  '''
+    if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs')
+    boundary_list = np.array(boundary_list,dtype=int).flatten()
+    bb = np.arange(0,polymer_length+1, 1); bb_mids = .5*(bb[:-1]+bb[1:])
+    extruderHistogram, b = np.histogram( LEF_array ,bb)
+    boundary_occupancy = np.mean( extruderHistogram[boundary_list ] )
+    non_boundary_list = np.setdiff1d(np.arange(1,polymer_length-1), boundary_list)
+    non_occupancy = np.mean( extruderHistogram[non_boundary_list ]  )
+    return [boundary_occupancy, non_occupancy]
+stat_output_dict[ calc_boundary_occupancy] = ['boundary_occupancy','non_occupancy']
+
 
 
+##### averaging and summarizing ####
 
+def flatten(l): return flatten(l[0]) + (flatten(l[1:]) if len(l) > 1 else []) if type(l) is list else [l] # since list(itertools.chain.from_iterable(newlist)) doesn't quite cut it...
+
+def _calc_stats(filelist, stat_function_list=[calc_coverage_by_LEFs], load_function=load_pickle, **kwargs):
+    ''' takes  a list of filenames returns loop statistics over these files. can be  parallelized in the future...'''
+    stat_list = []
+    for f in filelist:
+        try:
+            LEF_array  = np.array(load_function(f),dtype=int)
+            filestats = []
+            for stat_function in stat_function_list:
+                numInputs = len( getfullargspec(stat_function)[0])
+                if numInputs  == 2:
+                    filestats.append( stat_function(LEF_array, kwargs['polymer_length']))
+                elif numInputs == 3:
+                    filestats.append( stat_function(LEF_array, kwargs['polymer_length'], kwargs['boundary_list']))
+            stat_list.append(flatten(filestats))
+        except:
+           print('bad file', f);   continue
+    return stat_list
+
+
+def _create_loopstats_report(filelist_tuples, stat_function_list=[calc_coverage_by_LEFs], 
+                             load_function=load_pickle,roundDecimals=2, **kwargs):
+    ''' averages the stat functions over each group of filelists; kwargs needed depend on functions called
+      usage: _create_loopstats_report([(filelist1,'smclife1'), (filelist2,'smclife2') ], 
+                                        stat_function_list=[calc_loop_size,calc_coverage_by_LEFs, calc_LEF_stalling_by_leg, calc_boundary_occupancy], 
+                                        load_function= load_pickle, roundDecimals=2, 
+                                         **{'polymer_length': polymer_length , 'boundary_list':boundaries_all}  )     '''
+    loopstats_report = []
+    loopstats_indicies =[]
+    for filelist, filename in filelist_tuples:
+        loopstats_indicies.append(filename)
+        loopstats_report.append(  np.mean( _calc_stats(filelist, stat_function_list, load_function, **kwargs),axis = 0) )
+    loopstats_report = np.array(loopstats_report) 
+    if roundDecimals != None: loopstats_report = np.round(loopstats_report,roundDecimals)
+
+    column_names = []
+    for stat_function in stat_function_list:
+        column_names.append( stat_output_dict[stat_function])
+    column_names = flatten(column_names)
+
+    loopstats_report = pd.DataFrame( loopstats_report, columns = column_names)#, index=loopstats_indicies)
+    loopstats_report['row_names'] = loopstats_indicies
+    loopstats_report.set_index('row_names',inplace=True, drop=True)
+    return loopstats_report
 
 
 

From d9dedda490c4100c9597b2b6e7d7f5fbac5e9d16 Mon Sep 17 00:00:00 2001
From: gfudenberg <geoff.fudenberg@gmail.com>
Date: Mon, 8 Oct 2018 00:31:31 -0700
Subject: [PATCH 4/4] a couple more stats

---
 looplib/loopstats.py | 69 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 64 insertions(+), 5 deletions(-)

diff --git a/looplib/loopstats.py b/looplib/loopstats.py
index a0278d3..11cb247 100644
--- a/looplib/loopstats.py
+++ b/looplib/loopstats.py
@@ -2,11 +2,24 @@
 from inspect import getfullargspec
 import pandas as pd
 
+import bioframe
+from bioframe.tools import tsv, bedtools
+def bedtools_intersect_counts(left, right, **kwargs):
+    with tsv(left) as a, tsv(right) as b:
+        out = bedtools.intersect(a=a.name, b=b.name,c=True)
+        out.columns = list(left.columns) + ['counts']
+    return out
+
 #### utilities ####
 
 def load_pickle(filename): 
     return pickle.load(open( filename,'rb'  ) )
 
+
+def load_joblib_data(filename):
+    return joblib.load(filename.replace('SMC','block'))['data']
+
+
 stat_output_dict = {} # create a dictionary of statistic outputs, to pass to the report
 
 #### boundary_list independent statistics #####
@@ -35,7 +48,6 @@ def calc_loop_size(LEF_array,polymer_length):
 def calc_LEF_stalling_by_leg(LEF_array, polymer_length, boundary_list):
     ''' calculates the fraction of LEFs with two, one, or no legs stalled at boundaries.
         returns the fraction with both legs, one leg, or neither leg overlapping boundary_list    '''
-    if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs')
     boundary_list = np.array(boundary_list,dtype=int).flatten()
     isBoundary =  np.histogram(boundary_list,  np.arange(0,polymer_length+1))[0]
     LEF_arm_status = np.sum( isBoundary[LEF_array] , 1)
@@ -45,7 +57,6 @@ def calc_LEF_stalling_by_leg(LEF_array, polymer_length, boundary_list):
 def calc_boundary_occupancy(LEF_array, polymer_length, boundary_list):
     ''' calculates the fraction of LEFs with two, one, or no legs stalled at boundaries.
         returns the fraction with both legs, one leg, or neither leg overlapping boundary_list  '''
-    if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs')
     boundary_list = np.array(boundary_list,dtype=int).flatten()
     bb = np.arange(0,polymer_length+1, 1); bb_mids = .5*(bb[:-1]+bb[1:])
     extruderHistogram, b = np.histogram( LEF_array ,bb)
@@ -56,17 +67,63 @@ def calc_boundary_occupancy(LEF_array, polymer_length, boundary_list):
 stat_output_dict[ calc_boundary_occupancy] = ['boundary_occupancy','non_occupancy']
 
 
+def calc_boundary_crossing_percent(LEF_array, polymer_length, boundary_list):
+    ''' calculates the fraction of LEFs with two, one, or no legs stalled at boundaries.
+        returns the fraction with both legs, one leg, or neither leg overlapping boundary_list  '''
+    boundary_list = np.array(boundary_list,dtype=int).flatten()
+    allLocs_df = pd.DataFrame(boundary_list, columns=['start'])
+    allLocs_df['end'] = boundary_list
+    allLocs_df['chr'] = 13
+    allLocs_df = allLocs_df[['chr','start','end']]
+    
+    LEF_array_df = pd.DataFrame(LEF_array,columns=['start','end'])
+    LEF_array_df['chr'] = 13
+    LEF_array_df= LEF_array_df[['chr','start','end']]
+    LEF_array_df.sort_values(['start'], inplace=True)                    
+    LEF_array_shrink_df = LEF_array_df.copy()
+    LEF_array_shrink_df['start'] = LEF_array_shrink_df['start'].values+1
+    LEF_array_shrink_df['end']   = np.maximum( LEF_array_shrink_df['end'].values-1, LEF_array_shrink_df['start'])
+                    
+    percent_crossing = np.sum((bedtools_intersect_counts( LEF_array_shrink_df, allLocs_df)['counts'] > 0)  
+                            / len(LEF_array) )
+    return [percent_crossing]
+stat_output_dict[ calc_boundary_crossing_percent] = ['percent_crossing']
+
+def calc_numLoops_given_boundaryPair_contact(LEF_array, polymer_length, boundary_pair_array, data):
+    LEF_array_df = pd.DataFrame(LEF_array,columns=['start','end'])
+    LEF_array_df['chr'] = 13
+    LEF_array_df= LEF_array_df[['chr','start','end']]
+    
+    sep2_dists =   np.sum( (data[boundary_pair_array[:,0],:] 
+                            - data[boundary_pair_array[:,1],:])**2.,axis=1 )**.5 
+    sep2_contacts = boundary_pair_array[ sep2_dists < 3 ,:]
+
+    sep2_contacts_df = pd.DataFrame(sep2_contacts,columns=['start','end'])
+    sep2_contacts_df['chr'] = 13
+    sep2_contacts_df= sep2_contacts_df[['chr','start','end']]
+    sep2_contacts_df.sort_values(['start'], inplace=True)
+    
+    cbins = np.arange(0,len(LEF_array),1)
+    a,b = np.histogram( bedtools_intersect_counts(sep2_contacts_df, LEF_array_df)['counts'].values  , cbins)
+    a = a/np.sum(a)
+    no_loops, single_loop, multiple_loops = ( a[0], a[1], np.sum(a[2:]) )    
+    return [no_loops, single_loop, multiple_loops]
+stat_output_dict[calc_numLoops_given_boundaryPair_contact]= ['no_loops','one_loop','two_plus_loops']
+
 
 ##### averaging and summarizing ####
 
 def flatten(l): return flatten(l[0]) + (flatten(l[1:]) if len(l) > 1 else []) if type(l) is list else [l] # since list(itertools.chain.from_iterable(newlist)) doesn't quite cut it...
 
-def _calc_stats(filelist, stat_function_list=[calc_coverage_by_LEFs], load_function=load_pickle, **kwargs):
+def _calc_stats(filelist, stat_function_list=[calc_coverage_by_LEFs], load_function=load_pickle, load_function_data=None, **kwargs):
     ''' takes  a list of filenames returns loop statistics over these files. can be  parallelized in the future...'''
     stat_list = []
     for f in filelist:
         try:
             LEF_array  = np.array(load_function(f),dtype=int)
+            if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs')
+            if load_function_data != None:
+                data = np.array(load_function_data(f))
             filestats = []
             for stat_function in stat_function_list:
                 numInputs = len( getfullargspec(stat_function)[0])
@@ -74,6 +131,8 @@ def _calc_stats(filelist, stat_function_list=[calc_coverage_by_LEFs], load_funct
                     filestats.append( stat_function(LEF_array, kwargs['polymer_length']))
                 elif numInputs == 3:
                     filestats.append( stat_function(LEF_array, kwargs['polymer_length'], kwargs['boundary_list']))
+                elif numInputs == 4:
+                    filestats.append( stat_function(LEF_array, kwargs['polymer_length'], kwargs['boundary_pair_array'], data ))
             stat_list.append(flatten(filestats))
         except:
            print('bad file', f);   continue
@@ -81,7 +140,7 @@ def _calc_stats(filelist, stat_function_list=[calc_coverage_by_LEFs], load_funct
 
 
 def _create_loopstats_report(filelist_tuples, stat_function_list=[calc_coverage_by_LEFs], 
-                             load_function=load_pickle,roundDecimals=2, **kwargs):
+                             load_function=load_pickle, load_function_data=None,roundDecimals=2, **kwargs):
     ''' averages the stat functions over each group of filelists; kwargs needed depend on functions called
       usage: _create_loopstats_report([(filelist1,'smclife1'), (filelist2,'smclife2') ], 
                                         stat_function_list=[calc_loop_size,calc_coverage_by_LEFs, calc_LEF_stalling_by_leg, calc_boundary_occupancy], 
@@ -91,7 +150,7 @@ def _create_loopstats_report(filelist_tuples, stat_function_list=[calc_coverage_
     loopstats_indicies =[]
     for filelist, filename in filelist_tuples:
         loopstats_indicies.append(filename)
-        loopstats_report.append(  np.mean( _calc_stats(filelist, stat_function_list, load_function, **kwargs),axis = 0) )
+        loopstats_report.append(  np.mean( _calc_stats(filelist, stat_function_list, load_function, load_function_data, **kwargs),axis = 0) )
     loopstats_report = np.array(loopstats_report) 
     if roundDecimals != None: loopstats_report = np.round(loopstats_report,roundDecimals)