regression fixed for csv/tsv export

cancerregulome · Jul 29, 2014 · e9d6a00 · e9d6a00
commit e9d6a00
Show file tree

Hide file tree

Showing 83 changed files with 176,124 additions and 0 deletions.
diff --git a/.hgignore b/.hgignore
@@ -0,0 +1,11 @@
+#glob syntax
+
+syntax: glob
+
+*.png
+*.jpg
+*.ico
+*.gif
+*.pom
+.idea*
+.DS_Store
diff --git a/src/dataimport/META_example/META.example.pw b/src/dataimport/META_example/META.example.pw
@@ -0,0 +1,39 @@
+#drwxrwsr-x. 2 erkkila2 csbgroup 4.0K Jul  9 13:59 .
+#-rwxr-x---. 1 erkkila2 csbgroup 1.5G Jul  9 14:00 coadread.merge.12apr.rnaseq.276.hg18Plus.rface
+#
+[build]
+source=TCGA
+afm=/titan/cancerregulome3/TCGA/outputs/kirc/kirc.bigMerge.25jun.tsv
+annotations=na
+quantile_features=CNVR,GEXP,METH
+associations=/titan/cancerregulome3/TCGA/outputs/kirc/keep.pwpvD.short.sort
+interesting_scores=/titan/cancerregulome3/TCGA/outputs/kirc/featScores.pwpvD.short.txt
+afm_description=Kidney Renal AllPairs 
+comment=
+dataset_label=kirc_31july_pw
+dataset_date=31-07-12
+[email protected]
+disease_code=KIRC
+#python bin must be version 2.5+ and have the MySQLdb package installed
+python_bin=python
+
+#the directory needs to exist and end in /
+[results]
+path=/home/csbgroup/public_html/RE/dataimport/results/
+#/proj/ilyalab/jlin/load_associations/dataimport_meta/python/results/
+
+[dbetl]
+#only supports blank, absolute, negative, negative_log10
+pvalue_transform=absolute
+#collapse_edge_directions=1 implies taking the (A->B, B->A) set with the higher rf-ace importance
+#n/a for pairwise
+collapse_edge_directions=1
+reverse_directions=1
+keep_unmapped_associations=1
+
+
+#ISB specific - requires smtp python module
+[pubcrawl]
+dopubcrawl=no
+[email protected]
+
diff --git a/src/dataimport/META_example/META.example.rface b/src/dataimport/META_example/META.example.rface
@@ -0,0 +1,37 @@
+#RE DataImport META configs
+[build]
+source=TCGA
+afm=/home/csbgroup/re_data/tcga/crc/coadread.all.23jan14.TP.afm
+annotations=na
+quantile_features=CNVR
+associations=/home/csbgroup/re_data/tcga/crc/coadread.all.23jan14.TP.rface
+interesting_scores=/home/csbgroup/re_data/tcga/crc/featScores.coadread.all.23jan14.TP.txt
+#good idea to include method_version
+afm_description=CRC RFACE_1.0.4
+comment=
+dataset_label=crc_31july_test
+dataset_date=31-07-12
+[email protected]
+disease_code=COADREAD
+#python bin must be version 2.5+ and have the MySQLdb package installed
+python_bin=python
+
+#the directory needs to exist and end in /
+[results]
+path=/home/csbgroup/public_html/RE/dataimport/results/
+#/proj/ilyalab/jlin/load_associations/dataimport_meta/python/results/
+
+[dbetl]
+#only supports absolute, negative, negative_log10
+pvalue_transform=negative_log10
+#collapse_edge_directions=1 implies taking the (A->B, B->A) set with the higher rf-ace importance
+#the following settings are valid for rface only
+collapse_edge_directions=1
+reverse_directions=1
+process_gene_interest_score=0
+keep_unmapped_associations=1
+
+#ISB specific - requires smtp python module
+[pubcrawl]
+dopubcrawl=no
+[email protected]
diff --git a/src/dataimport/README b/src/dataimport/README
@@ -0,0 +1,46 @@
+#RE-Config and MySql Data Import notes and requirements
+#Requires python 2.5, 2.6, or 2.7 and the MySQLdb library
+
+The python executable is explicitly set to /tools/bin/python2.7 in the sh scripts; update it as needed.
+
+Set up the RE admin dependencies and set permissions by executing rfex_admin.sql with root on the mysql server:
+
+mysql -u root -ppassword < rfex_admin.sql
+
+It is also recommended that you take a look at MySQL's /etc/my.cnf. If you have millions of associations, you should consider following the my-large.cnf settings (see the MySQL documentation).
+
+#Config
+If you want to give your database a schema name other than tcga, replace 'tcga' in rfex_admin.sql with the name you want and then modify one of the example config/rfex_sql.config files.
+The host can be localhost or the actual server name. If you are running the data import on the db server, localhost is best; if you are running it from a different server, put in the full address, such as machine.systemsbiology.net, with the appropriate port. The server you are on must have access to the db server.
+
+The rfex_admin.sql contains grant statements and it is important that you replace the existing server name of saskatoon with your server name. 
+Within the config file, the specified path must already exist, since the script only creates directories relative to this path.
+[results]
+path=/titan/cancerregulome3/TCGA/outputs_sandbox/parsed_associations
+
+If you change the random_forest database name used for the chrom and ref info, be aware that RE contains a google-dsapi-svc.config file containing this database name; you will need to update that file and then redeploy the war file under the same name on your web app server.
+
+#MySQL Engine
+RE db tables use the MyISAM engine rather than InnoDB, since MyISAM is better suited to read-intensive (select) workloads, offers full-text indexing, and is simpler to design and drop.
+
+#SH
+From the python dir, run the sh scripts:
+There should be sh files corresponding to the config files set up above; the arguments required by the sh scripts are:
+dataset_label feature_matrix_file associations_file dataset_comment dataset_description re_instance
+
+#RFACE analysis example:
+sh load_rface_associations.sh test_gbm_rface /titan/cancerregulome3/TCGA/outputs/gbm/gbm.merge.u133a.31oct.hg18.tsv /titan/cancerregulome3/TCGA/outputs/gbm/rf.u133a.31oct.mask1.F/all_associations.out "BRCA Her2 subset" "59 Her2-classified patients" internal
+
+#All Pairs example:
+sh load_pairwise_associations.sh test_kirc_0206_pw /titan/cancerregulome3/TCGA/outputs/kirc/bigMerge.06feb12.hg18.tsv /titan/cancerregulome3/TCGA/outputs/kirc/bigMerge.06feb12.pwpv "Kidney" "Kidney sandbox" public
+
+Regarding processing time, we are averaging about 800,000-1 million edges per minute. It is recommended that you modify your /etc/my.cnf to have high memory settings as quite a number of views, indexes and buffers are used. 
+
+You can load multiple associations (right now limited to RFACE and pairwise) by using 
+start_load_feature_associations.sh
+
+For example:
+sh start_load_feature_associations.sh test_gbm /titan/cancerregulome3/TCGA/outputs/gbm/gbm.merge.u133a.31oct.hg18.tsv /titan/cancerregulome3/TCGA/outputs/gbm/rf.u133a.31oct.mask1.F/all_associations.out "test new flow" "test" internal /titan/cancerregulome3/TCGA/outputs/gbm/bigMerge.06feb12.1e04_1e08_8_0.pwpv
+
+Please contact [email protected] with any questions.
+Thanks
diff --git a/src/dataimport/config/META b/src/dataimport/config/META
@@ -0,0 +1,10 @@
+#Example: sh /home/csbgroup/public_html/RE/dataimport/python/load_rface_associations.sh tcga_coadread_nov10 /home/csbgroup/tcga/rf-ace/coadread/coadread.nov10.tsv /home/csbgroup/tcga/rf-ace/coadread/coadread.nov10.associations.filtered.tsv "TCGA Colorectal" "466 patients" public
+#
+[build]
+afm=/path/matrix.afm.tsv
+annotations=/path/matrix.feature.annotation.tsv
+associations=/path/rface.associations
+afm_description=TUT Prostate AFM 20K features 100 samples
+comment=your build comment
+dataset_label=gbm_dataset_01May
+intermediate_results_dir=./results
diff --git a/src/dataimport/config/rfex_sql.config b/src/dataimport/config/rfex_sql.config
@@ -0,0 +1,32 @@
+[mysql_configs]
+host=
+port=3306
+db=tcga
+username=
+password=
+
+[solr_configs]
+solrpath=http://host:8080/solr
+
+[cutoff]
+pvalue=.5
+#rface method
+importance=0.0001
+correlation=0
+#pairwise method
+loggedpvalue=-4
+
+#this is used for the dataset label, i.e. cancer_type_method_date
+#the list tokens can be any strings as long as they match; otherwise
+#the update dataset script will fail
+[cancer_types]
+list=brca,coad,coadread,gbm,ov
+
+[pubcrawl]
+dopubcrawl=no
+pubcrawl_contact=
+
+[results]
+path=
+dosmtp=no
+notify=
diff --git a/src/dataimport/config/rfex_sql_breve.config b/src/dataimport/config/rfex_sql_breve.config
@@ -0,0 +1,33 @@
+[mysql_configs]
+host=breve
+port=3306
+db=tcga
+username=visquick_rw
+password=r34dwr1t3
+
+[solr_configs]
+solrpath=http://glados9:7080/solr
+
+#[cutoff]
+#pvalue=.5
+#rface method
+#importance=0.0001
+#correlation=0
+#pairwise method
+#loggedpvalue=-4
+
+#[cancer_types]
+#list=brca,coad,coadread,gbm,ov
+
+[pubcrawl]
+dopubcrawl=no
+[email protected],[email protected]
+
+#[results]
+#path=/local/tcga/re_dbetl
+#/proj/ilyalab/jlin/load_associations/dataimport_meta/python/results
+
+[notification]
+dosmtp=no
+[email protected]
+
diff --git a/src/dataimport/config/rfex_sql_sandbox.config b/src/dataimport/config/rfex_sql_sandbox.config
@@ -0,0 +1,29 @@
+[mysql_configs]
+host=
+port=
+db=
+username=
+password=
+
+[cutoff]
+pvalue=.5
+#rface method
+importance=0.0001
+correlation=0
+#pairwise method
+loggedpvalue=-4
+
+#this is used for the dataset label, i.e. cancer_type_method_date
+#the list tokens can be any strings as long as they match; otherwise
+#the update dataset script will fail
+[cancer_types]
+list=brca,coad,coadread,gbm,ov,kirc
+
+[pubcrawl]
+dopubcrawl=no
+[email protected],[email protected]
+
+[results]
+path=/titan/cancerregulome3/TCGA/outputs_sandbox/parsed_associations
+dosmtp=no
+[email protected],[email protected],[email protected],[email protected],[email protected]