Skip to content

Commit

Permalink
regression fixed for csv/tsv export
Browse files Browse the repository at this point in the history
  • Loading branch information
Dick Kreisberg committed Jul 29, 2014
0 parents commit e9d6a00
Show file tree
Hide file tree
Showing 83 changed files with 176,124 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .hgignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#glob syntax

syntax: glob

*.png
*.jpg
*.ico
*.gif
*.pom
.idea*
.DS_Store
39 changes: 39 additions & 0 deletions src/dataimport/META_example/META.example.pw
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#drwxrwsr-x. 2 erkkila2 csbgroup 4.0K Jul 9 13:59 .
#-rwxr-x---. 1 erkkila2 csbgroup 1.5G Jul 9 14:00 coadread.merge.12apr.rnaseq.276.hg18Plus.rface
#
[build]
source=TCGA
afm=/titan/cancerregulome3/TCGA/outputs/kirc/kirc.bigMerge.25jun.tsv
annotations=na
quantile_features=CNVR,GEXP,METH
associations=/titan/cancerregulome3/TCGA/outputs/kirc/keep.pwpvD.short.sort
interesting_scores=/titan/cancerregulome3/TCGA/outputs/kirc/featScores.pwpvD.short.txt
afm_description=Kidney Renal AllPairs
comment=
dataset_label=kirc_31july_pw
dataset_date=31-07-12
[email protected]
disease_code=KIRC
#python bin must be version 2.5+ and have the MySQLdb package installed
python_bin=python

#the directory needs to exist and end in /
[results]
path=/home/csbgroup/public_html/RE/dataimport/results/
#/proj/ilyalab/jlin/load_associations/dataimport_meta/python/results/

[dbetl]
#only supports blank, absolute, negative, negative_log10
pvalue_transform=absolute
#collapse_edge_directions=1 implies taking the higher-importance rf-ace edge of the (A->B, B->A) set
#n/a for pairwise
collapse_edge_directions=1
reverse_directions=1
keep_unmapped_associations=1


#ISB specific - requires smtp python module
[pubcrawl]
dopubcrawl=no
[email protected]

37 changes: 37 additions & 0 deletions src/dataimport/META_example/META.example.rface
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#RE DataImport META configs
[build]
source=TCGA
afm=/home/csbgroup/re_data/tcga/crc/coadread.all.23jan14.TP.afm
annotations=na
quantile_features=CNVR
associations=/home/csbgroup/re_data/tcga/crc/coadread.all.23jan14.TP.rface
interesting_scores=/home/csbgroup/re_data/tcga/crc/featScores.coadread.all.23jan14.TP.txt
#good idea to include method_version
afm_description=CRC RFACE_1.0.4
comment=
dataset_label=crc_31july_test
dataset_date=31-07-12
[email protected]
disease_code=COADREAD
#python bin must be version 2.5+ and have the MySQLdb package installed
python_bin=python

#the directory needs to exist and end in /
[results]
path=/home/csbgroup/public_html/RE/dataimport/results/
#/proj/ilyalab/jlin/load_associations/dataimport_meta/python/results/

[dbetl]
#only supports absolute, negative, negative_log10
pvalue_transform=negative_log10
#collapse_edge_directions=1 implies taking the higher-importance rf-ace edge of the (A->B, B->A) set
#the following settings are valid for rface only
collapse_edge_directions=1
reverse_directions=1
process_gene_interest_score=0
keep_unmapped_associations=1

#ISB specific - requires smtp python module
[pubcrawl]
dopubcrawl=no
[email protected]
46 changes: 46 additions & 0 deletions src/dataimport/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#RE-Config and MySql Data Import notes and requirements
#Requires python 2.5, 2.6, or 2.7 and libraries MySQLdb

The python executable is explicitly set to /tools/bin/python2.7 in the sh scripts; update it as needed.

Set up the RE admin dependencies and set permissions by executing rfex_admin.sql as root on the mysql server:

mysql -u root -ppassword < rfex_admin.sql

It is also recommended that you take a look at MySQL's /etc/my.cnf. If you have millions of associations, you should consider following the my-large.cnf settings (see the MySQL documentation).

#Config
If you want to give your database a schema name other than tcga, replace 'tcga' in rfex_admin.sql with the name you want and then modify one of the example config/rfex_sql.config files.
The host can be localhost or the actual server name. If you are running the data import on the db server, localhost is best; if you are running the data import from a different server, put in the entire address, such as machine.systemsbiology.net, with the appropriate port. The server you are on must have access to the db server.

rfex_admin.sql contains grant statements, and it is important that you replace the existing server name, saskatoon, with your own server name.
Within the config file, the path specified needs to exist already, since the script only creates directories dynamically relative to this path.
[results]
path=/titan/cancerregulome3/TCGA/outputs_sandbox/parsed_associations

If you change the random_forest database name used for the chrom and ref info, be aware that RE contains a google-dsapi-svc.config file containing this database name; you will need to update that file and then redeploy the war file, under the same name, in your web app server.

#MySQL Engine
RE db tables use the MyISAM engine rather than InnoDB, since MyISAM is better suited to read-intensive (select) workloads, offers full-text indexing, and is simpler to design and drop.

#SH
From the python dir, run the sh scripts:
There should be an sh file corresponding to each of the config files set up above. The arguments required by the sh scripts are:
dataset_label feature_matrix_file associations_file dataset_comment dataset_description re_instance

#RFACE analysis example:
sh load_rface_associations.sh test_gbm_rface /titan/cancerregulome3/TCGA/outputs/gbm/gbm.merge.u133a.31oct.hg18.tsv /titan/cancerregulome3/TCGA/outputs/gbm/rf.u133a.31oct.mask1.F/all_associations.out "BRCA Her2 subset" "59 Her2-classified patients" internal

#All Pairs example:
sh load_pairwise_associations.sh test_kirc_0206_pw /titan/cancerregulome3/TCGA/outputs/kirc/bigMerge.06feb12.hg18.tsv /titan/cancerregulome3/TCGA/outputs/kirc/bigMerge.06feb12.pwpv "Kidney" "Kidney sandbox" public

Regarding processing time, we are averaging about 800,000-1 million edges per minute. It is recommended that you modify your /etc/my.cnf to have high memory settings as quite a number of views, indexes and buffers are used.

You can load multiple associations (right now limited to RFACE and pairwise) by using
start_load_feature_associations.sh

e.g.
sh start_load_feature_associations.sh test_gbm /titan/cancerregulome3/TCGA/outputs/gbm/gbm.merge.u133a.31oct.hg18.tsv /titan/cancerregulome3/TCGA/outputs/gbm/rf.u133a.31oct.mask1.F/all_associations.out "test new flow" "test" internal /titan/cancerregulome3/TCGA/outputs/gbm/bigMerge.06feb12.1e04_1e08_8_0.pwpv

Please contact [email protected] with any questions.
Thanks
10 changes: 10 additions & 0 deletions src/dataimport/config/META
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#Example: sh /home/csbgroup/public_html/RE/dataimport/python/load_rface_associations.sh tcga_coadread_nov10 /home/csbgroup/tcga/rf-ace/coadread/coadread.nov10.tsv /home/csbgroup/tcga/rf-ace/coadread/coadread.nov10.associations.filtered.tsv "TCGA Colorectal" "466 patients" public
#
[build]
afm=/path/matrix.afm.tsv
annotations=/path/matrix.feature.annotation.tsv
associations=/path/rface.associations
afm_description=TUT Prostate AFM 20K features 100 samples
comment=your build comment
dataset_label=gbm_dataset_01May
intermediate_results_dir=./results
32 changes: 32 additions & 0 deletions src/dataimport/config/rfex_sql.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[mysql_configs]
host=
port=3306
db=tcga
username=
password=

[solr_configs]
solrpath=http://host:8080/solr

[cutoff]
pvalue=.5
#rface method
importance=0.0001
correlation=0
#pairwise method
loggedpvalue=-4

#this is used for dataset label, ie cancer_type_method_date
#the list tokens can be any string as long as it matches, otherwise
#the update dataset script will fail
[cancer_types]
list=brca,coad,coadread,gbm,ov

[pubcrawl]
dopubcrawl=no
pubcrawl_contact=

[results]
path=
dosmtp=no
notify=
33 changes: 33 additions & 0 deletions src/dataimport/config/rfex_sql_breve.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
[mysql_configs]
host=breve
port=3306
db=tcga
username=visquick_rw
password=r34dwr1t3

[solr_configs]
solrpath=http://glados9:7080/solr

#[cutoff]
#pvalue=.5
#rface method
#importance=0.0001
#correlation=0
#pairwise method
#loggedpvalue=-4

#[cancer_types]
#list=brca,coad,coadread,gbm,ov

[pubcrawl]
dopubcrawl=no
[email protected],[email protected]

#[results]
#path=/local/tcga/re_dbetl
#/proj/ilyalab/jlin/load_associations/dataimport_meta/python/results

[notification]
dosmtp=no
[email protected]

29 changes: 29 additions & 0 deletions src/dataimport/config/rfex_sql_sandbox.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[mysql_configs]
host=
port=
db=
username=
password=

[cutoff]
pvalue=.5
#rface method
importance=0.0001
correlation=0
#pairwise method
loggedpvalue=-4

#this is used for dataset label, ie cancer_type_method_date
#the list tokens can be any string as long as it matches, otherwise
#the update dataset script will fail
[cancer_types]
list=brca,coad,coadread,gbm,ov,kirc

[pubcrawl]
dopubcrawl=no
[email protected],[email protected]

[results]
path=/titan/cancerregulome3/TCGA/outputs_sandbox/parsed_associations
dosmtp=no
[email protected],[email protected],[email protected],[email protected],[email protected]
Loading

0 comments on commit e9d6a00

Please sign in to comment.