Skip to content

Commit

Permalink
Merge branch 'master' into doc
Browse files Browse the repository at this point in the history
  • Loading branch information
choishingwan committed Oct 8, 2019
2 parents 18fb951 + 0bad376 commit 10ff53c
Show file tree
Hide file tree
Showing 45 changed files with 4,488 additions and 5,501 deletions.
7 changes: 7 additions & 0 deletions .bettercodehub.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
exclude:
- /lib/.*
- /src/dcdflib.cpp
- /src/cgranges.cpp
- /src/bgen_lib.cpp
- /src/plink_common.cpp
- /src/gzstream.cpp
3 changes: 2 additions & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[submodule "lib/eigen-git-mirror"]
path = lib/eigen-git-mirror
url = https://github.com/eigenteam/eigen-git-mirror
url = https://github.com/eigenteam/eigen-git-mirror.git
branch = branches/3.3
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
CXXFLAGS=-Wall -O3 -std=c++11 -DNDEBUG -march=native
ZLIB=/mnt/lustre/groups/ukbiobank/Edinburgh_Data/Software/PRSice-cpp_development/PRSice.code/lib/zlib-1.2.11/build/libz.a
CXX=/opt/apps/compilers/gcc/6.2.0/bin/g++
INCLUDES := -I inc/ -isystem lib/ -isystem lib/zlib-1.2.11/
INCLUDES := -I inc/ -isystem lib/ -isystem lib/zlib-1.2.11/ -isystem lib/eigen-git-mirror
THREAD := -Wl,--whole-archive -lpthread
SERVER := -L /usr/lib/x86_64-redhat-linux5E/lib64
GCC := -Wl,--no-whole-archive -static-libstdc++ -static-libgcc -static
Expand Down
11 changes: 7 additions & 4 deletions Makefile.win
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
#CXX=x86_64-w64-mingw32-g++
CXX=/usr/local/Cellar/mingw-w64/6.0.0_1/toolchain-x86_64/bin/x86_64-w64-mingw32-g++
version=6.0.0_2
build?=x86_64
dir=/usr/local/Cellar/mingw-w64/${version}/toolchain-${build}/
CXX=${dir}/bin/${build}-w64-mingw32-g++
CXXFLAGS=-Wall -O3 -std=gnu++11 -DNDEBUG -static -lpthread -lpsapi
INCLUDES := -I inc/ -isystem lib/ -isystem window/zlib-1.2.11/
INCLUDES := -I inc/ -isystem lib/ -isystem window/zlib-1.2.11_${build}/ -isystem lib/eigen-git-mirror/
CPPSRC := src/*.cpp
OBJ := bgen_lib.o binaryplink.o genotype.o misc.o regression.o snp.o binarygen.o commander.o main.o plink_common.o prsice.o region.o reporter.o gzstream.o dcdflib.o fastlm.o
ZLIB := window/zlib-1.2.11/libz.a /usr/local/Cellar/mingw-w64/6.0.0_1/toolchain-x86_64/x86_64-w64-mingw32/lib/libpsapi.a
ZLIB := window/zlib-1.2.11_${build}/libz.a ${dir}/${build}-w64-mingw32/lib/libpsapi.a
%.o: src/%.cpp
$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@

PRSice_win64.exe: $(OBJ)
PRSice_win${build}.exe: $(OBJ)
$(CXX) $(CXXFLAGS) $(INCLUDES) $^ $(ZLIB) -o $@
32 changes: 25 additions & 7 deletions PRSice.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ In_Regression <-
R2 <-
print.p <- R <- P <- value <- Phenotype <- Set <- PRS.R2 <- LCI <- UCI <- quant.ref <- NULL

r.version <- "2.2.7"
r.version <- "2.2.10"
# Help Messages --------------------------------------
help_message <-
"usage: Rscript PRSice.R [options] <-b base_file> <-t target_file> <--prsice prsice_location>\n
Expand Down Expand Up @@ -2072,7 +2072,7 @@ extract_matrix <- function(x, y) {
# With this update, we only allow a single base file therefore we don't even need the
# information of base here

if (!provided("target", argv)) {
if (!provided("target", argv) & !provided("target-list", argv)) {
stop("Target file name not found. You'll need to provide the target name for plotting! (even with --plot)")
}

Expand Down Expand Up @@ -2424,13 +2424,31 @@ if (provided("pheno_file", argv)) {
}
}else if(provided("target_list", argv)){
# Assume no header
target.info <- strsplit(argv$target_list, split = ",")[[1]]
target.list <- read.table(argv$target_list)
target.prefix <- target.list[1]
pheno.file <- paste0(target.prefix, ".fam")
if(provided("type", argv)){
if(argv$type=="bgen"){
stop("Error: You must provide a phenotype file for bgen list input")
}
if (length(target.info) == 2) {
pheno.file <- target.info[2]
if (provided("type", argv)) {
if (argv$type == "bgen") {
# sample file should contain FID and IID by format requirement
pheno.index <- 3
if (ignore_fid &
!is_sample_format(pheno.file))
pheno.index <- 2
}
}
} else{
if (provided("type", argv)) {
if (argv$type == "bgen") {
stop("Error: You must provide either a phenotype or sample file for bgen input")
} else if (argv$type == "bed") {
pheno.file <- paste0(target.prefix, ".fam")
}
} else{
# Because default is always plink
pheno.file <- paste0(target.prefix, ".fam")
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion helpers/help_messages.txt
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ m_help_message =
" --logit-perm When performing permutation, still use logistic\n"
" regression instead of linear regression. This\n"
" will substantially slow down PRSice\n"
" --memory Maximum memory usage allowed. PRSice will try\n"
" --memory Maximum memory usage allowed (in Mb). PRSice will try\n"
" its best to honor this setting\n"
" --non-cumulate Calculate non-cumulative PRS. PRS will be reset\n"
" to 0 for each new P-value threshold instead of\n"
Expand Down
72 changes: 29 additions & 43 deletions inc/binarygen.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,47 +33,38 @@
class BinaryGen : public Genotype
{
public:
BinaryGen(const std::string& list_file, const std::string& file,
const std::string& pheno_file, const std::string& out_prefix,
const double hard_threshold, const double dose_threshold,
const size_t thread, const bool use_inter,
const bool use_hard_coded, const bool no_regress,
const bool ignore_fid, const bool keep_nonfounder,
const bool keep_ambig, const bool is_ref, Reporter* reporter);
BinaryGen(const GenoFile& geno, const Phenotype& pheno,
const std::string& delim, Reporter* reporter);
~BinaryGen();

private:
//
/*!
* \brief check if the sample file is of the sample format specified by bgen
* or just a simple text file
* \return
*/
static bool check_is_sample_format(const std::string& input);

protected:
typedef std::vector<std::vector<double>> Data;
std::unordered_map<size_t, genfile::bgen::Context> m_context_map;
std::vector<genfile::byte_t> m_buffer1, m_buffer2;
std::string m_intermediate_file;
std::string m_id_delim;
unsigned long long m_data_size = 0;
bool m_intermediate = false;
bool m_target_plink = false;
bool m_ref_plink = false;
bool m_has_external_sample = false;

/*!
* \brief Generate the sample vector
* \return Vector containing the sample information
*/
std::vector<Sample_ID> gen_sample_vector(const std::string& delim);
//
/*!
* \brief check if the sample file is of the sample format specified by bgen
* or just a simple text file
* \return
*/
bool check_is_sample_format();
std::vector<Sample_ID> gen_sample_vector();
void
gen_snp_vector(const std::vector<IITree<size_t, size_t>>& exclusion_regions,
const std::string& out_prefix, Genotype* target = nullptr);
void calc_freq_gen_inter(const double& maf_threshold,
const double& geno_threshold,
const double& info_threshold,
const bool maf_filter, const bool geno_filter,
const bool info_filter, const bool hard_coded,
Genotype* target = nullptr);
bool calc_freq_gen_inter(const QCFiltering& filter_info,
const std::string& prefix,
Genotype* target = nullptr,
bool force_cal = false);
/*!
* \brief Read in the context information for the bgen. This will propergate
* the m_context_map
Expand All @@ -88,7 +79,6 @@ class BinaryGen : public Genotype
* \return true if the sample is consistent
*/
bool check_sample_consistent(const std::string& bgen_name,
const std::string& delim,
const genfile::bgen::Context& context);

/*!
Expand All @@ -99,14 +89,11 @@ class BinaryGen : public Genotype
* \param byte_pos is the streampos of the bgen file, for quick seek
* \param file_name is the file name of the bgen file
*/
inline void read_genotype(uintptr_t* genotype,
const unsigned long long byte_pos,
inline void read_genotype(uintptr_t* genotype, const long long byte_pos,
const size_t& file_idx)
{
const uintptr_t unfiltered_sample_ct4 =
(m_unfiltered_sample_ct + 3) / 4;
if (!m_genotype_file.mem_calculated())
{ m_genotype_file.init_memory_map(g_allowed_memory, m_data_size); }
if (m_ref_plink)
{
// when m_ref_plink is set, it suggest we are using the
Expand Down Expand Up @@ -145,7 +132,7 @@ class BinaryGen : public Genotype
* intermediate file
* \return 0 if sucessful
*/
uint32_t load_and_collapse_incl(const unsigned long long byte_pos,
uint32_t load_and_collapse_incl(const long long byte_pos,
const size_t& file_idx,
uintptr_t* __restrict mainbuf)
{
Expand Down Expand Up @@ -185,20 +172,21 @@ class BinaryGen : public Genotype

void read_score(const std::vector<size_t>::const_iterator& start_idx,
const std::vector<size_t>::const_iterator& end_idx,
bool reset_zero, const bool use_ref_maf);
bool reset_zero);
void hard_code_score(const std::vector<size_t>::const_iterator& start_idx,
const std::vector<size_t>::const_iterator& end_idx,
bool reset_zero, const bool use_ref_maf);
bool reset_zero);
void dosage_score(const std::vector<size_t>::const_iterator& start_idx,
const std::vector<size_t>::const_iterator& end_idx,
bool reset_zero);

/*
* Different structures use for reading in the bgen info
*/
// TODO: Use ref MAf for dosage score too
struct PRS_Interpreter
{
~PRS_Interpreter() {};
~PRS_Interpreter() {}
/*!
* \brief PRS_Interpreter is the structure used by BGEN library to parse
* the probability data
Expand Down Expand Up @@ -395,7 +383,7 @@ class BinaryGen : public Genotype
{
(*m_sample_prs)[i].prs =
(*m_sample_prs)[i].prs * m_not_first + m_miss_score;
cur_idx++;
++cur_idx;
}
else if (m_centre)
{
Expand Down Expand Up @@ -535,8 +523,6 @@ class BinaryGen : public Genotype
// when we calculate the expected value, we want to multiply the
// probability with our coding instead of just using byte
// representation
// checked with PLINK, the expected value should be coded as 0,
// 1, 2 instead of 0, 0.5, 1 as in PRS
m_exp_value += static_cast<double>(geno) * value;
}
/*!
Expand Down Expand Up @@ -594,18 +580,18 @@ class BinaryGen : public Genotype
m_genotype[m_index] |= m_geno << m_shift;
switch (m_geno)
{
case 3: m_homrar_ct++; break;
case 2: m_het_ct++; break;
case 1: m_missing_ct++; break;
case 0: m_homcom_ct++; break;
case 3: ++m_homrar_ct; break;
case 2: ++m_het_ct; break;
case 1: ++m_missing_ct; break;
case 0: ++m_homcom_ct; break;
}
// as the genotype is represented by two bit, we will +=2
m_shift += 2;
// if we reach the boundary, we will now add the index and reset
// the shift
if (m_shift == BITCT)
{
m_index++;
++m_index;
m_shift = 0;
}
// we can now push in the expected value for this sample. This
Expand Down
32 changes: 8 additions & 24 deletions inc/binaryplink.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,30 +41,24 @@ class BinaryPlink : public Genotype
* (F)
* \param reporter is the logger
*/
BinaryPlink(const std::string& file_list, const std::string& file,
const size_t thread, const bool ignore_fid,
const bool keep_nonfounder, const bool keep_ambig,
const bool is_ref, Reporter* reporter);
BinaryPlink() {}
BinaryPlink(const GenoFile& geno, const Phenotype& pheno,
const std::string& delim, Reporter* reporter);
~BinaryPlink();

protected:
std::vector<uintptr_t> m_sample_mask;
std::streampos m_prev_loc = 0;
std::vector<Sample_ID> gen_sample_vector(const std::string& delim);
std::vector<Sample_ID> gen_sample_vector();
void
gen_snp_vector(const std::vector<IITree<size_t, size_t>>& exclusion_regions,
const std::string& out_prefix, Genotype* target = nullptr);
void calc_freq_gen_inter(const double& maf_threshold,
const double& geno_threshold, const double&,
const bool maf_filter, const bool geno_filter,
const bool, const bool,
Genotype* target = nullptr);
bool calc_freq_gen_inter(const QCFiltering& filter_info, const std::string&,
Genotype* target = nullptr,
bool force_cal = false);
void check_bed(const std::string& bed_name, size_t num_marker,
uintptr_t& bed_offset);
inline void read_genotype(uintptr_t* __restrict genotype,
const unsigned long long byte_pos,
const size_t& file_idx)
const long long byte_pos, const size_t& file_idx)
{
// first, generate the mask to mask out the last few byte that we don't
// want (if our sample number isn't a multiple of 16, it is possible
Expand All @@ -76,16 +70,6 @@ class BinaryPlink : public Genotype

// now we start reading / parsing the binary from the file
assert(unfiltered_sample_ct);
// if we don't perform selection, we can directly perform the read on
// the mainbuf

// auto&& cur_map = m_genotype_files[file_idx];
if (!m_genotype_file.mem_calculated())
{
m_genotype_file.init_memory_map(g_allowed_memory,
unfiltered_sample_ct4);
}
// m_genotype_file.no_mmap();
if (m_unfiltered_sample_ct == m_founder_ct)
{
m_genotype_file.read(m_genotype_file_names[file_idx] + ".bed",
Expand Down Expand Up @@ -115,7 +99,7 @@ class BinaryPlink : public Genotype
virtual void
read_score(const std::vector<size_t>::const_iterator& start_idx,
const std::vector<size_t>::const_iterator& end_idx,
bool reset_zero, const bool use_ref_maf);
bool reset_zero);
};

#endif
Loading

0 comments on commit 10ff53c

Please sign in to comment.