From 01df5ecba00ab9012083904dda9371dcf454c686 Mon Sep 17 00:00:00 2001
From: Rahul Pisupati
Date: Fri, 26 Jan 2018 16:08:59 +0100
Subject: [PATCH] few changes and bugs. version up

---
 README.md                 |   2 +-
 setup.py                  |   2 +-
 snpmatch/__init__.py      |  14 +++-
 snpmatch/core/csmatch.py  |  21 +++--
 snpmatch/core/makedb.py   |   2 +-
 snpmatch/core/parsers.py  | 125 ++++++++++++++++++++++++++++++
 snpmatch/core/snpmatch.py | 159 +++++--------------------------
 7 files changed, 169 insertions(+), 156 deletions(-)
 create mode 100644 snpmatch/core/parsers.py

diff --git a/README.md b/README.md
index b4fd2ee..8262a28 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,7 @@ These scripts are implemented based on the *A. thaliana* genome sizes. But the g
 - 1.7.2: Stable version, 15-12-2016
 - 1.8.2: Stable version, 16-02-2017
 - 1.9.2: Stable version, 24-08-2017
+- 2.0.0: Stable version, 26-01-2018
 
 ## Credits
 
@@ -116,4 +117,3 @@ These scripts are implemented based on the *A. thaliana* genome sizes. But the g
 
 Pisupati, R. *et al.*. Verification of *Arabidopsis* stock collections using SNPmatch, a tool for genotyping high-plexed samples. *Nature Scientific Data* **4**, 170184 (2017). [doi:10.1038/sdata.2017.184](https://www.nature.com/articles/sdata2017184)
 
-
diff --git a/setup.py b/setup.py
index 089b6b9..eb13e80 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name='SNPmatch',
-    version='1.9.2',
+    version='2.0.0',
     description='A simple python library to identify the most likely strain given the SNPs for a sample',
     long_description=long_description,
     url='https://github.com/Gregor-Mendel-Institute/SNPmatch',
diff --git a/snpmatch/__init__.py b/snpmatch/__init__.py
index 5342dfa..da531a6 100644
--- a/snpmatch/__init__.py
+++ b/snpmatch/__init__.py
@@ -15,8 +15,8 @@ from snpmatch.core import simulate
 import logging, logging.config
 
-__version__ = '1.9.2'
-__updated__ = "31.8.2017"
+__version__ = '2.0.0'
+__updated__ = "26.01.2018"
 __date__ = "25.10.2016"
 
 
 def setLog(logDebug):
@@ -47,6 +47,7 @@ def get_options(program_license,program_version_message):
     inbred_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
     inbred_parser.add_argument("-o", "--output", dest="outFile", help="Output file with the probability scores")
     inbred_parser.set_defaults(func=snpmatch_inbred)
+
     cross_parser = subparsers.add_parser('cross', help="SNPmatch on the crosses (F2s and F3s) of A. thaliana")
     cross_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF/BED file for the variants in the sample")
     cross_parser.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
@@ -55,6 +56,7 @@ def get_options(program_license,program_version_message):
     cross_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
     cross_parser.add_argument("-o", "--output", dest="outFile", help="Output files with the probability scores and scores along windows")
     cross_parser.set_defaults(func=snpmatch_cross)
+
     genocross_parser = subparsers.add_parser('genotype_cross', help="Genotype the crosses by windows given parents")
     genocross_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF file for the variants in the sample")
     genocross_parser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
@@ -69,18 +71,21 @@ def get_options(program_license,program_version_message):
     parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
     parser.add_argument("-o", "--output", dest="outFile", help="output + .npz file is generater required for SNPmatch")
     parser.set_defaults(func=snpmatch_parser)
+
     pairparser = subparsers.add_parser('pairsnp', help="pairwise comparison of two snp files")
     pairparser.add_argument("-i", "--input_file_1", dest="inFile_1", help="VCF/BED file for the variants in the sample one")
     pairparser.add_argument("-j", "--input_file_2", dest="inFile_2", help="VCF/BED file for the variants in the sample two")
     pairparser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
     pairparser.add_argument("-o", "--output", dest="outFile", help="output json file")
     pairparser.set_defaults(func=snpmatch_paircomparions)
+
     makedbparser = subparsers.add_parser('makedb', help="Create database files from given VCF, only give biallelic SNPs")
-    makedbparser.add_argument("-i", "--input_vcf", dest="inFile", help="input VCF file for the known strains.")
+    makedbparser.add_argument("-i", "--input_vcf", dest="inFile", help="input VCF file for the known strains. You can also provide a CSV file which is an intermediate file in the process.")
     makedbparser.add_argument("-p", "--bcftools_path", dest="bcfpath", help="path to the bcftools executable. Not necessary if present in BASH PATH", default='')
     makedbparser.add_argument("-o", "--out_db_id", dest="db_id", help="output id for database files")
     makedbparser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
     makedbparser.set_defaults(func=makedb_vcf_to_hdf5)
+
     simparser = subparsers.add_parser('simulate', help="Given SNP database, check the genotyping efficiency randomly selecting 'n' number of SNPs")
     simparser.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
     simparser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
@@ -122,7 +127,8 @@ def snpmatch_parser(args):
     if not args['outFile']:
         if os.path.isfile(args['inFile'] + ".snpmatch.npz"):
             os.remove(args['inFile'] + ".snpmatch.npz")
-    snpmatch.parseInput(inFile = args['inFile'], logDebug = args['logDebug'], outFile = args['outFile'])
+    from snpmatch.core import parsers
+    parsers.parseInput(inFile = args['inFile'], logDebug = args['logDebug'], outFile = args['outFile'])
 
 def genotype_cross(args):
     #checkARGs(args)
diff --git a/snpmatch/core/csmatch.py b/snpmatch/core/csmatch.py
index b1e4eea..6779a5b 100644
--- a/snpmatch/core/csmatch.py
+++ b/snpmatch/core/csmatch.py
@@ -10,6 +10,7 @@
 import os
 import StringIO
 import snpmatch
+import parsers
 import json
 import itertools
 
@@ -92,10 +93,8 @@ def crossWindower(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, outFile)
             tempScore1 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs1, dtype=int).T, matchedTarWei[j:j+chunk_size,1]).T, axis=0)
             tempScore2 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs2, dtype=int).T, matchedTarWei[j:j+chunk_size,2]).T, axis=0)
             ScoreList = ScoreList + tempScore0 + tempScore1 + tempScore2
-            if(len(TarGTs0[j:j+chunk_size]) > 1):
+            if(len(TarGTs0[j:j+chunk_size]) >= 1):
                 NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0)
-            elif(len(TarGTs0[j:j+chunk_size]) == 1):
-                NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int)
         TotScoreList = TotScoreList + ScoreList
         TotNumInfoSites = TotNumInfoSites + NumInfoSites
         writeBinData(out_file, i, GenotypeData, ScoreList, NumInfoSites)
@@ -197,7 +196,7 @@ def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, Genoty
     score = 0
     numinfo = 0
     NumMatSNPs = 0
-    for ind,echr in enumerate(snpmatch.parseChrName(GenotypeData.chrs)[0]):
+    for ind,echr in enumerate(parsers.parseChrName(GenotypeData.chrs)[0]):
         perchrTarPos = np.where(snpCHR == echr)[0]
         perchrtarSNPpos = snpPOS[perchrTarPos]
         start = GenotypeData.chr_regions[ind][0]
@@ -223,7 +222,7 @@ def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, Genoty
     crossInterpreter(GenotypeData, binLen, outID)
 
 def potatoCrossIdentifier(args):
-    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = snpmatch.parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
+    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
     log.info("loading genotype files!")
     GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File'])
     GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
@@ -252,7 +251,7 @@ def getWindowGenotype(matchedP1, totalMarkers, lr_thres = 2.706):
 
 def crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2, inFile, binLen, outFile, logDebug = True):
     ## inFile are the SNPs of the sample
-    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = snpmatch.parseInput(inFile = inFile, logDebug = logDebug)
+    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = inFile, logDebug = logDebug)
     # identifying the segregating SNPs between the accessions
     # only selecting 0 or 1
     segSNPsind = np.where((snpsP1 != snpsP2) & (snpsP1 >= 0) & (snpsP2 >= 0) & (snpsP1 < 2) & (snpsP2 < 2))[0]
@@ -272,7 +271,7 @@ def crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2, inFile, b
         matchedTarInd = perchrTarPosind[np.where(np.in1d(perchrTarPos, reqPOS))[0]]
         matchedTarGTs = snpGT[matchedTarInd]
         try:
-            TarGTBinary = snpmatch.parseGT(matchedTarGTs)
+            TarGTBinary = parsers.parseGT(matchedTarGTs)
             TarGTBinary[np.where(TarGTBinary == 2)[0]] = 4
             genP1 = np.subtract(TarGTBinary, snpsP1[matchedAccInd])
             genP1no = len(np.where(genP1 == 0)[0])
@@ -299,8 +298,8 @@ def crossGenotyper(args):
     log.info("input files: %s and %s" % (args['parents'], args['father']))
     if not os.path.isfile(args['parents']) and os.path.isfile(args['father']):
         die("either of the input files do not exists, please provide VCF/BED file for parent genotype information")
-    (p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI, p1DPmean) = snpmatch.parseInput(inFile = args['parents'], logDebug = args['logDebug'])
-    (p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI, p2DPmean) = snpmatch.parseInput(inFile = args['father'], logDebug = args['logDebug'])
+    (p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI, p1DPmean) = parsers.parseInput(inFile = args['parents'], logDebug = args['logDebug'])
+    (p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI, p2DPmean) = parsers.parseInput(inFile = args['father'], logDebug = args['logDebug'])
     commonCHRs_ids = np.union1d(p1snpCHR, p2snpCHR)
     commonSNPsCHR = np.zeros(0, dtype=commonCHRs_ids.dtype)
     commonSNPsPOS = np.zeros(0, dtype=int)
@@ -316,8 +315,8 @@ def crossGenotyper(args):
             perchrsnpsP2 = np.repeat(-1, len(perchrPositions)).astype('int8')
             perchrsnpsP1_inds = np.where(np.in1d(p1snpPOS[perchrP1inds], perchrPositions))[0]
             perchrsnpsP2_inds = np.where(np.in1d(p2snpPOS[perchrP2inds], perchrPositions))[0]
-            snpsP1 = np.append(snpsP1, snpmatch.parseGT(p1snpGT[perchrsnpsP1_inds]))
-            snpsP2 = np.append(snpsP2, snpmatch.parseGT(p2snpGT[perchrsnpsP2_inds]))
+            snpsP1 = np.append(snpsP1, parsers.parseGT(p1snpGT[perchrsnpsP1_inds]))
+            snpsP2 = np.append(snpsP2, parsers.parseGT(p2snpGT[perchrsnpsP2_inds]))
         log.info("done!")
     else:
         parents = args['parents']
diff --git a/snpmatch/core/makedb.py b/snpmatch/core/makedb.py
index 7e8f91e..38457e3 100644
--- a/snpmatch/core/makedb.py
+++ b/snpmatch/core/makedb.py
@@ -89,7 +89,7 @@ def makedb_from_vcf(args):
         log.info('done!')
     elif inType == '.csv':
         log.info("converting CSV to hdf5!")
-        makeHDF5s(args['db_id'] + '.csv', args['db_id'])
+        makeHDF5s(args['inFile'], args['db_id'])
         log.info('done!')
     else:
         die("please provide either a VCF file or a CSV!")
diff --git a/snpmatch/core/parsers.py b/snpmatch/core/parsers.py
new file mode 100644
index 0000000..7064787
--- /dev/null
+++ b/snpmatch/core/parsers.py
@@ -0,0 +1,125 @@
+import pandas as pd
+import numpy as np
+import allel
+import snpmatch
+import logging
+import os
+import json
+
+log = logging.getLogger(__name__)
+
+def parseGT(snpGT):
+    first = snpGT[0]
+    snpBinary = np.zeros(len(snpGT), dtype = "int8")
+    if first.find('|') != -1:
+        ## GT is phased
+        separator = "|"
+    elif first.find('/') != -1:
+        ## GT is not phased
+        separator = "/"
+    elif np.char.isdigit(first):
+        return np.array(np.copy(snpGT), dtype = "int8")
+    else:
+        snpmatch.die("unable to parse the format of GT in vcf!")
+    hetGT = "0" + separator + "1"
+    refGT = "0" + separator + "0"
+    altGT = "1" + separator + "1"
+    nocall = "." + separator + "."
+    snpBinary[np.where(snpGT == altGT)[0]] = 1
+    snpBinary[np.where(snpGT == hetGT)[0]] = 2
+    snpBinary[np.where(snpGT == nocall)[0]] = -1
+    return snpBinary
+
+def parseChrName(targetCHR):
+    snpCHROM = np.char.replace(np.core.defchararray.lower(np.array(targetCHR, dtype="string")), "chr", "")
+    snpsREQ = np.where(np.char.isdigit(snpCHROM))[0] ## Filtering positions from mitochondrial and chloroplast
+    snpCHR = snpCHROM[snpsREQ]
+    return (snpCHR, snpsREQ)
+
+def readBED(inFile, logDebug):
+    log.info("reading the position file")
+    targetSNPs = pd.read_table(inFile, header=None, usecols=[0,1,2])
+    (snpCHR, snpsREQ) = parseChrName(targetSNPs[0])
+    snpPOS = np.array(targetSNPs[1], dtype=int)[snpsREQ]
+    snpGT = np.array(targetSNPs[2])[snpsREQ]
+    snpBinary = parseGT(snpGT)
+    snpWEI = np.ones((len(snpCHR), 3)) ## for homo and het
+    snpWEI[np.where(snpBinary != 0),0] = 0
+    snpWEI[np.where(snpBinary != 1),2] = 0
+    snpWEI[np.where(snpBinary != 2),1] = 0
+    return (snpCHR, snpPOS, snpGT, snpWEI)
+
+def readVcf(inFile, logDebug):
+    log.info("reading the VCF file")
+    ## We read only one sample from the VCF file
+    if logDebug:
+        vcf = allel.read_vcf(inFile, samples = [0], fields = '*')
+    else:
+        import StringIO
+        import sys
+        sys.stderr = StringIO.StringIO()
+        vcf = allel.read_vcf(inFile, samples = [0], fields = '*')
+        #vcf = vcfnp.variants(inFile, cache=False).view(np.recarray)
+        #vcfD = vcfnp.calldata_2d(inFile, cache=False).view(np.recarray)
+        sys.stderr = sys.__stderr__
+    (snpCHR, snpsREQ) = parseChrName(vcf['variants/CHROM'])
+    try:
+        snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0]
+    except AttributeError:
+        snpmatch.die("input VCF file doesnt have required GT field")
+    snpsREQ = snpsREQ[np.where(snpGT != './.')[0]]
+    snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0]
+    if 'calldata/PL' in sorted(vcf.keys()):
+        snpWEI = np.copy(vcf['calldata/PL'][snpsREQ, 0]).astype('float')
+        snpWEI = snpWEI/(-10)
+        snpWEI = np.exp(snpWEI)
+
+    else:
+        snpBinary = parseGT(snpGT)
+        snpWEI = np.ones((len(snpsREQ), 3)) ## for homo and het
+        snpWEI[np.where(snpBinary != 0),0] = 0
+        snpWEI[np.where(snpBinary != 1),2] = 0
+        snpWEI[np.where(snpBinary != 2),1] = 0
+    snpCHR = snpCHR[snpsREQ]
+    DPmean = np.mean(vcf['calldata/DP'][snpsREQ,0])
+    snpPOS = np.array(vcf['variants/POS'][snpsREQ])
+    return (DPmean, snpCHR, snpPOS, snpGT, snpWEI)
+
+def parseInput(inFile, logDebug, outFile = "parser"):
+    if outFile == "parser" or not outFile:
+        outFile = inFile + ".snpmatch"
+    if os.path.isfile(inFile + ".snpmatch.npz"):
+        log.info("snpmatch parser dump found! loading %s", inFile + ".snpmatch.npz")
+        snps = np.load(inFile + ".snpmatch.npz")
+        (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp'])
+    else:
+        _,inType = os.path.splitext(inFile)
+        if inType == '.npz':
+            log.info("loading snpmatch parser file! %s", inFile)
+            snps = np.load(inFile)
+            (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp'])
+        else:
+            log.info('running snpmatch parser!')
+            if inType == '.vcf':
+                (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = readVcf(inFile, logDebug)
+            elif inType == '.bed':
+                (snpCHR, snpPOS, snpGT, snpWEI) = readBED(inFile, logDebug)
+                DPmean = "NA"
+            else:
+                snpmatch.die("input file type %s not supported" % inType)
+        log.info("creating snpmatch parser file: %s", outFile + '.npz')
+        np.savez(outFile, chr = snpCHR, pos = snpPOS, gt = snpGT, wei = snpWEI, dp = DPmean)
+    NumSNPs = len(snpCHR)
+    case = 0
+    note = "Sufficient number of SNPs"
+    if NumSNPs < snpmatch.snp_thres:
+        note = "Attention: low number of SNPs provided"
+        case = 1
+    snpst = np.unique(snpCHR, return_counts=True)
+    snpdict = dict(('Chr%s' % snpst[0][i], snpst[1][i]) for i in range(len(snpst[0])))
+    statdict = {"interpretation": {"case": case, "text": note}, "snps": snpdict, "num_of_snps": NumSNPs, "depth": DPmean}
+    statdict['percent_heterozygosity'] = snpmatch.getHeterozygosity(snpGT)
+    with open(outFile + ".stats.json", "w") as out_stats:
+        out_stats.write(json.dumps(statdict))
+    log.info("done!")
+    return (snpCHR, snpPOS, snpGT, snpWEI, DPmean)
diff --git a/snpmatch/core/snpmatch.py b/snpmatch/core/snpmatch.py
index d51b0e5..9829407 100644
--- a/snpmatch/core/snpmatch.py
+++ b/snpmatch/core/snpmatch.py
@@ -4,12 +4,10 @@ import numpy as np
 import numpy.ma
 from pygwas.core import genotype
-import allel
-import pandas
 import logging
 import sys
 import os
-import StringIO
+import parsers
 import json
 
 log = logging.getLogger(__name__)
 
@@ -18,8 +16,8 @@
 prob_thres = 0.98
 
 def die(msg):
-    sys.stderr.write('Error: ' + msg + '\n')
-    sys.exit(1)
+    sys.stderr.write('Error: ' + msg + '\n')
+    sys.exit(1)
 
 def likeliTest(n, y):
     p = 0.99999999
@@ -79,95 +77,21 @@ def CaseInterpreter(overlap, NumSNPs, topHits, probScore):
     return (case, note)
 
 def print_topHits(outFile, AccList, ScoreList, NumInfoSites, overlap, NumMatSNPs):
-    num_lines = len(ScoreList)
-    (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites)
-    topHits = np.where(LikeLiHoodRatios < lr_thres)[0]
-    probScore = [float(ScoreList[i])/NumInfoSites[i] for i in range(num_lines)]
-    overlapScore = [float(NumInfoSites[i])/NumMatSNPs for i in range(num_lines)]
-    probScore = np.array(probScore, dtype = float)
-    sorted_order = topHits[np.argsort(-probScore[topHits])]
-    (case, note) = CaseInterpreter(overlap, NumMatSNPs, topHits, probScore)
-    matches_dict = [(AccList[i], probScore[i], NumInfoSites[i], overlapScore[i]) for i in sorted_order]
-    topHitsDict = {'overlap': [overlap, NumMatSNPs], 'matches': matches_dict, 'interpretation':{'case': case, 'text': note}}
-    with open(outFile, "w") as out_stats:
-        out_stats.write(json.dumps(topHitsDict))
-
-def parseGT(snpGT):
-    first = snpGT[0]
-    snpBinary = np.zeros(len(snpGT), dtype = "int8")
-    if first.find('|') != -1:
-        ## GT is phased
-        separator = "|"
-    elif first.find('/') != -1:
-        ## GT is not phased
-        separator = "/"
-    elif np.char.isdigit(first):
-        return np.array(np.copy(snpGT), dtype = "int8")
-    else:
-        die("unable to parse the format of GT in vcf!")
-    hetGT = "0" + separator + "1"
-    refGT = "0" + separator + "0"
-    altGT = "1" + separator + "1"
-    nocall = "." + separator + "."
-    snpBinary[np.where(snpGT == altGT)[0]] = 1
-    snpBinary[np.where(snpGT == hetGT)[0]] = 2
-    snpBinary[np.where(snpGT == nocall)[0]] = -1
-    return snpBinary
-
-def parseChrName(targetCHR):
-    snpCHROM = np.char.replace(np.core.defchararray.lower(np.array(targetCHR, dtype="string")), "chr", "")
-    snpsREQ = np.where(np.char.isdigit(snpCHROM))[0] ## Filtering positions from mitochondrial and chloroplast
-    snpCHR = snpCHROM[snpsREQ]
-    return (snpCHR, snpsREQ)
-
-def readBED(inFile, logDebug):
-    log.info("reading the position file")
-    targetSNPs = pandas.read_table(inFile, header=None, usecols=[0,1,2])
-    (snpCHR, snpsREQ) = parseChrName(targetSNPs[0])
-    snpPOS = np.array(targetSNPs[1], dtype=int)[snpsREQ]
-    snpGT = np.array(targetSNPs[2])[snpsREQ]
-    snpBinary = parseGT(snpGT)
-    snpWEI = np.ones((len(snpCHR), 3)) ## for homo and het
-    snpWEI[np.where(snpBinary != 0),0] = 0
-    snpWEI[np.where(snpBinary != 1),2] = 0
-    snpWEI[np.where(snpBinary != 2),1] = 0
-    return (snpCHR, snpPOS, snpGT, snpWEI)
-
-def readVcf(inFile, logDebug):
-    log.info("reading the VCF file")
-    ## We read only one sample from the VCF file
-    if logDebug:
-        vcf = allel.read_vcf(inFile, samples = [0], fields = '*')
-    else:
-        sys.stderr = StringIO.StringIO()
-        vcf = allel.read_vcf(inFile, samples = [0], fields = '*')
-        #vcf = vcfnp.variants(inFile, cache=False).view(np.recarray)
-        #vcfD = vcfnp.calldata_2d(inFile, cache=False).view(np.recarray)
-        sys.stderr = sys.__stderr__
-    (snpCHR, snpsREQ) = parseChrName(vcf['variants/CHROM'])
-    try:
-        snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0]
-    except AttributeError:
-        die("input VCF file doesnt have required GT field")
-    snpsREQ = snpsREQ[np.where(snpGT != './.')[0]]
-    snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0]
-    if 'calldata/PL' in sorted(vcf.keys()):
-        snpWEI = np.copy(vcf['calldata/PL'][snpsREQ, 0]).astype('float')
-        snpWEI = snpWEI/(-10)
-        snpWEI = np.exp(snpWEI)
-    else:
-        snpBinary = parseGT(snpGT)
-        snpWEI = np.ones((len(snpsREQ), 3)) ## for homo and het
-        snpWEI[np.where(snpBinary != 0),0] = 0
-        snpWEI[np.where(snpBinary != 1),2] = 0
-        snpWEI[np.where(snpBinary != 2),1] = 0
-    snpCHR = snpCHR[snpsREQ]
-    DPmean = np.mean(vcf['calldata/DP'][snpsREQ,0])
-    snpPOS = np.array(vcf['variants/POS'][snpsREQ])
-    return (DPmean, snpCHR, snpPOS, snpGT, snpWEI)
+    num_lines = len(ScoreList)
+    (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites)
+    topHits = np.where(LikeLiHoodRatios < lr_thres)[0]
+    probScore = [float(ScoreList[i])/NumInfoSites[i] for i in range(num_lines)]
+    overlapScore = [float(NumInfoSites[i])/NumMatSNPs for i in range(num_lines)]
+    probScore = np.array(probScore, dtype = float)
+    sorted_order = topHits[np.argsort(-probScore[topHits])]
+    (case, note) = CaseInterpreter(overlap, NumMatSNPs, topHits, probScore)
+    matches_dict = [(AccList[i], probScore[i], NumInfoSites[i], overlapScore[i]) for i in sorted_order]
+    topHitsDict = {'overlap': [overlap, NumMatSNPs], 'matches': matches_dict, 'interpretation':{'case': case, 'text': note}}
+    with open(outFile, "w") as out_stats:
+        out_stats.write(json.dumps(topHitsDict))
 
 def getHeterozygosity(snpGT, outFile='default'):
-    snpBinary = parseGT(snpGT)
+    snpBinary = parsers.parseGT(snpGT)
     numHets = len(np.where(snpBinary == 2)[0])
     if outFile != 'default':
         with open(outFile) as json_out:
@@ -177,45 +101,6 @@ def getHeterozygosity(snpGT, outFile='default'):
             out_stats.write(json.dumps(topHitsDict))
     return float(numHets)/len(snpGT)
 
-def parseInput(inFile, logDebug, outFile = "parser"):
-    if outFile == "parser" or not outFile:
-        outFile = inFile + ".snpmatch"
-    if os.path.isfile(inFile + ".snpmatch.npz"):
-        log.info("snpmatch parser dump found! loading %s", inFile + ".snpmatch.npz")
-        snps = np.load(inFile + ".snpmatch.npz")
-        (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp'])
-    else:
-        _,inType = os.path.splitext(inFile)
-        if inType == '.npz':
-            log.info("loading snpmatch parser file! %s", inFile)
-            snps = np.load(inFile)
-            (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp'])
-        else:
-            log.info('running snpmatch parser!')
-            if inType == '.vcf':
-                (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = readVcf(inFile, logDebug)
-            elif inType == '.bed':
-                (snpCHR, snpPOS, snpGT, snpWEI) = readBED(inFile, logDebug)
-                DPmean = "NA"
-            else:
-                die("input file type %s not supported" % inType)
-        log.info("creating snpmatch parser file: %s", outFile + '.npz')
-        np.savez(outFile, chr = snpCHR, pos = snpPOS, gt = snpGT, wei = snpWEI, dp = DPmean)
-    NumSNPs = len(snpCHR)
-    case = 0
-    note = "Sufficient number of SNPs"
-    if NumSNPs < snp_thres:
-        note = "Attention: low number of SNPs provided"
-        case = 1
-    snpst = np.unique(snpCHR, return_counts=True)
-    snpdict = dict(('Chr%s' % snpst[0][i], snpst[1][i]) for i in range(len(snpst[0])))
-    statdict = {"interpretation": {"case": case, "text": note}, "snps": snpdict, "num_of_snps": NumSNPs, "depth": DPmean}
-    statdict['percent_heterozygosity'] = getHeterozygosity(snpGT)
-    with open(outFile + ".stats.json", "w") as out_stats:
-        out_stats.write(json.dumps(statdict))
-    log.info("done!")
-    return (snpCHR, snpPOS, snpGT, snpWEI, DPmean)
-
 def genotyper(snpCHR, snpPOS, snpGT, snpWEI, DPmean, hdf5File, hdf5accFile, outFile):
     NumSNPs = len(snpCHR)
     log.info("loading database files")
@@ -228,7 +113,7 @@ def genotyper(snpCHR, snpPOS, snpGT, snpWEI, DPmean, hdf5File, hdf5accFile, outF
     NumMatSNPs = 0
     overlappedInds = np.zeros(0, dtype=int)
     chunk_size = 1000
-    for ind,echr in enumerate(parseChrName(GenotypeData.chrs)[0]):
+    for ind,echr in enumerate(parsers.parseChrName(GenotypeData.chrs)[0]):
         perchrTarPos = np.where(snpCHR == echr)[0]
         perchrtarSNPpos = snpPOS[perchrTarPos]
         log.info("Analysing positions in chromosome %s", echr)
@@ -252,10 +137,8 @@ def genotyper(snpCHR, snpPOS, snpGT, snpWEI, DPmean, hdf5File, hdf5accFile, outF
             tempScore1 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs1, dtype=int).T, matchedTarWei[j:j+chunk_size,1]).T, axis=0)
             tempScore2 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs2, dtype=int).T, matchedTarWei[j:j+chunk_size,2]).T, axis=0)
            ScoreList = ScoreList + tempScore0 + tempScore1 + tempScore2
-            if(len(TarGTs0[j:j+chunk_size]) > 1):
+            if(len(TarGTs0[j:j+chunk_size]) >= 1):
                 NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) # Number of informative sites
-            elif(len(TarGTs0[j:j+chunk_size]) == 1):
-                NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int)
     log.info("Done analysing %s positions", NumMatSNPs)
     log.info("writing score file!")
     overlap = float(NumMatSNPs)/NumSNPs
@@ -267,14 +150,14 @@
     return (ScoreList, NumInfoSites)
 
 def potatoGenotyper(args):
-    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
+    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
     log.info("running genotyper!")
     (ScoreList, NumInfoSites) = genotyper(snpCHR, snpPOS, snpGT, snpWEI, DPmean, args['hdf5File'], args['hdf5accFile'], args['outFile'])
     log.info("finished!")
 
 def pairwiseScore(inFile_1, inFile_2, logDebug, outFile):
-    (snpCHR1, snpPOS1, snpGT1, snpWEI1, DPmean1) = parseInput(inFile = inFile_1, logDebug = logDebug)
-    (snpCHR2, snpPOS2, snpGT2, snpWEI2, DPmean2) = parseInput(inFile = inFile_2, logDebug = logDebug)
+    (snpCHR1, snpPOS1, snpGT1, snpWEI1, DPmean1) = parsers.parseInput(inFile = inFile_1, logDebug = logDebug)
+    (snpCHR2, snpPOS2, snpGT2, snpWEI2, DPmean2) = parsers.parseInput(inFile = inFile_2, logDebug = logDebug)
     snpmatch_stats = {}
     unique_1, unique_2, common, scores = 0, 0, 0, 0
     chrs = np.union1d(snpCHR1, snpCHR2)
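
For orientation, here is a minimal usage sketch of the code paths this patch rearranges; it is illustrative only, and the file names (sample.vcf, db.hdf5, db.acc.hdf5) and the sample_scores output prefix are placeholders, not part of the patch. It chains the same two calls that potatoGenotyper performs internally.

    # Illustrative sketch assuming SNPmatch 2.0.0 with this patch applied (Python 2).
    from snpmatch.core import parsers, snpmatch

    # parsers.parseInput dispatches on the file extension (.vcf, .bed, or a cached
    # .npz dump), writes sample.vcf.snpmatch.npz plus a .stats.json summary next to
    # the input, and returns the parsed arrays used across the package.
    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile="sample.vcf", logDebug=False)

    # The parsed tuple then feeds the genotyper against the row-chunked (db.hdf5)
    # and column-chunked (db.acc.hdf5) SNP databases produced by the makedb command.
    (ScoreList, NumInfoSites) = snpmatch.genotyper(snpCHR, snpPOS, snpGT, snpWEI, DPmean,
                                                   "db.hdf5", "db.acc.hdf5", "sample_scores")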