add mvcf2consensus.py

6c7c63d9 · Kristian Ullrich · e62b5dd9 · 6c7c63d9
Commit 6c7c63d9 authored 8 years ago by Kristian Ullrich
--- a/scripts/mvcf2consensus.py
+++ b/scripts/mvcf2consensus.py
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+
+'''
+Author: Krisian Ullrich
+date: January 2017
+email: ullrich@evolbio.mpg.de
+License: MIT
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Kristian Ullrich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+This software relies on the following python packages:
+sys
+os
+re
+numpy https://pypi.python.org/pypi/numpy
+random
+argparse
+tabix https://pypi.python.org/pypi/pytabix
+gzip 
+itertools
+Bio https://github.com/biopython/DIST
+collections
+'''
+
+
+import sys
+import os
+import re
+import numpy as np
+import random
+import argparse
+import tabix
+import gzip
+import itertools
+from itertools import repeat
+from Bio import SeqIO
+import collections
+
+
+IUPACdict = {'A':'A','C':'C','G':'G','T':'T','-':'-','N':'N','.':'.','AG':'R','GA':'R','CT':'Y','TC':'Y','CG':'S',
+'GC':'S','AT':'W', 'TA': 'W', 'GT': 'K', 'TG': 'K', 'AC': 'M', 'CA': 'M','CGT':'B','CTG':'B','GCT':'B','GTC':'B',
+'TCG':'B','TGC':'B','AGT':'D','ATG':'D','GAT':'D','GTA':'D','TAG':'D','TGA':'D','ACT':'H','ATC':'H','CAT':'H',
+'CTA':'H','TAC':'H','TCA':'H','ACG':'V','AGC':'V','CAG':'V','CGA':'V','GAC':'V','GCA':'V','ACGT':'N','ACTG':'N',
+'AGCT':'N','AGTC':'N','ATCG':'N','ATGC':'N','CAGT':'N','CATG':'N','CGAT':'N','CGTA':'N','CTAG':'N','CTGA':'N',
+'GACT':'N','GATC':'N','GCAT':'N','GCTA':'N','GTAC':'N','GTCA':'N','TACG':'N','TAGC':'N','TCAG':'N','TCGA':'N',
+'TGAC':'N','TGCA':'N','A*':'N','C*':'N','G*':'N','T*':'N'}
+
+
+def getIUPAC(x):
+    return IUPACdict[x]
+
+
+def POINT2zero(x):
+    if x == '.':
+        return '0'
+    else:
+        return x
+
+
+def POINTNA2zero(x):
+    if x == '.':
+        return '0'
+    if x == 'NA':
+        return '0'
+    else:
+        return x
+
+
+def NA2zero(x):
+    if x == 'NA':
+        return int(0)
+    else:
+        return int(x)
+
+
+def nCk(n, k):
+    return len(list(itertools.combinations(range(n), k)))
+
+
+def evaluate_ref_alt_max(x,y):
+    if x[0]==0 and x[1]==0:
+        return './.'
+    if x[0]>0 and x[1]==0:
+        return str(y[0])+'/'+str(y[0])
+    if x[0]==0 and x[1]>0:
+        return str(y[1])+'/'+str(y[1])
+    if x[0]>0 and x[1]>0:
+        return str(y[0])+'/'+str(y[1])
+
+
+def get_header_from_vcf( argsDICT ):
+    HEADER = []
+    with open(argsDICT['ivcf'],'rb') as f:
+        for flines in f:
+           if flines[:2]=='##':
+               HEADER.append(flines)
+           if flines[:4]=='#CHR':
+               HEADER.append('##INFO=<ID=NCFnumber,Number=1,Type=String,Description="NotCalledFraction number">\n')
+               HEADER.append('##INFO=<ID=NCFfraction,Number=1,Type=Float,Description="NotCalledFraction fraction">\n')
+               HEADER.append('##mvcf2consensus.py '+argsDICT['args'][0]+'\n')
+               HEADER.append(changeHEADER(flines, argsDICT['add'], argsDICT['id'], argsDICT['samples']))
+           if flines[0]!='#':
+               break
+    return HEADER
+
+
+def get_header_from_gz( argsDICT ):
+    HEADER = []
+    with gzip.open(argsDICT['ivcf'],'rb') as f:
+        for flines in f:
+           if flines[:2]=='##':
+               HEADER.append(flines)
+           if flines[:4]=='#CHR':
+               HEADER.append('##INFO=<ID=NCFnumber,Number=1,Type=String,Description="NotCalledFraction number">\n')
+               HEADER.append('##INFO=<ID=NCFfraction,Number=1,Type=Float,Description="NotCalledFraction fraction">\n')
+               HEADER.append('##mvcf2consensus.py '+argsDICT['args'][0]+'\n')
+               HEADER.append(changeHEADER(flines, argsDICT['add'], argsDICT['id'], argsDICT['samples']))
+           if flines[0]!='#':
+               break
+    return HEADER
+
+
+def get_sample_pos_from_header( argsDICT ):
+    SAMPLE_POS = []
+    if argsDICT['tabix']:
+        with gzip.open(argsDICT['ivcf'],'rb') as f:
+            for flines in f:
+               if flines[:4]=='#CHR':
+                   flines_stripped = flines.strip().split('\t')
+               if flines[0]!='#':
+                   break
+    if not argsDICT['tabix']:
+        with open(argsDICT['ivcf'],'rb') as f:
+            for flines in f:
+                if flines[:4]=='#CHR':
+                    flines_stripped = flines.strip().split('\t')
+                if flines[0]!='#':
+                    break
+    for s in argsDICT['samples']:
+        SAMPLE_POS.append([x for x,y in enumerate(flines_stripped) if y == s][0])
+    return SAMPLE_POS
+
+
+def get_sample_pos_dict_from_header( argsDICT ):
+    sampleposDICT = {}
+    if argsDICT['tabix']:
+        with gzip.open(argsDICT['ivcf'],'rb') as f:
+            for flines in f:
+               if flines[:4]=='#CHR':
+                   flines_stripped = flines.strip().split('\t')
+               if flines[0]!='#':
+                   break
+    if not argsDICT['tabix']:
+        with open(argsDICT['ivcf'],'rb') as f:
+            for flines in f:
+                if flines[:4]=='#CHR':
+                    flines_stripped = flines.strip().split('\t')
+                if flines[0]!='#':
+                    break
+    for s in argsDICT['samples']:
+        sampleposDICT[s]=[x for x,y in enumerate(flines_stripped) if y == s][0]
+    return sampleposDICT
+
+
+
+def changeHEADER(header, add, name, samples):
+    if add:
+        headersplit = header.strip().split('\tFORMAT')
+        return('\t'.join([headersplit[0]]+['FORMAT']+samples+[name])+'\n')
+    if not add:
+        headersplit = header.strip().split('\tFORMAT')
+        return('\t'.join([headersplit[0]]+['FORMAT']+[name])+'\n')
+
+
+def get_missing_type(x):
+    if x == 'keep':
+        return ['keep', 0, 0]
+    if x == 'zero':
+        return ['zero', 0, 0]
+    if 'set' in x:
+        return ['set', int(x.split(':')[1]), int(x.split(':')[2])]
+
+
+def get_reference_len( argsDICT ):
+    print 'start obtaining reference length'
+    length_dict = {}
+    for seq in SeqIO.parse( argsDICT['R'], 'fasta' ):
+        if argsDICT['verbose']:
+            print seq.id
+        if seq.id not in argsDICT['chr']:
+            continue
+        if seq.id in argsDICT['chr']:
+            length_dict[seq.id] = len( seq )
+    print 'finished obtaining reference length'    
+    return length_dict
+
+
+class vcfRecord:
+    
+    
+    def __init__(self, record, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLES, tabix):
+        if not tabix:
+            self.splitted = record.strip().split('\t')
+        if tabix:
+            self.splitted = record
+        self.CHROM = self.splitted[CHROM]
+        self.POS = self.splitted[POS]
+        self.ID = self.splitted[ID]
+        self.REF = [self.splitted[REF]]
+        self.ALT = self.splitted[ALT].split(',')
+        self.QUAL = self.splitted[QUAL]
+        self.FILTER = self.splitted[FILTER]
+        self.INFO = self.splitted[INFO]
+        self.FORMAT = self.splitted[FORMAT].split(':')
+        self.SAMPLES = collections.OrderedDict()
+        for s in SAMPLES:
+            self.SAMPLES[s] = collections.OrderedDict()
+            if len(self.splitted[s].split(':')) != len(self.FORMAT):
+                self.splitted[s]+=':'+':'.join(list(np.repeat("0",len(self.FORMAT)-len(self.splitted[s].split(':')))))
+            for p,f in enumerate(self.FORMAT):
+                self.SAMPLES[s][f] = self.splitted[s].split(':')[p].split(',')
+    
+    
+    def get_var_len(self):
+        """" Return length of alleles """
+        var_len = [len(self.REF)]
+        var_len += [len(x) for x in self.ALT]
+        return var_len
+    
+    
+    def get_var_number(self):
+        """ Return total number of alleles """
+        return len([self.REF])+len(self.ALT)
+    
+    
+    def get_ref_alt(self):
+        """ Return alleles """
+        REF_ALT = self.REF+self.ALT
+        return REF_ALT
+    
+    
+    def is_filtered(self, argsDICT):
+        """ Return True if FILTER of record is in filterarg """
+        if len(argsDICT['filterarg'])!=1:
+            if self.FILTER not in argsDICT['filterarg']:
+                return True
+            if self.FILTER in argsDICT['filterarg']:
+                return False
+        if len(argsDICT['filterarg']) == 1 and argsDICT['filterarg'][0]!='':
+            if self.FILTER not in argsDICT['filterarg']:
+                return True
+            if self.FILTER in argsDICT['filterarg']:
+                return False
+        if len(argsDICT['filterarg']) == 1 and argsDICT['filterarg'][0] == '':
+            return False
+    
+    
+    def get_var_type(self):
+        """ Return variant type according to GATK """
+        var_type = 'NA'
+        var_len = self.get_var_len()
+        var_number = self.get_var_number()
+        if len(self.ALT) == 1 and self.ALT[0] == '.':
+            var_type = 'noVAR'
+        if len(self.ALT) == 1 and self.ALT[0] == '*':
+            var_type = 'complexVAR'
+        if var_number == 2:
+            if var_len[0] == 1 and var_len[1] == 1:
+                #equals GATK SNP
+                var_type = 'biSNP'
+            if var_len[0] > 1 and var_len[1] > 1 and var_len[0] == var_len[1]:
+                #equals GATK MNP
+                var_type = 'repSNP'
+            if var_len[0] > var_len[1]:
+                var_type = 'deletion'
+            if var_len[0] < var_len[1]:
+                var_type = 'insertion'
+        #equals GATK MULTI-ALLELIC
+        if var_number != 2:
+            if all( [x == var_len[0] for x in var_len[1::]] ) and all( [x == 1 for x in var_len] ):
+                #equals GATK SNP
+                var_type = 'muSNP'
+            #TODO 'murepSNP'
+            if all( [x < var_len[0] for x in var_len[1::]] ):
+                var_type = 'mudeletion'
+            if all( [x > var_len[0] for x in var_len[1::]] ):
+                var_type = 'muinsertion'
+            if any( [x < var_len[0] for x in var_len[1::]] ) and any( [x == var_len[0] for x in var_len[1::]] ):
+                var_type = 'complexDS'
+            if any( [x > var_len[0] for x in var_len[1::]] ) and any( [x == var_len[0] for x in var_len[1::]] ):
+                var_type = 'complexIS'
+            if any( [x < var_len[0] for x in var_len[1::]] ) and any( [x > var_len[0] for x in var_len[1::]] ):
+                var_type = 'complexDI'
+            if any( [x < var_len[0] for x in var_len[1::]] ) and any( [x > var_len[0] for x in var_len[1::]] ) and any( [x == var_len[0] for x in var_len[1::]] ):
+                var_type = 'complexDIS'
+        return var_type
+    
+    
+    def keep_samples(self, argsDICT):
+        """ Keep only samples in record which are listed """
+        for s in self.SAMPLES.copy():
+            if s not in argsDICT['samples_pos']:
+                self.SAMPLES.pop(s)
+    
+    
+    def get_info(self, value):
+        """" Return INFO value """
+        return [x[1] for x in [x.split('=') for x in self.INFO.split(';')] if x[0] == value][0]
+    
+    
+    def add_info(self, value):
+        """ Add INFO value """
+        self.INFO+=str(value)
+    
+    
+    def add_format_on_samples(self, s, f, v):
+        """ Add new FORMAT field to sample """
+        self.SAMPLES[s][f] = v
+    
+    
+    def replace_format_on_samples(self, s, f, v):
+        """ Replace FORMAT field of sample """ 
+        self.SAMPLES[s][f] = v
+    
+    
+    def get_samples_ad(self):
+        """ Return samples AD as list """
+        return [self.SAMPLES[x]['AD'] for x in self.SAMPLES]
+    
+    
+    def get_samples_gt(self):
+        """ Return samples GT as list """
+        return [self.SAMPLES[x]['GT'] for x in self.SAMPLES]
+    
+    
+    def get_samples_gq(self):
+        """ Return samples GQ as list """
+        return [self.SAMPLES[x]['GQ'] for x in self.SAMPLES]
+    
+    
+    def get_samples_dp(self):
+        """ Return samples GQ as list """
+        return [self.SAMPLES[x]['DP'] for x in self.SAMPLES]
+    
+    
+    def get_samples_pl(self):
+        """ Return samples GQ as list """
+        return [self.SAMPLES[x]['PL'] for x in self.SAMPLES]
+    
+    
+    def get_samples_ad_len(self):
+        """ Return samples AD len as list """
+        return [len(self.SAMPLES[x]['AD']) for x in self.SAMPLES]
+    
+    
+    def get_samples_ad_ref(self):
+        """ Return samples AD REF as list """
+        return [self.SAMPLES[x]['AD'][0] for x in self.SAMPLES]
+    
+    
+    def get_samples_ad_alt(self):
+        """ Return samples AD ALT as list """
+        return [self.SAMPLES[x]['AD'][1:] for x in self.SAMPLES]
+    
+    
+    def gt_type(self, x):
+        if len(x)==1:
+            if x[0] == '.':
+                return 'NA'
+            if x[0] != '.':
+                return 'hap'
+        if x[0] == '.' and x[1] == '.':
+            return 'NA'
+        if x[0] != '.' and x[1] != '.' and x[0] == x[1]:
+            return 'hom'
+        if x[0] != '.' and x[1] != '.' and x[0] != x[1]:
+            return 'het'
+    
+    
+    def get_gt_type(self):
+        """ Return GT type """
+        gt_split = re.compile("[|/]")
+        GTs = [re.split(gt_split,x[0]) for x in self.get_samples_gt()]
+        return [self.gt_type(x) for x in GTs]
+    
+    
+    def get_gt_na_pos(self):
+        """ Return index of GT which is NA """
+        samplesgttype = self.get_gt_type()
+        samplesgtnapos = [x for x,y in enumerate(samplesgttype) if y == 'NA']
+        return samplesgtnapos
+    
+    
+    def fill_empty_ad(self):
+        """ Replaces "." with 0 and fills to var_number if less """
+        x_ = [x+list(np.repeat(0,self.get_var_number()-len(x))) for x in [[int(x.replace(".","0")) for x in y] for y in self.get_samples_ad()]]
+        for x,y in zip(x_, self.SAMPLES.keys()):
+            self.SAMPLES[y]['AD'] = x
+     
+    
+    def fill_empty_pl(self):
+        """ Replaces "." with [0,0,0] """
+        x_ = [x+list(np.repeat(0,nCk(self.get_var_number()+1,2)-len(x))) for x in [[int(x.replace(".","0")) for x in y] for y in self.get_samples_pl()]]
+        for x,y in zip(x_, self.SAMPLES.keys()):
+            self.SAMPLES[y]['PL'] = x
+    
+    
+    def set_samples_ad_by_gt(self, argsDICT):
+        if argsDICT['missingGT'][0] == 'keep':
+            return
+        if argsDICT['missingGT'][0] == 'zero':
+            samplesgtnapos = slef.get_gt_na_pos()
+            #set AD to zero for all samples with GT equals NA
+            for s in [x.sample for y,x in enumerate(self.samples) if y in samplesgtnapos]:
+                replace_format_on_samples(self, s, 'AD', list(np.repeat(0, get_var_number(self))))
+        if argsDICT['missingGT'][0] == 'set':
+            samplesgtnapos = self.get_gt_na_pos()
+            #set AD to zero for all samples with GT equals NA
+            for s in self.get_gt_na_pos():
+                self.replace_format_on_samples(self.SAMPLES.keys()[s], 'AD', [argsDICT['missingGT'][1]]+list(np.repeat(argsDICT['missingGT'][2], self.get_var_number()-1)))
+    
+    
+    def set_dp_by_ad(self):
+        samplesad = self.get_samples_ad()
+        samplesadsum = [np.sum([int(x) for x in x]) for x in samplesad]
+        #set DP to ADSUM
+        for i in range(0, len(self.SAMPLES)):
+            self.replace_format_on_samples(self.SAMPLES.keys()[i], 'DP', samplesadsum[i])
+    
+    
+    def set_samples_gt_by_ad(self, argsDICT):
+        if argsDICT['resetGT'] == 'keep':
+            return
+        if argsDICT['resetGT'] == 'AD':
+            #setGTbyAD
+            samplesad_ref = self.get_samples_ad_ref()
+            samplesad_alt = self.get_samples_ad_alt()
+            #NOTE np.argmax return the first element for equal values which would bias GT setting always to the first ALT entry if equal depth exist
+            #NOTE this is only used to determine NotCalledFraction, AD for other alleles will be considered
+            samplesad_alt_maxpos = [np.argmax(x) for x in samplesad_alt]
+            samplesad_alt_max = [np.max(x) for x in samplesad_alt]
+            samplesad_ref_alt_max = map(lambda a, b: [a,b], samplesad_ref, samplesad_alt_max)
+            samplesad_ref_alt_maxpos = map(lambda a, b: [a,b], list(np.repeat(0,len(samplesad_ref))), [x+1 for x in samplesad_alt_maxpos])
+            samplesgt = []
+            for a, b in zip(samplesad_ref_alt_max, samplesad_ref_alt_maxpos):
+                samplesgt.append(evaluate_ref_alt_max(a, b))
+            #get NotCalled positions
+            minDP_NCpos = [x for x,y in enumerate(self.get_samples_dp()) if y < argsDICT['minDP']]
+            maxDP_NCpos = [x for x,y in enumerate(self.get_samples_dp()) if y > argsDICT['maxDP']]
+            minGQ_NCpos = [x for x,y in enumerate(self.get_samples_gq()) if y < argsDICT['minGQ']]
+            #combine and unique NotCalled positions
+            NCpos = [x for x in set(minDP_NCpos+maxDP_NCpos+minGQ_NCpos)]
+            for x in NCpos:
+                samplesgt[x]='./.'
+            for i in range(0, len(self.SAMPLES)):
+                self.replace_format_on_samples(self.SAMPLES.keys()[i], 'GT', samplesgt[i])
+    
+    
+    def get_consensus_calldata(self, argsDICT):
+        """ Return consensus of existing record samples """
+        consensusdict = collections.OrderedDict()
+        for rf in self.FORMAT:
+            consensusdict[rf] = None
+        consensusdict['AD'] = self.merge_ad()
+        consensusdict['DP'] = np.sum(consensusdict['AD'])
+        consensusdict['GQ'] = self.merge_gq()
+        #evaluate CONSENSUSPL from SAMPLESPL AND SAMPLESGTNAPOS
+        consensusdict['PL'] = self.merge_pl()
+        consensusdict['GT'] = self.set_consensus_gt_by_ad(consensusdict)
+        return consensusdict
+    
+    
+    def set_consensus_gt_by_ad(self, consensusdict):
+        consensusad_ref = consensusdict['AD'][0]
+        consensusad_alt = consensusdict['AD'][1:]
+        #NOTE np.argmax return the first element for equal values which would bias GT setting always to the first ALT entry if equal depth exist
+        consensusad_alt_maxpos = np.argmax(consensusad_alt)
+        consensusad_alt_max = np.max(consensusad_alt)
+        consensusad_ref_alt_max = [consensusad_ref, consensusad_alt_max]
+        consensusad_ref_alt_maxpos = [0, consensusad_alt_maxpos+1]
+        consensusgt = evaluate_ref_alt_max(consensusad_ref_alt_max, consensusad_ref_alt_maxpos)
+        return consensusgt
+    
+    
+    def merge_ad(self):
+        ADOUT = []
+        for i in range(0, self.get_var_number()):
+            ADOUT.append(np.sum([int(y[i]) for x,y in enumerate(self.get_samples_ad()) if x not in self.get_gt_na_pos()]))
+        return ADOUT
+    
+    
+    def merge_gq(self):
+        return int(np.median([[0 if z == "." else int(z) for z in y] for x,y in enumerate(self.get_samples_gq()) if x not in self.get_gt_na_pos()]))
+    
+    
+    def merge_dp(self):
+        return np.sum([y for x,y in enumerate(self.get_samples_dp()) if x not in self.get_gt_na_pos()])
+    
+    
+    def merge_pl(self):
+        PLOUT = []
+        for i in range(0,nCk(self.get_var_number()+1,2)):
+            PLOUT.append(int(np.median([y[i] for x,y in enumerate(self.get_samples_pl()) if x not in self.get_gt_na_pos()])))
+        return PLOUT
+    
+    
+    def get_alleles_muSNP(self):
+        MAJOUT = list(np.repeat('N',len(self.SAMPLES)))
+        MINOUT = list(np.repeat('N',len(self.SAMPLES)))
+        IUPACOUT = list(np.repeat('N',len(self.SAMPLES)))
+        for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
+            MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _muSNP(self, sample_, 'refmajorsample')
+        return [MAJOUT,MINOUT,IUPACOUT]
+    
+    def get_alleles_biSNP(self):
+        MAJOUT = list(np.repeat('N',len(self.SAMPLES)))
+        MINOUT = list(np.repeat('N',len(self.SAMPLES)))
+        IUPACOUT = list(np.repeat('N',len(self.SAMPLES)))
+        for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
+            MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _biSNP(self, sample_, 'refmajorsample')
+        return [MAJOUT,MINOUT,IUPACOUT]
+    
+    
+    def get_alleles_repSNP(self):
+        MAJOUT = list(np.repeat('N'*len(get_ref_alt(self)[0]),len(self.SAMPLES)))
+        MINOUT = list(np.repeat('N'*len(get_ref_alt(self)[0]),len(self.SAMPLES)))
+        IUPACOUT = list(np.repeat('N'*len(get_ref_alt(self)[0]),len(self.SAMPLES)))
+        for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
+            MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _repSNP(self, sample_, 'refmajorsample')
+        return [MAJOUT,MINOUT,IUPACOUT]
+
+def _biSNP(self, sample_pos, type_):
+    if sample_pos not in self.SAMPLES.keys():
+        return
+    samplesad_ = self.SAMPLES[sample_pos]['AD']
+    ref_alt_ = self.get_ref_alt()
+    ref_ = ref_alt_[0]
+    alt_ = ref_alt_[1:]
+    ref_out = 'N'
+    alt_out = 'N'
+    iupac_out = 'N'
+    samplesad_ = [int(x) for x in samplesad_]
+    ref_alt_nonzero = [ref_alt_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
+    samplesad_nonzero = [samplesad_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
+    if len(samplesad_nonzero) == 0:
+        return [ref_out, alt_out, iupac_out]
+    admax = np.max([int(x) for x in samplesad_nonzero])
+    admin = np.min([int(x) for x in samplesad_nonzero])
+    admaxpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admax]
+    adminpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]!=admax]
+    if type_ == 'refaltN':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2:
+            return [ref_out, alt_out, iupac_out]
+    if type_ == 'refaltsample':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2:
+            ref_out = random.sample(ref_alt_nonzero,1)[0]
+            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+    if type_ == 'refmajorsample':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
+            ref_out = ref_alt_nonzero[admaxpos[0]]
+            alt_out = ref_alt_nonzero[adminpos[0]]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
+            ref_out = random.sample(ref_alt_nonzero,1)[0]
+            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+    if type_ == 'refminorsample':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
+            ref_out = ref_alt_nonzero[adminpos[0]]
+            alt_out = ref_alt_nonzero[admaxpos[0]]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
+            ref_out = random.sample(ref_alt_nonzero,1)[0]
+            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+    if type_ == 'iupac':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2:
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+
+
+def _muSNP(self, sample_pos, type_):
+    if sample_pos not in self.SAMPLES.keys():
+        return
+    samplesad_ = self.SAMPLES[sample_pos]['AD']
+    ref_alt_ = self.get_ref_alt()
+    ref_ = ref_alt_[0]
+    alt_ = ref_alt_[1:]
+    ref_out = 'N'
+    alt_out = 'N'
+    iupac_out = 'N'
+    samplesad_ = [int(x) for x in samplesad_]
+    ref_alt_nonzero = [ref_alt_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
+    samplesad_nonzero = [samplesad_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
+    if len(samplesad_nonzero) == 0:
+        return [ref_out, alt_out, iupac_out]
+    admax = np.max([int(x) for x in samplesad_nonzero])
+    admin = np.min([int(x) for x in samplesad_nonzero])
+    admaxpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admax]
+    adminpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]!=admax]
+    if type_ == 'refaltN':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)>1:
+            return [ref_out, alt_out, iupac_out]
+    if type_ == 'refaltsample':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2:
+            ref_out = random.sample(ref_alt_nonzero,1)[0]
+            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)>2:
+            ref_out = random.sample(ref_alt_nonzero,1)[0]
+            alt_out = random.sample([x for x in ref_alt_nonzero if x != ref_out],1)[0]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+    if type_ == 'refmajorsample':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
+            ref_out = ref_alt_nonzero[admaxpos[0]]
+            alt_out = ref_alt_nonzero[adminpos[0]]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
+            ref_out = random.sample(ref_alt_nonzero,1)[0]
+            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) != 1:
+            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
+            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
+            ref_out = random.sample(max_nonzero,1)[0]
+            alt_out = random.sample(min_nonzero,1)[0]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) == 1:
+            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
+            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
+            ref_out = random.sample(max_nonzero,1)[0]
+            alt_out = random.sample([x for x in max_nonzero if x != ref_out],1)[0]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+    if type_ == 'refminorsample':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
+            ref_out = ref_alt_nonzero[adminpos[0]]
+            alt_out = ref_alt_nonzero[admaxpos[0]]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
+            ref_out = random.sample(ref_alt_nonzero,1)[0]
+            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) != 1:
+            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
+            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
+            ref_out = random.sample(max_nonzero,1)[0]
+            alt_out = random.sample(min_nonzero,1)[0]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) == 1:
+            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
+            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
+            ref_out = random.sample(min_nonzero,1)[0]
+            alt_out = random.sample([x for x in min_nonzero if x != ref_out],1)[0]
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+    if type_ == 'iupac':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)>1:
+            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
+            return [ref_out, alt_out, iupac_out]
+
+
+def _repSNP(self, sample_pos, type_):
+    if sample_pos not in self.SAMPLES.keys():
+        return
+    samplesad_ = self.SAMPLES[sample_pos]['AD']
+    ref_alt_ = self.get_ref_alt()
+    ref_ = ref_alt_[0]
+    alt_ = ref_alt_[1:]
+    ref_out = ''.join(list(repeat('N',len(ref_))))
+    alt_out = ''.join(list(repeat('N',len(ref_))))
+    iupac_out = ''.join(list(repeat('N',len(ref_))))
+    samplesad_ = [int(x) for x in samplesad_]
+    ref_alt_nonzero = [ref_alt_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
+    samplesad_nonzero = [samplesad_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
+    if len(samplesad_nonzero) == 0:
+        return [ref_out, alt_out, iupac_out]
+    admax = np.max([int(x) for x in samplesad_nonzero])
+    admin = np.min([int(x) for x in samplesad_nonzero])
+    admaxpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admax]
+    adminpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admin]
+    if type_ == 'refaltN':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2:
+            return [ref_out, alt_out, iupac_out]
+    if type_ == 'refaltsample':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2:
+            ref_out = random.sample(ref_alt_nonzero,1)[0]
+            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
+            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
+            return [ref_out, alt_out, iupac_out]
+    if type_ == 'refmajorsample':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
+            ref_out = ref_alt_nonzero[admaxpos[0]]
+            alt_out = ref_alt_nonzero[adminpos[0]]
+            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
+            ref_out = random.sample(ref_alt_nonzero,1)[0]
+            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
+            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
+            return [ref_out, alt_out, iupac_out]
+    if type_ == 'refminorsample':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
+            ref_out = ref_alt_nonzero[adminpos[0]]
+            alt_out = ref_alt_nonzero[admaxpos[0]]
+            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
+            ref_out = random.sample(ref_alt_nonzero,1)[0]
+            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
+            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
+            return [ref_out, alt_out, iupac_out]
+    if type_ == 'iupac':
+        if len(ref_alt_nonzero)==1:
+            ref_out = ref_alt_nonzero[0]
+            alt_out = ref_alt_nonzero[0]
+            iupac_out = ref_alt_nonzero[0]
+            return [ref_out, alt_out, iupac_out]
+        if len(ref_alt_nonzero)==2:
+            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
+            return [ref_out, alt_out, iupac_out]
+
+
+
+def parse_record(self, argsDICT, polytypeDICT, filterDICT, tsvouthandle, vcfouthandle):
+    """ Parse each record and writes into outhandle if variant is kept after filtering """
+    REF_ALT = self.get_ref_alt()
+    POLYTYPE = self.get_var_type()
+    POLYNUMBER = self.get_var_number()
+    POLYLEN = self.get_var_len()
+    if POLYTYPE in polytypeDICT:
+        polytypeDICT[POLYTYPE]+=1
+    if POLYTYPE not in polytypeDICT:
+        polytypeDICT[POLYTYPE]=1
+    #0. REDUCE record TO SAMPLES and ADD mandatory FORMAT TO SAMPLES [GT:AD:DP]:GQ:PL
+    self.keep_samples(argsDICT)
+    if 'GQ' not in self.FORMAT:
+        for s in argsDICT['samples_pos']:
+            self.add_format_on_samples(s, 'GQ', 0)
+        self.FORMAT.append('GQ')
+    if 'PL' not in self.FORMAT:
+        for s in argsDICT['samples_pos']:
+            self.add_format_on_samples(s, 'PL', list(np.repeat(0,nCk(self.get_var_number()+1,2))))
+        self.FORMAT.append('PL')
+    #1. FILTER BY filterarg
+    if self.is_filtered(argsDICT):
+        if argsDICT['verbose']:
+            print self
+        filterDICT['filterarg']+=1
+        return
+    #2. FILTER BY minMQ
+    MQ = self.get_info('MQ')
+    if MQ == ".":
+        MQ = 0.0
+    if float(MQ) < argsDICT['minMQ']:
+        if argsDICT['verbose']:
+            print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
+        filterDICT['MQ']+=1
+        return
+    #3. GET AD
+    #fill empty AD
+    self.fill_empty_ad()
+    SAMPLESAD = self.get_samples_ad()
+    SAMPLESADLEN = self.get_samples_ad_len()
+    if not all([x==POLYNUMBER for x in SAMPLESADLEN]):
+        if argsDICT['verbose']:
+            print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
+        sys.exit('\nSAMPLES AD length differs from total POLYNUMBER')
+    #4. SET AD BY MISSING GT OPTION
+    SAMPLESGT = self.get_samples_gt()
+    SAMPLESGQ = self.get_samples_gq()
+    self.set_samples_ad_by_gt(argsDICT)
+    SAMPLESAD = self.get_samples_ad()
+    #set dp based on altered ad
+    self.set_dp_by_ad()
+    #5. KEEP OR RESET GT USING THE DP OPTION AND AD INFORMATION
+    SAMPLESDP = self.get_samples_dp()
+    self.set_samples_gt_by_ad(argsDICT)
+    #6. FILTER BY NotCalledFraction
+    SAMPLESGTTYPE = self.get_gt_type()
+    SAMPLESGTNAPOS = self.get_gt_na_pos()
+    NOTCALLED = len(SAMPLESGTNAPOS)
+    NOTCALLEDOUT = str(NOTCALLED)+'/'+str(len(self.SAMPLES))
+    self.add_info(';NCFnumber='+str(NOTCALLEDOUT))
+    NOTCALLEDFRACTION = float(NOTCALLED)/float(len(self.SAMPLES))
+    self.add_info(';NCFfraction='+str(NOTCALLEDFRACTION))
+    if NOTCALLEDFRACTION >= argsDICT['ncf']:
+        if argsDICT['verbose']:
+            print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
+        filterDICT['NCF']+=1
+        return
+    #7. GET CONSENSUS AND EVALUATE CDP OPTION
+    #fill empty PL
+    self.fill_empty_pl()    
+    CONSENSUS_CALLDATA = self.get_consensus_calldata(argsDICT)
+    if CONSENSUS_CALLDATA['DP'] < argsDICT['cdp']:
+        if argsDICT['verbose']:
+            print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
+        filterDICT['CDP']+=1
+        return
+    if argsDICT['add']:
+        self.SAMPLES[argsDICT['id']] = CONSENSUS_CALLDATA
+    if not argsDICT['add']:
+        self.SAMPLES = collections.OrderedDict()
+        self.SAMPLES[argsDICT['id']] = CONSENSUS_CALLDATA
+    VCFLINE = '\t'.join([self.CHROM, self.POS, self.ID, self.REF[0], ','.join(self.ALT), self.QUAL, self.FILTER, self.INFO, ':'.join(self.FORMAT)])
+    SAMPLES_LINE = []
+    for s in self.SAMPLES.keys():
+        SAMPLE_LINE = []
+        for k in self.SAMPLES[s].keys():
+            if isinstance(self.SAMPLES[s][k], list):
+                SAMPLE_LINE.append(','.join([str(x) for x in self.SAMPLES[s][k]]))
+            elif self.SAMPLES[s][k] is None:
+                SAMPLE_LINE.append(".")
+            else:
+                SAMPLE_LINE.append(str(self.SAMPLES[s][k]))
+        SAMPLES_LINE.append(':'.join(SAMPLE_LINE))
+    vcfouthandle.write(VCFLINE+'\t'+'\t'.join(SAMPLES_LINE)+'\n')
+    CONSENSUSOUT = SAMPLES_LINE[-1]
+    if POLYTYPE == 'repSNP':
+        if argsDICT['add']:
+            CONSENSUSMAJ, CONSENSUSMIN, CONSENSUSIUPAC = self.get_alleles_repSNP()
+            CONSENSUSMAJ = CONSENSUSMAJ[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
+            CONSENSUSMIN = CONSENSUSMIN[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
+            CONSENSUSIUPAC = CONSENSUSIUPAC[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
+        if not argsDICT['add']:
+            CONSENSUSMAJ, CONSENSUSMIN, CONSENSUSIUPAC = self.get_alleles_repSNP()
+            CONSENSUSMAJ = CONSENSUSMAJ[0]
+            CONSENSUSMIN = CONSENSUSMIN[0]
+            CONSENSUSIUPAC = CONSENSUSIUPAC[0]
+    if POLYTYPE == 'biSNP':
+        if argsDICT['add']:
+            CONSENSUSMAJ, CONSENSUSMIN, CONSENSUSIUPAC = self.get_alleles_biSNP()
+            CONSENSUSMAJ = CONSENSUSMAJ[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
+            CONSENSUSMIN = CONSENSUSMIN[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
+            CONSENSUSIUPAC = CONSENSUSIUPAC[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
+        if not argsDICT['add']:
+            CONSENSUSMAJ, CONSENSUSMIN, CONSENSUSIUPAC = self.get_alleles_biSNP()
+            CONSENSUSMAJ = CONSENSUSMAJ[0]
+            CONSENSUSMIN = CONSENSUSMIN[0]
+            CONSENSUSIUPAC = CONSENSUSIUPAC[0]
+    if POLYTYPE == 'muSNP':
+        if argsDICT['add']:
+            CONSENSUSMAJ, CONSENSUSMIN, CONSENSUSIUPAC = self.get_alleles_muSNP()
+            CONSENSUSMAJ = CONSENSUSMAJ[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
+            CONSENSUSMIN = CONSENSUSMIN[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
+            CONSENSUSIUPAC = CONSENSUSIUPAC[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
+        if not argsDICT['add']:
+            CONSENSUSMAJ, CONSENSUSMIN, CONSENSUSIUPAC = self.get_alleles_muSNP()
+            CONSENSUSMAJ = CONSENSUSMAJ[0]
+            CONSENSUSMIN = CONSENSUSMIN[0]
+            CONSENSUSIUPAC = CONSENSUSIUPAC[0]
+    if POLYTYPE != 'repSNP' and POLYTYPE != 'biSNP' and POLYTYPE != 'muSNP':
+        print POLYTYPE
+        sys.exit('\nPOLYTYPE not considered for CONSENSUS\nPlease reduce to SNP and MNP')
+    #write to tsv
+    TSVLINE = '\t'.join([self.CHROM,str(self.POS),NOTCALLEDOUT,CONSENSUSOUT,CONSENSUSMAJ,CONSENSUSMIN,CONSENSUSIUPAC])+'\n'
+    tsvouthandle.write(TSVLINE)
+    filterDICT['passed']+=1
+
+
+def main():
+    """ main """
+    parser = argparse.ArgumentParser(usage='%(prog)s [options]',
+                                     description='Given a multiple sample vcf file reduced to SNP and MNP and the reference fasta file, ' \
+                                                 'this script merges the samples into a consensus. ' \
+                                                 'Major, minor and IUPAC code of the consensus will be written to a tab separated file.'
+                                                 '\nTo extract a subpopulations from e.g. GATK multiple vcf file use the following command from GATK: ' \
+                                                 'java -jar GenomeAnalysisTK.jar -T SelectVariants -sn $SAMPLE1 -sn $SAMPLE2 -V $MULTISAMPLEVCF -R $REFERENCE ' \
+                                                 '[-trimAlternates : removeUnusedAlternates] ' \
+                                                 '[-env : excludeNonVariants] ' \
+                                                 '[-ef : excludeFiltered] ' \
+                                                 '-selectType SNP -selectType MNP -o $OUTPUT ' \
+                                                 '\nor vcftools: ' \
+                                                 'vcftools --vcf $MULTISAMPLEVCF [--gvcf] --out $OUTPUT --recode --indv $SAMPLE1 --indc $SAMPLE2 ' \
+                                                 '[--minGQ : filter for genotype quality] ' \
+                                                 '[--minDP : filter for depth] ' \
+                                                 '--remove-indels ')
+    parser.add_argument('-v', '--verbose', help='increase output verbosity', action='store_true')
+    #parser.add_argument('-tsv', help='specify if tsv should be written [default: False]', action='store_true')
+    parser.add_argument('-tabix', help='specify if input is tabix file. ' \
+                                       'NOTE: if set to true reference file needs to be specified to get length of each entry [default: False]'
+                                       , action='store_true')
+    parser.add_argument('-R', help='specify reference fasta file. Only needed if [tabix] option')
+    parser.add_argument('-ivcf', help='specify input vcf file')
+    parser.add_argument('-o', help='specify output prefix')
+    parser.add_argument('-H', help='specify if HEADER of vcf file should be ommitted [default: False]', action='store_true')
+    parser.add_argument('-chr', help='secify chromosomes to be parsed as a comma-seperated list')
+    parser.add_argument('-filterarg', help='specify FILTER arguments to keep variants, all variants without these will be discarded. ' \
+                                           'use a comma-seperated list e.g. PASS,VQSRTTranchesSNP90. ' \
+                                           'By default all variants are retained [default: ""]', default='')
+    parser.add_argument('-samples', help='specify sample names to be merged as a comma-seperated list, they need to exactly match sample ids from vcf input')
+    parser.add_argument('-add', help='specify if CONSENSUS should be added as a new column to existing vcf file [default: False]', action='store_true')
+    parser.add_argument('-id', help='specify CONSENSUS name to be added to HEADER')
+    parser.add_argument('-minMQ', help='specify mapping quality threshold necessary to exceeded to retain variant [default: "0.0"]', type=float, default=0.0)
+    parser.add_argument('-missingGT', help='specify how missing GT data (based on e.g. calling SNPs with GATK on multiple samples simultaneously; GT: ./.) ' \
+                                         'should be handled, either keep AD coverage [default: "keep"], set AD to 0 for all samples which have GT: ./. for all alleles ["zero"], ' \
+                                         'set REF allele to certain value and ALT alleles to certain value for all samples that have GT: ./. ["set:value:value"] e.g. ref:10:10 ' \
+                                         'NOTE: there might be cases where the VariantCaller has defined GT: ./. however AD still shows counts AD: 3,0 ' \
+                                         'here one might not want to keep this variant as NotCalled (GT: ./.) and use the AD information to reset GT based on AD', default='keep')
+    parser.add_argument('-resetGT', help='specify how GT should be reset, either reset using AD information [default: "AD"] or keep as defined by VariantCaller ["keep"] ' \
+                                         'NOTE: the DP value needs to be exceeded otherwise the sample will be set to ./. ' \
+                                         'NOTE: this will influence the calculation of the NotCalledFraction', default='AD', choices=['AD','keep'])
+    parser.add_argument('-minDP', help='specify minimal sample depth threshold to set position to be NotCalled during GT reset [default: "0"]', type=int, default=0)
+    parser.add_argument('-maxDP', help='specify maximal sample depth threshold to set position to be NotCalled during GT reset [default: "9999999"]', type=int, default=9999999)
+    parser.add_argument('-minGQ', help='specify minimal genotype quality threshold to set position to Called during GT reset [default: "0"]', type=int, default=0)
+    parser.add_argument('-ncf', help='specify NotCalledFraction to exclude SNP/MNP [default: "1.0"] for consensus calculation', type=float, default=1.0)
+    parser.add_argument('-cdp', help='specify consensus depth threshold necessary to exceeded to retain SNP/MNP [default: "5"]' \
+                                     'NOTE: only Called samples will be considered after GT reset', type=int, default=5)
+    parser.add_argument('-po', help='specify if progress should not be printed based on variant number processed [default: False]', action='store_true')
+    parser.add_argument('-ps', help='specify progress steps [default: "1e5"]', default='1e5')
+    parser.add_argument('-CHROM', help='specify CHROM column [default: 0]', type=int, default=0)
+    parser.add_argument('-POS', help='specify POS column [default: 1]', type=int, default=1)
+    parser.add_argument('-ID', help='specify ID column [default: 2]', type=int, default=2)
+    parser.add_argument('-REF', help='specify REF column [default: 3]', type=int, default=3)
+    parser.add_argument('-ALT', help='specify ALT column [default: 4]', type=int, default=4)
+    parser.add_argument('-QUAL', help='specify ALT column [default: 5]', type=int, default=5)
+    parser.add_argument('-FILTER', help='specify FILTER column [default: 6]', type=int, default=6)
+    parser.add_argument('-INFO', help='specify INFO column [default: 7]', type=int, default=7)
+    parser.add_argument('-FORMAT', help='specify FORMAT column [default: 8]', type=int, default=8)
+    args = parser.parse_args()
+    #IVCF
+    if args.ivcf is None:
+        parser.print_help()
+        sys.exit('\nPlease specify input vcf file')
+    args.ivcf = os.path.abspath(args.ivcf)
+    #TABIX
+    if args.tabix:
+        if args.R is None:
+            parser.print_help()
+            sys.exit('\nPlease specify reference fasta file, since [tabix: "true"]')
+        args.R = os.path.abspath(args.R)
+    #OUTPUT
+    if args.o is None:
+        parser.print_help()
+        sys.exit('\nPlease specify output path')
+    args.o = os.path.abspath(args.o)
+    #CHROMOSOMES
+    if args.chr is None:
+        sys.exit('\nPlease specify chromosomes to be parsed as a comma-seperated list')
+    args.chr = args.chr.split(',')
+    #SAMPLES
+    if args.samples is None:
+        parser.print_help()
+        sys.exit('\nPlease specify sample names to be parsed as comma-seperated list\ngrep "#CHROM" vcf')
+    args.samples = args.samples.split(',')
+    #SAMPLES ID
+    if args.id is None:
+        parser.print_help()
+        sys.exit('\nPlease specify CONSENSUS name')
+    #FILTER
+    args.filterarg = args.filterarg.split(',')
+    #MQ OPTION
+    args.missingGT = get_missing_type(args.missingGT)
+    args.ps = int(float(args.ps))
+    print(args)
+    #GET ARGS DICT
+    argsDICT = vars(args)
+    argsDICT['args'] = ['"'+str(args).replace("'","")+'"']
+    #STATIC DICT
+    polytypeDICT = {}
+    filterDICT = {'passed':0, 'filterarg':0, 'MQ':0, 'NCF':0, 'CDP':0}
+    if argsDICT['tabix']:
+        with open(argsDICT['o']+'.tsv','w') as tsvouthandle:
+            with open(argsDICT['o']+'.vcf','w') as vcfouthandle:
+                reference_length = get_reference_len(argsDICT)
+                header = get_header_from_gz(argsDICT)
+                argsDICT['samples_pos'] = get_sample_pos_from_header(argsDICT)
+                tb = tabix.open(argsDICT['ivcf'])
+                counter = 0
+                if not argsDICT['H']:
+                    for h in header:
+                        vcfouthandle.write(h)
+                for tmp_chr in argsDICT['chr']:
+                    tmp_chr_length = reference_length[tmp_chr]
+                    records = tb.query(tmp_chr,1,tmp_chr_length)
+                    for lines in records:
+                        record = vcfRecord(lines, argsDICT['CHROM'], argsDICT['POS'], argsDICT['ID'], argsDICT['REF'], argsDICT['ALT'], argsDICT['QUAL'], argsDICT['FILTER'], argsDICT['INFO'], argsDICT['FORMAT'], argsDICT['samples_pos'], argsDICT['tabix'])
+                        parse_record(record, argsDICT, polytypeDICT, filterDICT, tsvouthandle, vcfouthandle)
+                        if not argsDICT['po']:
+                            counter += 1
+                            if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) != 0:
+                                sys.stderr.write('.')
+                            if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) == 0:
+                                sys.stderr.write('o')
+    if not argsDICT['tabix']:
+        with open(argsDICT['o']+'.tsv','w') as tsvouthandle:
+            with open(argsDICT['o']+'.vcf','w') as vcfouthandle:
+                header = get_header_from_vcf(argsDICT)
+                argsDICT['samples_pos'] = get_sample_pos_from_header(argsDICT)
+                counter = 0
+                if not argsDICT['H']:
+                    for h in header:
+                        vcfouthandle.write(h)
+                with open(argsDICT['ivcf'],'rU') as vcfhandle:
+                    for lines in vcfhandle:
+                        if lines[0]=='#':
+                            continue
+                        record = vcfRecord(lines, argsDICT['CHROM'], argsDICT['POS'], argsDICT['ID'], argsDICT['REF'], argsDICT['ALT'], argsDICT['QUAL'], argsDICT['FILTER'], argsDICT['INFO'], argsDICT['FORMAT'], argsDICT['samples_pos'], argsDICT['tabix'])
+                        #continue if not in chromosome_ids
+                        if record.CHROM not in argsDICT['chr']:
+                            continue
+                        #else
+                        parse_record(record, argsDICT, polytypeDICT, filterDICT, tsvouthandle, vcfouthandle)
+                        if not argsDICT['po']:
+                            counter += 1
+                            if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) != 0:
+                                sys.stderr.write('.')
+                            if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) == 0:
+                                sys.stderr.write('o')
+    sys.stderr.write('\n')
+    print polytypeDICT
+    print filterDICT
+
+
+if __name__ == '__main__':
+    main()
+
+
+#TESTING
+#args.ivcf='test.vcf'
+#args.o='test.out'
+#args.chr='chr22'
+#args.id='test'
+#args.samples='BLANK,NA12878,NA12892,NA19238'