#!/usr/bin/python
# -*- coding: UTF-8 -*-

'''
author: Kristian K Ullrich
date: July 2017
email: ullrich@evolbio.mpg.de
License: MIT

The MIT License (MIT)

Copyright (c) 2017 Kristian K Ullrich

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

This software relies on the following python packages:
sys
os
re
numpy https://pypi.python.org/pypi/numpy
interval https://pypi.python.org/pypi/interval
random
argparse
tabix https://pypi.python.org/pypi/pytabix
gzip 
itertools
Bio https://github.com/biopython/DIST
collections
'''


import sys
import os
import re
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
from interval import interval, imath, inf
import random
import argparse
import tabix
import gzip
import itertools
from itertools import repeat
from Bio import SeqIO
from Bio.Data import CodonTable as CT
from Bio import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
import collections


#classes
class vcfRecord:
    
    
    def __init__(self, record, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLES, tabix):
        if not tabix:
            self.splitted = record.strip().split('\t')
        if tabix:
            self.splitted = record
        self.CHROM = self.splitted[CHROM]
        self.POS = self.splitted[POS]
        self.ID = self.splitted[ID]
        self.REF = [self.splitted[REF]]
        self.ALT = self.splitted[ALT].split(',')
        self.QUAL = self.splitted[QUAL]
        self.FILTER = self.splitted[FILTER]
        self.INFO = self.splitted[INFO]
        self.FORMAT = self.splitted[FORMAT].split(':')
        self.SAMPLES = collections.OrderedDict()
        for s in SAMPLES:
            self.SAMPLES[s] = collections.OrderedDict()
            if len(self.splitted[s].split(':')) != len(self.FORMAT):
                self.splitted[s]+=':'+':'.join(list(np.repeat("0",len(self.FORMAT)-len(self.splitted[s].split(':')))))
            for p,f in enumerate(self.FORMAT):
                self.SAMPLES[s][f] = self.splitted[s].split(':')[p].split(',')
                if len(self.SAMPLES[s][f]) == 1:
                    self.SAMPLES[s][f] = self.SAMPLES[s][f][0]
    
    
    def get_var_len(self):
        """" Return length of alleles """
        var_len = [len(x) for x in self.REF]
        var_len += [len(x) for x in self.ALT]
        return var_len
    
    
    def get_var_number(self):
        """ Return total number of alleles """
        return len([self.REF])+len(self.ALT)
    
    
    def get_ref_alt(self):
        """ Return alleles """
        REF_ALT = self.REF+self.ALT
        return REF_ALT
    
    
    def is_filtered(self, argsDICT):
        """ Return True if FILTER of record is in filterarg """
        if len(argsDICT['filterarg'])!=1:
            if self.FILTER not in argsDICT['filterarg']:
                return True
            if self.FILTER in argsDICT['filterarg']:
                return False
        if len(argsDICT['filterarg']) == 1 and argsDICT['filterarg'][0]!='':
            if self.FILTER not in argsDICT['filterarg']:
                return True
            if self.FILTER in argsDICT['filterarg']:
                return False
        if len(argsDICT['filterarg']) == 1 and argsDICT['filterarg'][0] == '':
            return False
    
    
    def get_var_type(self):
        """ Return variant type according to GATK """
        var_type = 'NA'
        var_len = self.get_var_len()
        var_number = self.get_var_number()
        if len(self.ALT) == 1 and self.ALT[0] == '.':
            var_type = 'noVAR'
            return var_type
        if len(self.ALT) == 1 and self.ALT[0] == '*':
            var_type = 'complexVAR'
            return var_type
        if var_number == 2:
            if var_len[0] == 1 and var_len[1] == 1:
                #equals GATK SNP
                var_type = 'biSNP'
            if var_len[0] > 1 and var_len[1] > 1 and var_len[0] == var_len[1]:
                #equals GATK MNP
                var_type = 'repSNP'
            if var_len[0] > var_len[1]:
                var_type = 'deletion'
            if var_len[0] < var_len[1]:
                var_type = 'insertion'
            return var_type
        #equals GATK MULTI-ALLELIC
        if var_number != 2:
            if all( [x == var_len[0] for x in var_len[1::]] ) and all( [x == 1 for x in var_len] ):
                #equals GATK SNP
                var_type = 'muSNP'
            #TODO 'murepSNP'
            if all( [x < var_len[0] for x in var_len[1::]] ):
                var_type = 'mudeletion'
            if all( [x > var_len[0] for x in var_len[1::]] ):
                var_type = 'muinsertion'
            if any( [x < var_len[0] for x in var_len[1::]] ) and any( [x == var_len[0] for x in var_len[1::]] ):
                var_type = 'complexDS'
            if any( [x > var_len[0] for x in var_len[1::]] ) and any( [x == var_len[0] for x in var_len[1::]] ):
                var_type = 'complexIS'
            if any( [x < var_len[0] for x in var_len[1::]] ) and any( [x > var_len[0] for x in var_len[1::]] ):
                var_type = 'complexDI'
            if any( [x < var_len[0] for x in var_len[1::]] ) and any( [x > var_len[0] for x in var_len[1::]] ) and any( [x == var_len[0] for x in var_len[1::]] ):
                var_type = 'complexDIS'
            return var_type
    
    
    def keep_samples(self, argsDICT):
        """ Keep only samples in record which are listed """
        for s in self.SAMPLES.copy():
            if s not in argsDICT['samples_pos']:
                self.SAMPLES.pop(s)
    
    
    def get_info(self, value):
        """" Return INFO value """
        return [x[1] for x in [x.split('=') for x in self.INFO.split(';')] if x[0] == value][0]
    
    
    def add_info(self, value):
        """ Add INFO value """
        self.INFO+=str(value)
    
    
    def add_format_on_samples(self, s, f, v):
        """ Add new FORMAT field to sample """
        self.SAMPLES[s][f] = v
    
    
    def replace_format_on_samples(self, s, f, v):
        """ Replace FORMAT field of sample """ 
        self.SAMPLES[s][f] = v
    
    
    def get_samples_ad(self):
        """ Return samples AD as list """
        return [self.SAMPLES[x]['AD'] for x in self.SAMPLES]
    
    
    def get_samples_gt(self):
        """ Return samples GT as list """
        return [self.SAMPLES[x]['GT'] for x in self.SAMPLES]
    
    
    def get_samples_gq(self):
        """ Return samples GQ as list """
        return [0 if self.SAMPLES[x]['GQ'] == "." else int(self.SAMPLES[x]['GQ']) for x in self.SAMPLES]


    def get_samples_gp(self):
        """ Return samples GP as list """
        return [[0 if y == "." else float(y) for y in self.SAMPLES[x]['GP']] for x in self.SAMPLES]

    
    def get_samples_dp(self):
        """ Return samples DP as list """
        return [0 if self.SAMPLES[x]['DP'] == "." else int(self.SAMPLES[x]['DP']) for x in self.SAMPLES]
    
    
    def get_samples_pl(self):
        """ Return samples PL as list """
        return [self.SAMPLES[x]['PL'] for x in self.SAMPLES]
    
   
    def convert_pl_gp(self):
        """ Converts phred-scale PL into GP """
        return [list([np.exp(float(x)/-10) for x in y]/np.sum([np.exp(float(x)/-10) for x in y])) for y in self.get_samples_pl()]
    
    
    def get_samples_ad_len(self):
        """ Return samples AD len as list """
        return [len(self.SAMPLES[x]['AD']) for x in self.SAMPLES]
    
    
    def get_samples_ad_ref(self):
        """ Return samples AD REF as list """
        return [self.SAMPLES[x]['AD'][0] for x in self.SAMPLES]
    
    
    def get_samples_ad_alt(self):
        """ Return samples AD ALT as list """
        return [self.SAMPLES[x]['AD'][1:] for x in self.SAMPLES]
    
    
    def gt_type(self, x):
        if len(x)==1:
            if x[0] == '.':
                return 'NA'
            if x[0] != '.':
                return 'hap'
        if x[0] == '.' and x[1] == '.':
            return 'NA'
        if x[0] != '.' and x[1] != '.' and x[0] == x[1]:
            return 'hom'
        if x[0] != '.' and x[1] != '.' and x[0] != x[1]:
            return 'het'
    
    
    def get_gt_type(self):
        """ Return GT type """
        gt_split = re.compile("[|/]")
        GTs = [re.split(gt_split,x) for x in self.get_samples_gt()]
        return [self.gt_type(x) for x in GTs]
    
    
    def get_gt_na_pos(self):
        """ Return index of GT which is NA """
        samplesgttype = self.get_gt_type()
        samplesgtnapos = [x for x,y in enumerate(samplesgttype) if y == 'NA']
        return samplesgtnapos
    
    
    def fill_empty_ad(self):
        """ Replaces "." with 0 and fills to var_number if less """
        x_ = [x+list(np.repeat(0,self.get_var_number()-len(x))) for x in [[int(x.replace(".","0")) for x in y] for y in self.get_samples_ad()]]
        for x,y in zip(x_, self.SAMPLES.keys()):
            self.SAMPLES[y]['AD'] = x
     
     
    def fill_empty_pl(self):
        """ Replaces "." with [0,0,0] """
        x_ = [x+list(np.repeat(0,nCk(self.get_var_number()+1,2)-len(x))) for x in [[int(x.replace(".","0")) for x in y] for y in self.get_samples_pl()]]
        for x,y in zip(x_, self.SAMPLES.keys()):
            self.SAMPLES[y]['PL'] = x
    
    
    def set_samples_ad_by_gt(self, argsDICT):
        if argsDICT['missingGT'][0] == 'keep':
            return
        if argsDICT['missingGT'][0] == 'zero':
            samplesgtnapos = self.get_gt_na_pos()
            #set AD to zero for all samples with GT equals NA
            for s in self.get_gt_na_pos():
                self.replace_format_on_samples(self.SAMPLES.keys()[s], 'AD', list(np.repeat(0, self.get_var_number())))
        if argsDICT['missingGT'][0] == 'set':
            samplesgtnapos = self.get_gt_na_pos()
            #set AD to certain value for all samples with GT equals NA
            for s in self.get_gt_na_pos():
                self.replace_format_on_samples(self.SAMPLES.keys()[s], 'AD', [argsDICT['missingGT'][1]]+list(np.repeat(argsDICT['missingGT'][2], self.get_var_number()-1)))
    
    
    def set_dp_by_ad(self):
        samplesad = self.get_samples_ad()
        samplesadsum = [np.sum([int(x) for x in x]) for x in samplesad]
        #set DP to ADSUM
        for i in range(0, len(self.SAMPLES)):
            self.replace_format_on_samples(self.SAMPLES.keys()[i], 'DP', samplesadsum[i])
    
    
    def set_samples_gp_by_gt(self, argsDICT):
        samplespl = self.get_samples_pl()
        samplesgp = self.convert_pl_gp()
        if argsDICT['missingGT'][0] == 'keep':
            for i in range(0, len(self.SAMPLES)):
                self.replace_format_on_samples(self.SAMPLES.keys()[i], 'GP', samplesgp[i])
        if argsDICT['missingGT'][0] == 'equal':
            samplesgtnapos = self.get_gt_na_pos()
            #set GP to 0.333 for all samples with GT equals NA
            for s in self.get_gt_na_pos():
                samplesgp[s]=list(np.repeat(float(0.333),3))
            for i in range(0, len(self.SAMPLES)):
                self.replace_format_on_samples(self.SAMPLES.keys()[i], 'GP', samplesgp[i])
        if argsDICT['missingGT'][0] == 'set':
            samplesgtnapos = self.get_gt_na_pos()
            #set GP to certain value for all samples with GT equals NA
            for s in self.get_gt_na_pos():
                samplesgp[s]=[argsDICT['missingGT'][1]]+[argsDICT['missingGT'][2]]+[argsDICT['missingGT'][3]]
            for i in range(0, len(self.SAMPLES)):
                self.replace_format_on_samples(self.SAMPLES.keys()[i], 'GP', samplesgp[i])
    

    def set_samples_gp_by_dp(self, argsDICT):
        #setGPbyDP
        samplesdp = self.get_samples_dp()
        samplesgp = self.get_samples_gp()
        #get NotCalled positions
        minDP_NCpos = [x for x,y in enumerate(self.get_samples_dp()) if y < argsDICT['minDP']]
        maxDP_NCpos = [x for x,y in enumerate(self.get_samples_dp()) if y > argsDICT['maxDP']]
        #combine and unique NotCalled positions
        NCpos = [x for x in set(minDP_NCpos+maxDP_NCpos)]
        for x in NCpos:
            samplesgp[x]=list(np.repeat(float(0.333),3))
        for i in range(0, len(self.SAMPLES)):
            self.replace_format_on_samples(self.SAMPLES.keys()[i], 'GP', samplesgp[i])
    
    
    def set_samples_gp_by_gq(self, argsDICT):
        #setGPbyDP
        samplesgq = self.get_samples_gq()
        samplesgp = self.get_samples_gp()
        #get NotCalled positions
        minGQ_NCpos = [x for x,y in enumerate(self.get_samples_gq()) if y < argsDICT['minGQ']]
        #combine and unique NotCalled positions
        NCpos = [x for x in set(minGQ_NCpos)]
        for x in NCpos:
            samplesgp[x]=list(np.repeat(float(0.333),3))
        for i in range(0, len(self.SAMPLES)):
            self.replace_format_on_samples(self.SAMPLES.keys()[i], 'GP', samplesgp[i])
    
    
    def set_samples_gt_by_ad(self, argsDICT):
        if argsDICT['resetGT'] == 'keep':
            return
        if argsDICT['resetGT'] == 'AD':
            #setGTbyAD
            samplesad_ref = self.get_samples_ad_ref()
            samplesad_alt = self.get_samples_ad_alt()
            #NOTE np.argmax return the first element for equal values which would bias GT setting always to the first ALT entry if equal depth exist
            #NOTE this is only used to determine NotCalledFraction, AD for other alleles will be considered
            samplesad_alt_maxpos = [np.argmax(x) for x in samplesad_alt]
            samplesad_alt_max = [np.max(x) for x in samplesad_alt]
            samplesad_ref_alt_max = map(lambda a, b: [a,b], samplesad_ref, samplesad_alt_max)
            samplesad_ref_alt_maxpos = map(lambda a, b: [a,b], list(np.repeat(0,len(samplesad_ref))), [x+1 for x in samplesad_alt_maxpos])
            samplesgt = []
            for a, b in zip(samplesad_ref_alt_max, samplesad_ref_alt_maxpos):
                samplesgt.append(evaluate_ref_alt_max(a, b))
            #get NotCalled positions
            minDP_NCpos = [x for x,y in enumerate(self.get_samples_dp()) if y < argsDICT['minDP']]
            maxDP_NCpos = [x for x,y in enumerate(self.get_samples_dp()) if y > argsDICT['maxDP']]
            minGQ_NCpos = [x for x,y in enumerate(self.get_samples_gq()) if y < argsDICT['minGQ']]
            #combine and unique NotCalled positions
            NCpos = [x for x in set(minDP_NCpos+maxDP_NCpos+minGQ_NCpos)]
            for x in NCpos:
                samplesgt[x]='./.'
            for i in range(0, len(self.SAMPLES)):
                self.replace_format_on_samples(self.SAMPLES.keys()[i], 'GT', samplesgt[i])
    
    
    def get_consensus_calldata(self, argsDICT):
        """ Return consensus of existing record samples """
        consensusdict = collections.OrderedDict()
        for rf in self.FORMAT:
            consensusdict[rf] = None
        consensusdict['AD'] = self.merge_ad()
        consensusdict['DP'] = np.sum(consensusdict['AD'])
        consensusdict['GQ'] = self.merge_gq()
        #evaluate CONSENSUSPL from SAMPLESPL AND SAMPLESGTNAPOS
        consensusdict['PL'] = self.merge_pl()
        consensusdict['GT'] = self.set_consensus_gt_by_ad(consensusdict)
        return consensusdict
    
    
    def set_consensus_gt_by_ad(self, consensusdict):
        consensusad_ref = consensusdict['AD'][0]
        consensusad_alt = consensusdict['AD'][1:]
        #NOTE np.argmax return the first element for equal values which would bias GT setting always to the first ALT entry if equal depth exist
        consensusad_alt_maxpos = np.argmax(consensusad_alt)
        consensusad_alt_max = np.max(consensusad_alt)
        consensusad_ref_alt_max = [consensusad_ref, consensusad_alt_max]
        consensusad_ref_alt_maxpos = [0, consensusad_alt_maxpos+1]
        consensusgt = evaluate_ref_alt_max(consensusad_ref_alt_max, consensusad_ref_alt_maxpos)
        return consensusgt
    
    
    def merge_ad(self):
        ADOUT = []
        for i in range(0, self.get_var_number()):
            ADOUT.append(np.sum([int(y[i]) for x,y in enumerate(self.get_samples_ad()) if x not in self.get_gt_na_pos()]))
        return ADOUT
    
    
    def merge_gq(self):
        return int(np.median([y for x,y in enumerate(self.get_samples_gq()) if x not in self.get_gt_na_pos()]))
    
    
    def merge_dp(self):
        return np.sum([y for x,y in enumerate(self.get_samples_dp()) if x not in self.get_gt_na_pos()])
    
    
    def merge_pl(self):
        PLOUT = []
        for i in range(0,nCk(self.get_var_number()+1,2)):
            PLOUT.append(int(np.median([y[i] for x,y in enumerate(self.get_samples_pl()) if x not in self.get_gt_na_pos()])))
        return PLOUT
    
    

    def get_ACGT_allele_counts(self, var_type):
        if var_type != 'noVAR' and var_type != 'biSNP' and var_type != 'muSNP':
            return
        if var_type == 'repSNP':
            for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
                self.add_format_on_samples(sample_, 'ACGT_count', get_ACGT_count_dict(0,0,0,0))
                self.SAMPLES[sample_]['ACGT_count'][list(set(self.REF[0]))[0]] = self.get_samples_ad_ref()[s_pos]
                for alt_pos, alt_ in enumerate(self.ALT):
                    self.SAMPLES[sample_]['ACGT_count'][list(set(alt_))[0]] = self.get_samples_ad_alt()[s_pos][alt_pos]
        for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
            self.add_format_on_samples(sample_, 'ACGT_count', get_ACGT_count_dict(0,0,0,0))
            self.SAMPLES[sample_]['ACGT_count'][self.REF[0]] = self.get_samples_ad_ref()[s_pos]
            for alt_pos, alt_ in enumerate(self.ALT):
                self.SAMPLES[sample_]['ACGT_count'][alt_] = self.get_samples_ad_alt()[s_pos][alt_pos]
        
    
    def get_alleles(self, var_type_, type_):
        if var_type_ == 'noVAR':
            MAJOUT = list(np.repeat(self.REF,len(self.SAMPLES)))
            MINOUT = list(np.repeat(self.REF,len(self.SAMPLES)))
            IUPACOUT = list(np.repeat(self.REF,len(self.SAMPLES)))
            for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
                MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _biSNP(self, sample_, type_)
            return [MAJOUT,MINOUT,IUPACOUT]
        if var_type_ == 'biSNP':
            MAJOUT = list(np.repeat('N',len(self.SAMPLES)))
            MINOUT = list(np.repeat('N',len(self.SAMPLES)))
            IUPACOUT = list(np.repeat('N',len(self.SAMPLES)))
            for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
                MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _biSNP(self, sample_, type_)
            return [MAJOUT,MINOUT,IUPACOUT]
        if var_type_ == 'muSNP':
            MAJOUT = list(np.repeat('N',len(self.SAMPLES)))
            MINOUT = list(np.repeat('N',len(self.SAMPLES)))
            IUPACOUT = list(np.repeat('N',len(self.SAMPLES)))
            for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
                MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _muSNP(self, sample_, type_)
            return [MAJOUT,MINOUT,IUPACOUT]
        if var_type_ == 'repSNP':
            MAJOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))
            MINOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))
            IUPACOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))
            for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
                MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _repSNP(self, sample_, type_)
            return [MAJOUT,MINOUT,IUPACOUT]
        if var_type_ == 'deletion':
            MAJOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))
            MINOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))
            IUPACOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))            
            for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
                MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _deletion(self, sample_, type_)
            return [MAJOUT,MINOUT,IUPACOUT]
        if var_type_ == 'mudeletion':
            MAJOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))
            MINOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))
            IUPACOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))            
            for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
                MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _mudeletion(self, sample_, type_)
            return [MAJOUT,MINOUT,IUPACOUT]
        if var_type_ == 'insertion':
            MAJOUT = list(np.repeat(self.REF,len(self.SAMPLES)))
            MINOUT = list(np.repeat(self.REF,len(self.SAMPLES)))
            IUPACOUT = list(np.repeat(self.REF,len(self.SAMPLES)))            
            return [MAJOUT,MINOUT,IUPACOUT]
        if var_type_ == 'muinsertion':
            MAJOUT = list(np.repeat(self.REF,len(self.SAMPLES)))
            MINOUT = list(np.repeat(self.REF,len(self.SAMPLES)))
            IUPACOUT = list(np.repeat(self.REF,len(self.SAMPLES)))            
            return [MAJOUT,MINOUT,IUPACOUT]
        if var_type_ == 'complexDS':
            MAJOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))
            MINOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))
            IUPACOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))
            for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
                MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _complexDS(self, sample_, type_)
            return [MAJOUT,MINOUT,IUPACOUT]
        if var_type_ == 'complexIS':
            MAJOUT = list(np.repeat('N',len(self.SAMPLES)))
            MINOUT = list(np.repeat('N',len(self.SAMPLES)))
            IUPACOUT = list(np.repeat('N',len(self.SAMPLES)))
            for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
                MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _complexIS(self, sample_, type_)
            return [MAJOUT,MINOUT,IUPACOUT]
        if var_type_ == 'complexDI':
            MAJOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))
            MINOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))
            IUPACOUT = list(np.repeat('N'*np.max(self.get_var_len()),len(self.SAMPLES)))
            for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
                MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _complexDI(self, sample_, type_)
            return [MAJOUT,MINOUT,IUPACOUT]
        if var_type_ == 'complexDIS':
            return


#common dictionaries
IUPACdict = {'A':'A','AA':'A','C':'C','CC':'C','G':'G','GG':'G','T':'T','TT':'T','-':'-','N':'N','.':'.',
'AG':'R','GA':'R','CT':'Y','TC':'Y','CG':'S','GC':'S','AT':'W','TA':'W','GT':'K','TG':'K','AC':'M','CA':'M',
'CGT':'B','CTG':'B','GCT':'B','GTC':'B','TCG':'B','TGC':'B',
'AGT':'D','ATG':'D','GAT':'D','GTA':'D','TAG':'D','TGA':'D',
'ACT':'H','ATC':'H','CAT':'H','CTA':'H','TAC':'H','TCA':'H',
'ACG':'V','AGC':'V','CAG':'V','CGA':'V','GAC':'V','GCA':'V',
'ACGT':'N','ACTG':'N','AGCT':'N','AGTC':'N','ATCG':'N','ATGC':'N',
'CAGT':'N','CATG':'N','CGAT':'N','CGTA':'N','CTAG':'N','CTGA':'N',
'GACT':'N','GATC':'N','GCAT':'N','GCTA':'N','GTAC':'N','GTCA':'N',
'TACG':'N','TAGC':'N','TCAG':'N','TCGA':'N','TGAC':'N','TGCA':'N',
'A-':'N','C-':'N','G-':'N','T-':'N',
'-A':'N','-C':'N','-G':'N','-T':'N',
'-AG':'N','-CT':'N','-CG':'N','-AT':'N','-GT':'N','-AC':'N',
'-CGT':'N','-AGT':'N','-ACT':'N','-ACG':'N',
'-ACGT':'N',
'A*':'N','C*':'N','G*':'N','T*':'N'}


#common functions
def sliding_window_steps_generator(sw,j,start,end):
    if end<=start:
        print 'end must be smaller than start'
        return
    start_seq = np.arange(start,end,j)
    end_seq = np.arange(start+sw-1,end,j)
    end_seq = np.append(end_seq,np.repeat(end,len(start_seq)-len(end_seq)))
    mid_seq = (start_seq+end_seq)/2
    return([start_seq,mid_seq,end_seq])


def getIUPAC(x):
    return IUPACdict[x]


def POINT2zero(x):
    if x == '.':
        return '0'
    else:
        return x


def POINTNA2zero(x):
    if x == '.':
        return '0'
    if x == 'NA':
        return '0'
    else:
        return x


def NA2zero(x):
    if x == 'NA':
        return int(0)
    else:
        return int(x)


def nCk(n, k):
    return len(list(itertools.combinations(range(n), k)))


def evaluate_ref_alt_max(x,y):
    if x[0]==0 and x[1]==0:
        return './.'
    if x[0]>0 and x[1]==0:
        return str(y[0])+'/'+str(y[0])
    if x[0]==0 and x[1]>0:
        return str(y[1])+'/'+str(y[1])
    if x[0]>0 and x[1]>0:
        return str(y[0])+'/'+str(y[1])


def get_header_from_vcf( argsDICT ):
    HEADER = []
    with open(argsDICT['ivcf'],'rb') as f:
        for flines in f:
           if flines[:2]=='##':
               HEADER.append(flines)
           if flines[:4]=='#CHR':
               HEADER.append('##INFO=<ID=NCFnumber,Number=1,Type=String,Description="NotCalledFraction number">\n')
               HEADER.append('##INFO=<ID=NCFfraction,Number=1,Type=Float,Description="NotCalledFraction fraction">\n')
               HEADER.append('##mvcf2consensus.py '+argsDICT['args'][0]+'\n')
               HEADER.append(changeHEADER(flines, argsDICT['add'], argsDICT['id'], argsDICT['samples']))
           if flines[0]!='#':
               break
    return HEADER


def get_header_from_gz( argsDICT ):
    HEADER = []
    with gzip.open(argsDICT['ivcf'],'rb') as f:
        for flines in f:
           if flines[:2]=='##':
               HEADER.append(flines)
           if flines[:4]=='#CHR':
               HEADER.append('##INFO=<ID=NCFnumber,Number=1,Type=String,Description="NotCalledFraction number">\n')
               HEADER.append('##INFO=<ID=NCFfraction,Number=1,Type=Float,Description="NotCalledFraction fraction">\n')
               HEADER.append('##mvcf2consensus.py '+argsDICT['args'][0]+'\n')
               HEADER.append(changeHEADER(flines, argsDICT['add'], argsDICT['id'], argsDICT['samples']))
           if flines[0]!='#':
               break
    return HEADER


def get_sample_pos_from_header( argsDICT ):
    SAMPLE_POS = []
    if argsDICT['tabix']:
        with gzip.open(argsDICT['ivcf'],'rb') as f:
            for flines in f:
               if flines[:4]=='#CHR':
                   flines_stripped = flines.strip().split('\t')
               if flines[0]!='#':
                   break
    if not argsDICT['tabix']:
        with open(argsDICT['ivcf'],'rb') as f:
            for flines in f:
                if flines[:4]=='#CHR':
                    flines_stripped = flines.strip().split('\t')
                if flines[0]!='#':
                    break
    for s in argsDICT['samples']:
        if s in ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT']:
            SAMPLE_POS.append([x for x,y in enumerate(flines_stripped) if y == s][1])
        if s not in ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT']:
            SAMPLE_POS.append([x for x,y in enumerate(flines_stripped) if y == s][0])
    return SAMPLE_POS


def get_sample_pos_dict_from_header( argsDICT ):
    sampleposDICT = {}
    if argsDICT['tabix']:
        with gzip.open(argsDICT['ivcf'],'rb') as f:
            for flines in f:
               if flines[:4]=='#CHR':
                   flines_stripped = flines.strip().split('\t')
               if flines[0]!='#':
                   break
    if not argsDICT['tabix']:
        with open(argsDICT['ivcf'],'rb') as f:
            for flines in f:
                if flines[:4]=='#CHR':
                    flines_stripped = flines.strip().split('\t')
                if flines[0]!='#':
                    break
    for s in argsDICT['samples']:
        if s in ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT']:
            sampleposDICT[s]=[x for x,y in enumerate(flines_stripped) if y == s][1]
        if s not in ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT']:
            sampleposDICT[s]=[x for x,y in enumerate(flines_stripped) if y == s][0]
    return sampleposDICT



def changeHEADER(header, add, name, samples):
    if add:
        headersplit = header.strip().split('\tFORMAT')
        return('\t'.join([headersplit[0]]+['FORMAT']+samples+[name])+'\n')
    if not add:
        headersplit = header.strip().split('\tFORMAT')
        return('\t'.join([headersplit[0]]+['FORMAT']+[name])+'\n')


def get_missing_type(x):
    if x == 'keep':
        return ['keep', 0, 0]
    if x == 'zero':
        return ['zero', 0, 0]
    if 'set' in x:
        return ['set', int(x.split(':')[1]), int(x.split(':')[2])]


def get_missing_type2(x):
    if x == 'keep':
        return ['keep', 0.333, 0.333, 0.333]
    if x == 'equal':
        return ['equal', 0.333, 0.333, 0.333]
    if 'set' in x:
        return ['set', float(x.split(':')[1]), float(x.split(':')[2]), float(x.split(':')[3])]


def get_reference_len( argsDICT ):
    print 'start obtaining reference length'
    length_dict = {}
    for seq in SeqIO.parse( argsDICT['R'], 'fasta' ):
        if argsDICT['verbose']:
            print seq.id
        if seq.id not in argsDICT['chr']:
            continue
        if seq.id in argsDICT['chr']:
            length_dict[seq.id] = len( seq )
    print 'finished obtaining reference length'    
    return length_dict


def duplicate_sequence(fastafile, id):
    # type: (object, object) -> object
    for rec in SeqIO.parse(fastafile, "fasta"):
        if rec.id == id:
            duplicated_seq = SeqRecord.SeqRecord(rec.seq.tomutable())
            duplicated_seq.id = rec.id
            duplicated_seq.name = rec.name
            duplicated_seq.description = rec.description
            return duplicated_seq


def get_mask_dict ( argsDICT ):
    cov = int(argsDICT['cov2N'])
    chromosomes = argsDICT['chr']
    ibga = argsDICT['ibga']
    maskDICT={}
    print 'start loading masking bga'
    for lines in open(ibga, 'r'):
        chr_, start_, end_, cov_ = lines.strip().split( '\t' )
        start_ = int(start_)
        end = int(end_)
        cov_ = int(cov_)
        if chr_ not in chromosomes:
            continue
        if chr_ not in maskDICT:
            print 'working on %s' % ( chr_ )
            maskDICT[chr_] = {}
        if cov_ > cov:
            continue
        if cov_ <= cov:
            maskDICT[chr_][str( int( start_ ) + 1 ) + '_' + end_] = 0
    for mchr in maskDICT.keys():
        maskDICT[mchr] = interval.union([interval([x.split('_')[0],x.split('_')[1]]) for x in maskDICT[mchr].keys()])
    print 'finished loading masking bga'
    return maskDICT


def get_mask_dict_multiple_by_region ( argsDICT, referenceDICT ):
    cov = int(argsDICT['cov2N'])
    chromosome = argsDICT['chr']
    print 'start loading masking bga'
    for s_pos, sample_ in enumerate(argsDICT['samples_pos']):
        print 'working on %s' % ( argsDICT['samples'][s_pos] )
        ibga = argsDICT['ibga'][s_pos]
        for lines in open(ibga, 'r'):
            chr_, start_, end_, cov_ = lines.strip().split( '\t' )
            start_ = int(start_)
            end = int(end_)
            cov_ = int(cov_)
            if chr_ not in chromosome:
                continue
            if cov_ > cov:
                continue
            if cov_ <= cov:
                referenceDICT[sample_][chr_]['BGA'][str( int( start_ ) + 1 ) + '_' + end_] = 0
        referenceDICT[sample_][chromosome]['BGA'] = interval([int(argsDICT['chr_start']),int(argsDICT['chr_end'])]) & interval.union([interval([x.split('_')[0],x.split('_')[1]]) for x in referenceDICT[sample_][chromosome]['BGA'].keys()])
        print 'finished working on %s' % ( argsDICT['samples'][s_pos] )
    print 'finished loading masking bga'


def get_ACGT_count_dict(A_, C_, G_, T_):
    ACGT_count_dict = collections.OrderedDict()
    ACGT_count_dict['A'] = A_
    ACGT_count_dict['C'] = C_
    ACGT_count_dict['G'] = G_
    ACGT_count_dict['T'] = T_
    return ACGT_count_dict


#allele functions
def _biSNP(self, sample_pos, type_):
    if sample_pos not in self.SAMPLES.keys():
        return
    samplesad_ = self.SAMPLES[sample_pos]['AD']
    ref_alt_ = self.get_ref_alt()
    ref_ = ref_alt_[0]
    alt_ = ref_alt_[1:]
    ref_out = 'N'
    alt_out = 'N'
    iupac_out = 'N'
    samplesad_ = [int(x) for x in samplesad_]
    ref_alt_nonzero = [ref_alt_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    samplesad_nonzero = [samplesad_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    if len(samplesad_nonzero) == 0:
        return [ref_out, alt_out, iupac_out]
    admax = np.max([int(x) for x in samplesad_nonzero])
    admin = np.min([int(x) for x in samplesad_nonzero])
    admaxpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admax]
    adminpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]!=admax]
    if type_ == 'refaltN':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refaltsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refmajorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[admaxpos[0]]
            alt_out = ref_alt_nonzero[adminpos[0]]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refminorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[adminpos[0]]
            alt_out = ref_alt_nonzero[admaxpos[0]]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
    if type_ == 'iupac':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]


def _muSNP(self, sample_pos, type_):
    if sample_pos not in self.SAMPLES.keys():
        return
    samplesad_ = self.SAMPLES[sample_pos]['AD']
    ref_alt_ = self.get_ref_alt()
    ref_ = ref_alt_[0]
    alt_ = ref_alt_[1:]
    ref_out = 'N'
    alt_out = 'N'
    iupac_out = 'N'
    samplesad_ = [int(x) for x in samplesad_]
    ref_alt_nonzero = [ref_alt_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    samplesad_nonzero = [samplesad_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    if len(samplesad_nonzero) == 0:
        return [ref_out, alt_out, iupac_out]
    admax = np.max([int(x) for x in samplesad_nonzero])
    admin = np.min([int(x) for x in samplesad_nonzero])
    admaxpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admax]
    adminpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]!=admax]
    if type_ == 'refaltN':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>1:
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refaltsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = random.sample([x for x in ref_alt_nonzero if x != ref_out],1)[0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refmajorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[admaxpos[0]]
            alt_out = ref_alt_nonzero[adminpos[0]]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) != 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample(min_nonzero,1)[0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) == 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample([x for x in max_nonzero if x != ref_out],1)[0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refminorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[adminpos[0]]
            alt_out = ref_alt_nonzero[admaxpos[0]]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) != 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample(min_nonzero,1)[0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) == 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            ref_out = random.sample(min_nonzero,1)[0]
            alt_out = random.sample([x for x in min_nonzero if x != ref_out],1)[0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
    if type_ == 'iupac':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>1:
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]


def _repSNP(self, sample_pos, type_):
    if sample_pos not in self.SAMPLES.keys():
        return
    samplesad_ = self.SAMPLES[sample_pos]['AD']
    ref_alt_ = self.get_ref_alt()
    ref_ = ref_alt_[0]
    alt_ = ref_alt_[1:]
    ref_out = ''.join(list(repeat('N',len(ref_))))
    alt_out = ''.join(list(repeat('N',len(ref_))))
    iupac_out = ''.join(list(repeat('N',len(ref_))))
    samplesad_ = [int(x) for x in samplesad_]
    ref_alt_nonzero = [ref_alt_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    samplesad_nonzero = [samplesad_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    if len(samplesad_nonzero) == 0:
        return [ref_out, alt_out, iupac_out]
    admax = np.max([int(x) for x in samplesad_nonzero])
    admin = np.min([int(x) for x in samplesad_nonzero])
    admaxpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admax]
    adminpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admin]
    if type_ == 'refaltN':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refaltsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refmajorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[admaxpos[0]]
            alt_out = ref_alt_nonzero[adminpos[0]]
            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refminorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[adminpos[0]]
            alt_out = ref_alt_nonzero[admaxpos[0]]
            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
            return [ref_out, alt_out, iupac_out]
    if type_ == 'iupac':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
            return [ref_out, alt_out, iupac_out]


def _deletion(self, sample_pos, type_):
    if sample_pos not in self.SAMPLES.keys():
        return
    samplesad_ = self.SAMPLES[sample_pos]['AD']
    ref_alt_ = self.get_ref_alt()
    ref_ = ref_alt_[0]
    alt_ = ref_alt_[1:]
    ref_out = ref_[:len(alt_[0])]+''.join(list(repeat('N',len(ref_)-len(alt_[0]))))
    alt_out = ref_[:len(alt_[0])]+''.join(list(repeat('N',len(ref_)-len(alt_[0]))))
    iupac_out = ref_[:len(alt_[0])]+''.join(list(repeat('N',len(ref_)-len(alt_[0]))))
    samplesad_ = [int(x) for x in samplesad_]
    ref_alt_nonzero = [ref_alt_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    samplesad_nonzero = [samplesad_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    ref_ad = samplesad_[0]
    alt_ad = samplesad_[1:]
    alt_nonzero = [alt_[x] for x,y in enumerate(alt_ad) if alt_ad[x]!=0]
    alt_ad_nonzero = [alt_ad[x] for x,y in enumerate(alt_ad) if alt_ad[x]!=0]
    deletion_len = [len(ref_)-len(x) for x in alt_nonzero]
    deletion_str = [''.join(list(repeat('-',x))) for x in deletion_len]
    deletion_str_out = [alt_nonzero[x]+deletion_str[x] for x,y in enumerate(alt_nonzero)]
    if ref_ad != 0:
        samplesad_nonzero = [ref_ad]+alt_ad_nonzero
        ref_alt_nonzero = [ref_]+deletion_str_out
    if ref_ad == 0:
        samplesad_nonzero = alt_ad_nonzero
        ref_alt_nonzero = deletion_str_out
    if len(samplesad_nonzero) == 0:
        return [ref_out, alt_out, iupac_out]
    admax = np.max([int(x) for x in samplesad_nonzero])
    admin = np.min([int(x) for x in samplesad_nonzero])
    admaxpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admax]
    adminpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admin]
    if type_ == 'refaltN':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refaltsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refmajorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[admaxpos[0]]
            alt_out = ref_alt_nonzero[adminpos[0]]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refminorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[adminpos[0]]
            alt_out = ref_alt_nonzero[admaxpos[0]]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            return [ref_out, alt_out, iupac_out]
    if type_ == 'iupac':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            return [ref_out, alt_out, iupac_out]


def _mudeletion(self, sample_pos, type_):
    if sample_pos not in self.SAMPLES.keys():
        return
    samplesad_ = self.SAMPLES[sample_pos]['AD']
    ref_alt_ = self.get_ref_alt()
    ref_ = ref_alt_[0]
    alt_ = ref_alt_[1:]
    alt_min = np.min([len(x) for x in alt_])
    ref_out = ref_[:alt_min]+''.join(list(repeat('N',len(ref_)-alt_min)))
    alt_out = ref_[:alt_min]+''.join(list(repeat('N',len(ref_)-alt_min)))
    iupac_out = ref_[:alt_min]+''.join(list(repeat('N',len(ref_)-alt_min)))
    samplesad_ = [int(x) for x in samplesad_]
    ref_alt_nonzero = [ref_alt_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    samplesad_nonzero = [samplesad_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    ref_ad = samplesad_[0]
    alt_ad = samplesad_[1:]
    alt_nonzero = [alt_[x] for x,y in enumerate(alt_ad) if alt_ad[x]!=0]
    alt_ad_nonzero = [alt_ad[x] for x,y in enumerate(alt_ad) if alt_ad[x]!=0]
    deletion_len = [len(ref_)-len(x) for x in alt_nonzero]
    deletion_str = [''.join(list(repeat('-',x))) for x in deletion_len]
    deletion_str_out = [alt_nonzero[x]+deletion_str[x] for x,y in enumerate(alt_nonzero)]
    if ref_ad != 0:
        samplesad_nonzero = [ref_ad]+alt_ad_nonzero
        ref_alt_nonzero = [ref_]+deletion_str_out
    if ref_ad == 0:
        samplesad_nonzero = alt_ad_nonzero
        ref_alt_nonzero = deletion_str_out
    if len(samplesad_nonzero) == 0:
        return [ref_out, alt_out, iupac_out]
    admax = np.max([int(x) for x in samplesad_nonzero])
    admin = np.min([int(x) for x in samplesad_nonzero])
    admaxpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admax]
    adminpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]!=admax]
    if type_ == 'refaltN':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>1:
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refaltsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = random.sample([x for x in ref_alt_nonzero if x != ref_out],1)[0]
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refmajorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[admaxpos[0]]
            alt_out = ref_alt_nonzero[adminpos[0]]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) != 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample(min_nonzero,1)[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) == 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample([x for x in max_nonzero if x != ref_out],1)[0]
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refminorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[adminpos[0]]
            alt_out = ref_alt_nonzero[admaxpos[0]]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) != 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample(min_nonzero,1)[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) == 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            ref_out = random.sample(min_nonzero,1)[0]
            alt_out = random.sample([x for x in min_nonzero if x != ref_out],1)[0]
            return [ref_out, alt_out, iupac_out]
    if type_ == 'iupac':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>1:
            return [ref_out, alt_out, iupac_out]


def _complexDS(self, sample_pos, type_):
    if sample_pos not in self.SAMPLES.keys():
        return
    samplesad_ = self.SAMPLES[sample_pos]['AD']
    ref_alt_ = self.get_ref_alt()
    ref_ = ref_alt_[0]
    alt_ = ref_alt_[1:]
    alt_min = np.min([len(x) for x in alt_])
    alt_max = np.max([len(x) for x in alt_])
    ref_out = ''.join(list(repeat('N',len(ref_))))
    alt_out = ''.join(list(repeat('N',len(ref_))))
    iupac_out = ''.join(list(repeat('N',len(ref_))))
    samplesad_ = [int(x) for x in samplesad_]
    ref_alt_nonzero = [ref_alt_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    samplesad_nonzero = [samplesad_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    ref_ad = samplesad_[0]
    alt_ad = samplesad_[1:]
    alt_nonzero = [alt_[x] for x,y in enumerate(alt_ad) if alt_ad[x]!=0]
    alt_ad_nonzero = [alt_ad[x] for x,y in enumerate(alt_ad) if alt_ad[x]!=0]
    alt_nonzero_SNP = [x for x in alt_nonzero if len(x) == len(ref_)]
    alt_ad_nonzero_SNP = [alt_ad_nonzero[x] for x,y in enumerate(alt_ad_nonzero) if alt_nonzero[x] in alt_nonzero_SNP]
    alt_nonzero_deletion = [x for x in alt_nonzero if len(x) != len(ref_)]
    alt_ad_nonzero_deletion = [alt_ad_nonzero[x] for x,y in enumerate(alt_ad_nonzero) if alt_nonzero[x] in alt_nonzero_deletion]
    deletion_len = [len(ref_)-len(x) for x in alt_nonzero_deletion]
    deletion_str = [''.join(list(repeat('-',x))) for x in deletion_len]
    deletion_str_out = [alt_nonzero_deletion[x]+deletion_str[x] for x,y in enumerate(alt_nonzero_deletion)]
    if ref_ad != 0:
        samplesad_nonzero = [ref_ad]+alt_ad_nonzero_SNP+alt_ad_nonzero_deletion
        ref_alt_nonzero = [ref_]+alt_nonzero_SNP+deletion_str_out
    if ref_ad == 0:
        samplesad_nonzero = alt_ad_nonzero_SNP+alt_ad_nonzero_deletion
        ref_alt_nonzero = alt_nonzero_SNP+deletion_str_out
    if len(samplesad_nonzero) == 0:
        return [ref_out, alt_out, iupac_out]
    admax = np.max([int(x) for x in samplesad_nonzero])
    admin = np.min([int(x) for x in samplesad_nonzero])
    admaxpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admax]
    adminpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]!=admax]
    if type_ == 'refaltN':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>1:
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refaltsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = random.sample([x for x in ref_alt_nonzero if x != ref_out],1)[0]
            iupac_out = ''.join([getIUPAC(x) for x in [''.join(sorted(''.join(list(set(''.join([y[i] for x,y in enumerate(ref_alt_nonzero)])))))) for i in range(0,np.max([len(x) for x in ref_alt_]))]])
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refmajorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[admaxpos[0]]
            alt_out = ref_alt_nonzero[adminpos[0]]
            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) != 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample(min_nonzero,1)[0]
            iupac_out = ''.join([getIUPAC(x) for x in [''.join(sorted(''.join(list(set(''.join([y[i] for x,y in enumerate(ref_alt_nonzero)])))))) for i in range(0,np.max([len(x) for x in ref_alt_]))]])
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) == 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample([x for x in max_nonzero if x != ref_out],1)[0]
            iupac_out = ''.join([getIUPAC(x) for x in [''.join(sorted(''.join(list(set(''.join([y[i] for x,y in enumerate(ref_alt_nonzero)])))))) for i in range(0,np.max([len(x) for x in ref_alt_]))]])
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refminorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[adminpos[0]]
            alt_out = ref_alt_nonzero[admaxpos[0]]
            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) != 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample(min_nonzero,1)[0]
            iupac_out = ''.join([getIUPAC(x) for x in [''.join(sorted(''.join(list(set(''.join([y[i] for x,y in enumerate(ref_alt_nonzero)])))))) for i in range(0,np.max([len(x) for x in ref_alt_]))]])
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) == 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            ref_out = random.sample(min_nonzero,1)[0]
            alt_out = random.sample([x for x in min_nonzero if x != ref_out],1)[0]
            iupac_out = ''.join([getIUPAC(x) for x in [''.join(sorted(''.join(list(set(''.join([y[i] for x,y in enumerate(ref_alt_nonzero)])))))) for i in range(0,np.max([len(x) for x in ref_alt_]))]])
            return [ref_out, alt_out, iupac_out]
    if type_ == 'iupac':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2:
            iupac_out = ''.join([getIUPAC(x) for x in [''.join(sorted(''.join(list(set(''.join([y[i] for x,y in enumerate(ref_alt_nonzero)])))))) for i in range(0,np.max([len(x) for x in ref_alt_]))]])
            return [ref_out, alt_out, iupac_out]


def _complexIS(self, sample_pos, type_):
    """ only set SNP - discard insertion allele """
    if sample_pos not in self.SAMPLES.keys():
        return
    samplesad_ = self.SAMPLES[sample_pos]['AD']
    ref_alt_ = self.get_ref_alt()
    ref_ = ref_alt_[0]
    alt_ = ref_alt_[1:]
    ref_out = 'N'
    alt_out = 'N'
    iupac_out = 'N'
    samplesad_ = [int(x) for x in samplesad_]
    ref_alt_nonzero = [ref_alt_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    samplesad_nonzero = [samplesad_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    ref_ad = samplesad_[0]
    alt_ad = samplesad_[1:]
    alt_nonzero = [alt_[x] for x,y in enumerate(alt_ad) if alt_ad[x]!=0]
    alt_ad_nonzero = [alt_ad[x] for x,y in enumerate(alt_ad) if alt_ad[x]!=0]
    alt_nonzero_SNP = [x for x in alt_nonzero if len(x) == len(ref_)]
    alt_ad_nonzero_SNP = [alt_ad_nonzero[x] for x,y in enumerate(alt_ad_nonzero) if alt_nonzero[x] in alt_nonzero_SNP]
    if ref_ad != 0:
        samplesad_nonzero = [ref_ad]+alt_ad_nonzero_SNP
        ref_alt_nonzero = [ref_]+alt_nonzero_SNP
    if ref_ad == 0:
        samplesad_nonzero = alt_ad_nonzero_SNP
        ref_alt_nonzero = alt_nonzero_SNP
    if len(samplesad_nonzero) == 0:
        return [ref_out, alt_out, iupac_out]
    admax = np.max([int(x) for x in samplesad_nonzero])
    admin = np.min([int(x) for x in samplesad_nonzero])
    admaxpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admax]
    adminpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]!=admax]
    if type_ == 'refaltN':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>1:
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refaltsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = random.sample([x for x in ref_alt_nonzero if x != ref_out],1)[0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refmajorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[admaxpos[0]]
            alt_out = ref_alt_nonzero[adminpos[0]]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) != 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample(min_nonzero,1)[0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) == 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample([x for x in max_nonzero if x != ref_out],1)[0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refminorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[adminpos[0]]
            alt_out = ref_alt_nonzero[admaxpos[0]]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) != 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample(min_nonzero,1)[0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) == 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            ref_out = random.sample(min_nonzero,1)[0]
            alt_out = random.sample([x for x in min_nonzero if x != ref_out],1)[0]
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]
    if type_ == 'iupac':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>1:
            iupac_out = getIUPAC(''.join(ref_alt_nonzero))
            return [ref_out, alt_out, iupac_out]



def _complexDI(self, sample_pos, type_):
    """ only set deletion - discard insertion allele """
    if sample_pos not in self.SAMPLES.keys():
        return
    samplesad_ = self.SAMPLES[sample_pos]['AD']
    ref_alt_ = self.get_ref_alt()
    ref_ = ref_alt_[0]
    alt_ = ref_alt_[1:]
    alt_min = np.min([len(x) for x in alt_])
    ref_out = ref_[:alt_min]+''.join(list(repeat('N',len(ref_)-alt_min)))
    alt_out = ref_[:alt_min]+''.join(list(repeat('N',len(ref_)-alt_min)))
    iupac_out = ref_[:alt_min]+''.join(list(repeat('N',len(ref_)-alt_min)))
    samplesad_ = [int(x) for x in samplesad_]
    ref_alt_nonzero = [ref_alt_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    samplesad_nonzero = [samplesad_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
    ref_ad = samplesad_[0]
    alt_ad = samplesad_[1:]
    alt_nonzero = [alt_[x] for x,y in enumerate(alt_ad) if alt_ad[x]!=0]
    alt_ad_nonzero = [alt_ad[x] for x,y in enumerate(alt_ad) if alt_ad[x]!=0]
    alt_nonzero_deletion = [x for x in alt_nonzero if len(x) < len(ref_)]
    alt_ad_nonzero_deletion = [alt_ad_nonzero[x] for x,y in enumerate(alt_ad_nonzero) if alt_nonzero[x] in alt_nonzero_deletion]
    deletion_len = [len(ref_)-len(x) for x in alt_nonzero_deletion]
    deletion_str = [''.join(list(repeat('-',x))) for x in deletion_len]
    deletion_str_out = [alt_nonzero_deletion[x]+deletion_str[x] for x,y in enumerate(alt_nonzero_deletion)]
    if ref_ad != 0:
        samplesad_nonzero = [ref_ad]+alt_ad_nonzero_deletion
        ref_alt_nonzero = [ref_]+deletion_str_out
    if ref_ad == 0:
        samplesad_nonzero = alt_ad_nonzero_deletion
        ref_alt_nonzero = deletion_str_out
    if len(samplesad_nonzero) == 0:
        return [ref_out, alt_out, iupac_out]
    admax = np.max([int(x) for x in samplesad_nonzero])
    admin = np.min([int(x) for x in samplesad_nonzero])
    admaxpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admax]
    adminpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]!=admax]
    if type_ == 'refaltN':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>1:
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refaltsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = random.sample([x for x in ref_alt_nonzero if x != ref_out],1)[0]
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refmajorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[admaxpos[0]]
            alt_out = ref_alt_nonzero[adminpos[0]]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) != 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample(min_nonzero,1)[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) == 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample([x for x in max_nonzero if x != ref_out],1)[0]
            return [ref_out, alt_out, iupac_out]
    if type_ == 'refminorsample':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
            ref_out = ref_alt_nonzero[adminpos[0]]
            alt_out = ref_alt_nonzero[admaxpos[0]]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
            ref_out = random.sample(ref_alt_nonzero,1)[0]
            alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) != 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            ref_out = random.sample(max_nonzero,1)[0]
            alt_out = random.sample(min_nonzero,1)[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) == 1:
            max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
            min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
            ref_out = random.sample(min_nonzero,1)[0]
            alt_out = random.sample([x for x in min_nonzero if x != ref_out],1)[0]
            return [ref_out, alt_out, iupac_out]
    if type_ == 'iupac':
        if len(ref_alt_nonzero)==1:
            ref_out = ref_alt_nonzero[0]
            alt_out = ref_alt_nonzero[0]
            iupac_out = ref_alt_nonzero[0]
            return [ref_out, alt_out, iupac_out]
        if len(ref_alt_nonzero)>1:
            return [ref_out, alt_out, iupac_out]


#TODO
#def getAlleles_complexDIS():
#    if type_ == 'refaltN':
#    if type_ == 'refaltsample':
#    if type_ == 'refmajorsample':
#    if type_ == 'refminorsample':
#    if type_ == 'iupac':


#vcf2fasta specific functions
def get_gtf_dict(gtf):
    gtfDICT={}
    counter=0
    for lines in open(gtf,'rU'):
        if lines[0] == "#":
            continue
        counter+=1
        chr_, source_, type_, start_, end_, score_, strand_, phase_, attr_ = lines.strip().split('\t')
        gene_id_ = ''
        transcript_id_ = ''
        gene_id_ = attr_.split('gene_id ')[1].split(';')[0].replace('"','')
        if type_ != 'gene':
            transcript_id_ = attr_.split('transcript_id ')[1].split(';')[0].replace('"','')
        if chr_ in gtfDICT:
            if type_ == 'gene' and gene_id_ in gtfDICT[chr_]['gene']:
                gtfDICT[chr_]['gene'][gene_id_]['gene'].append([start_,end_,strand_])
            if type_ == 'gene' and gene_id_ not in gtfDICT[chr_]['gene']:
                gtfDICT[chr_]['gene'][gene_id_]={'gene':[],'transcripts':{}}
                gtfDICT[chr_]['gene'][gene_id_]['gene'].append([start_,end_,strand_])
                continue
            if type_ != 'gene' and gene_id_ in gtfDICT[chr_]['gene']:
                if transcript_id_ in gtfDICT[chr_]['gene'][gene_id_]['transcripts']:
                    gtfDICT[chr_]['gene'][gene_id_]['transcripts'][transcript_id_][type_].append([start_,end_,strand_])
                    continue
                if transcript_id_ not in gtfDICT[chr_]['gene'][gene_id_]['transcripts']:
                    gtfDICT[chr_]['gene'][gene_id_]['transcripts'][transcript_id_]={'transcript':[], 'start_codon':[], 'five_prime_utr':[], 'exon':[], 'CDS':[], 'stop_codon':[], 'three_prime_utr':[], 'Selenocysteine':[]}
                    gtfDICT[chr_]['gene'][gene_id_]['transcripts'][transcript_id_][type_].append([start_,end_,strand_])
            if type_ != 'gene' and gene_id_ not in gtfDICT[chr_]['gene']:
                gtfDICT[chr_]['gene'][gene_id_]={'gene':[],'transcripts':{}}
                gtfDICT[chr_]['gene'][gene_id_]['transcripts'][transcript_id_]={'transcript':[], 'start_codon':[], 'five_prime_utr':[], 'exon':[], 'CDS':[], 'stop_codon':[], 'three_prime_utr':[], 'Selenocysteine':[]}
                gtfDICT[chr_]['gene'][gene_id_]['transcripts'][transcript_id_][type_].append([start_,end_,strand_])
                continue
        if chr_ not in gtfDICT:
            gtfDICT[chr_]={'gene':{}}
            gtfDICT[chr_]['gene'][gene_id_]={'gene':[],'transcripts':{}}
            if type_ == 'gene':
                gtfDICT[chr_]['gene'][gene_id_]['gene'].append([start_,end_,strand_])
                continue
            if type_ != 'gene':
                gtfDICT[chr_]['gene'][gene_id_]['transcripts'][transcript_id_]={'transcript':[], 'start_codon':[], 'five_prime_utr':[], 'exon':[], 'CDS':[], 'stop_codon':[], 'three_prime_utr':[], 'Selenocysteine':[]}
                gtfDICT[chr_]['gene'][gene_id_]['transcripts'][transcript_id_][type_].append([start_,end_,strand_])
                continue
    return gtfDICT


def write_feature2fasta(gtfDICT, id_, feature, chr_, referenceDICT):
    if feature == 'chromosome':
        tmp_seq = referenceDICT[chr_][:]
        tmp_seq.id = id_+'_'+tmp_seq.id
        tmp_seq.name = id_+'_'+tmp_seq.name
        tmp_seq.description = ''
        yield tmp_seq
    if feature == 'gene':
        for gene in gtfDICT[chr_]['gene'].keys():
            gene_start = int(gtfDICT[chr_]['gene'][gene]['gene'][0][0])
            gene_end = int(gtfDICT[chr_]['gene'][gene]['gene'][0][1])
            gene_strand = gtfDICT[chr_]['gene'][gene]['gene'][0][2]
            tmp_seq_id = id_+'_'+gene+'_'+chr_+'_'+str(gene_start)+'_'+str(gene_end)+'_'+gene_strand
            tmp_seq = referenceDICT[chr_][gene_start-1:gene_end][:]
            tmp_seq.id = tmp_seq_id
            tmp_seq.name = tmp_seq_id
            tmp_seq.description = ''
            if gene_strand == '-':
                tmp_seq = tmp_seq.reverse_complement()
                tmp_seq.id = tmp_seq_id
                tmp_seq.name = tmp_seq_id
                tmp_seq.description = ''
            yield tmp_seq
    if feature == 'transcript:transcript':
        for gene in gtfDICT[chr_]['gene'].keys():
            for transcript in gtfDICT[chr_]['gene'][gene]['transcripts'].keys():
                transcript_start = int(gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['transcript'][0][0])
                transcript_end = int(gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['transcript'][0][1])
                transcript_strand = gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['transcript'][0][2]
                tmp_seq_id = id_+'_'+transcript+'_'+chr_+'_'+str(transcript_start)+'_'+str(transcript_end)+'_'+transcript_strand
                tmp_seq = referenceDICT[chr_][transcript_start-1:transcript_end][:]
                tmp_seq.id = tmp_seq_id
                tmp_seq.name = tmp_seq_id
                tmp_seq.description = ''
                if gene_strand == '-':
                    tmp_seq = tmp_seq.reverse_complement()
                    tmp_seq.id = tmp_seq_id
                    tmp_seq.name = tmp_seq_id
                    tmp_seq.description = ''
                yield tmp_seq
    if feature == 'transcript:CDS':
        for gene in gtfDICT[chr_]['gene'].keys():
            for transcript in gtfDICT[chr_]['gene'][gene]['transcripts'].keys():
                transcript_start = int(gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['transcript'][0][0])
                transcript_end = int(gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['transcript'][0][1])
                transcript_strand = gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['transcript'][0][2]
                tmp_CDS = [x[:] for x in gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['CDS']]
                tmp_CDS = tmp_CDS + gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['stop_codon']
                tmp_CDS_sorted = sorted(tmp_CDS, key=lambda x:int(x[0]), reverse = False)
                tmp_CDS_id = id_+'_'+transcript+':CDS'+'_'+chr_
                if len(tmp_CDS_sorted)!=0:
                    tmp_CDS_out = ''
                    for CDS_ in tmp_CDS_sorted:
                        tmp_CDS_start = int(CDS_[0])
                        tmp_CDS_end = int(CDS_[1])
                        tmp_CDS_strand = CDS_[2]
                        tmp_CDS_out += str(referenceDICT[chr_][tmp_CDS_start-1:tmp_CDS_end].seq)
                    tmp_CDS_seq = SeqRecord.SeqRecord(Seq(tmp_CDS_out))
                    tmp_CDS_seq.id = tmp_CDS_id
                    tmp_CDS_seq.name = tmp_CDS_id
                    tmp_CDS_seq.description = ''
                    if tmp_CDS_strand == '-':
                        tmp_CDS_seq = tmp_CDS_seq.reverse_complement()
                        tmp_CDS_seq.id = tmp_CDS_id
                        tmp_CDS_seq.name = tmp_CDS_id
                        tmp_CDS_seq.description = ''
                    yield tmp_CDS_seq
    if feature == 'transcript:exon':
        for gene in gtfDICT[chr_]['gene'].keys():
            for transcript in gtfDICT[chr_]['gene'][gene]['transcripts'].keys():
                transcript_start = int(gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['transcript'][0][0])
                transcript_end = int(gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['transcript'][0][1])
                transcript_strand = gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['transcript'][0][2]
                tmp_exon = [x[:] for x in gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['exon']]
                tmp_exon_sorted = [x[:] for x in sorted(tmp_exon, key=lambda x:int(x[0]), reverse = False)]
                tmp_exon_id = id_+'_'+transcript+':exon'+'_'+chr_
                if len(tmp_exon_sorted)!=0:
                    tmp_exon_out = ''
                    for exon_ in tmp_exon_sorted:
                        tmp_exon_start = int(exon_[0])
                        tmp_exon_end = int(exon_[1])
                        tmp_exon_strand = exon_[2]
                        tmp_exon_out += str(referenceDICT[chr_][tmp_exon_start-1:tmp_exon_end].seq)
                    tmp_exon_seq = SeqRecord.SeqRecord(Seq(tmp_exon_out))
                    tmp_exon_seq.id = tmp_exon_id
                    tmp_exon_seq.name = tmp_exon_id
                    tmp_exon_seq.description = ''
                    if tmp_exon_strand == '-':
                        tmp_exon_seq = tmp_exon_seq.reverse_complement()
                        tmp_exon_seq.id = tmp_exon_id
                        tmp_exon_seq.name = tmp_exon_id
                        tmp_exon_seq.description = ''
                    yield tmp_exon_seq
    if feature == 'transcript:intron':
        for gene in gtfDICT[chr_]['gene'].keys():
            for transcript in gtfDICT[chr_]['gene'][gene]['transcripts'].keys():
                transcript_start = int(gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['transcript'][0][0])
                transcript_end = int(gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['transcript'][0][1])
                transcript_strand = gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['transcript'][0][2]
                tmp_exon = [x[:] for x in gtfDICT[chr_]['gene'][gene]['transcripts'][transcript]['exon']]
                if len(tmp_exon)>1:
                    tmp_intron = []
                    if transcript_strand == '+':
                        tmp_exon_sorted = [x[:] for x in sorted(tmp_exon, key=lambda x:int(x[0]), reverse = False)]
                        tmp_intron = [x[:] for x in tmp_exon_sorted[:len(tmp_exon_sorted)-1]]
                        for i in range(0,len(tmp_intron)):
                            tmp_intron[i][0]=str(int(tmp_exon_sorted[i][1])+1)
                            tmp_intron[i][1]=str(int(tmp_exon_sorted[i+1][0])-1)
                    if transcript_strand == '-':
                        tmp_exon_sorted = [x[:] for x in sorted(tmp_exon, key=lambda x:int(x[0]), reverse = True)]
                        tmp_intron = [x[:] for x in tmp_exon_sorted[1:]]
                        for i in range(0,len(tmp_intron)):
                            tmp_intron[i][0]=str(int(tmp_exon_sorted[i+1][1])+1)
                            tmp_intron[i][1]=str(int(tmp_exon_sorted[i][0])-1)
                    tmp_intron_id = id_+'_'+transcript+':intron'+'_'+chr_
                    tmp_intron_out = ''
                    tmp_intron_sorted = [x[:] for x in sorted(tmp_intron, key=lambda x:int(x[0]), reverse = False)]
                    for intron_ in tmp_intron_sorted:
                        tmp_intron_start = int(intron_[0])
                        tmp_intron_end = int(intron_[1])
                        tmp_intron_strand = intron_[2]
                        tmp_intron_out += str(referenceDICT[chr_][tmp_intron_start-1:tmp_intron_end].seq)
                    tmp_intron_seq = SeqRecord.SeqRecord(Seq(tmp_intron_out))
                    tmp_intron_seq.id = tmp_intron_id
                    tmp_intron_seq.name = tmp_intron_id
                    tmp_intron_seq.description = ''
                    if tmp_intron_strand == '-':
                        tmp_intron_seq = tmp_intron_seq.reverse_complement()
                        tmp_intron_seq.id = tmp_intron_id
                        tmp_intron_seq.name = tmp_intron_id
                        tmp_intron_seq.description = ''
                    yield tmp_intron_seq


#mvcfPL2GP
def mvcfPL2GP(args, parser):
    #MVCFPL2GP RECORD PARSER
    def parse_record(self, argsDICT, polytypeDICT, filterDICT, tsvouthandle):
        """ Parse each record and writes converted PL into outhandle if variant is kept after filtering """
        REF_ALT = self.get_ref_alt()
        POLYTYPE = self.get_var_type()
        POLYNUMBER = self.get_var_number()
        POLYLEN = self.get_var_len()
        if POLYTYPE in polytypeDICT:
            polytypeDICT[POLYTYPE]+=1
        if POLYTYPE not in polytypeDICT:
            polytypeDICT[POLYTYPE]=1
        if POLYTYPE != 'repSNP' and POLYTYPE != 'biSNP':
            print POLYTYPE
            return
            #sys.exit('\nPOLYTYPE not considered for PL conversion\nPlease reduce to BIALLELIC SNP and MNP')
        #0. REDUCE record TO SAMPLES and ADD mandatory FORMAT TO SAMPLES [GT:AD:DP]:GQ:PL
        self.keep_samples(argsDICT)
        if 'PL' not in self.FORMAT:
            if argsDICT['verbose']:
                print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
            sys.exit('\nPL Flag missing!')
        if 'GQ' not in self.FORMAT:
            for s in argsDICT['samples_pos']:
                self.add_format_on_samples(s, 'GQ', 0)
            self.FORMAT.append('GQ')
        if 'GP' not in self.FORMAT:
            for s in argsDICT['samples_pos']:
                self.add_format_on_samples(s, 'GP', list(np.repeat(float(0),3)))
            self.FORMAT.append('GP')
        #1. FILTER BY filterarg
        if self.is_filtered(argsDICT):
            if argsDICT['verbose']:
                print self
            filterDICT['filterarg']+=1
            return
        #2. FILTER BY minMQ
        MQ = self.get_info('MQ')
        if MQ == ".":
             MQ = 0.0
        if float(MQ) < argsDICT['minMQ']:
            if argsDICT['verbose']:
                print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
            filterDICT['MQ']+=1
            return
        #3. GET DP
        #fill empty AD
        self.fill_empty_ad()
        SAMPLESAD = self.get_samples_ad()
        SAMPLESADLEN = self.get_samples_ad_len()
        if not all([x==POLYNUMBER for x in SAMPLESADLEN]):
            if argsDICT['verbose']:
                print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
            sys.exit('\nSAMPLES AD length differs from total POLYNUMBER')
        SAMPLESGT = self.get_samples_gt()
        SAMPLESGTTYPE = self.get_gt_type()
        SAMPLESGTNAPOS = self.get_gt_na_pos()
        SAMPLESGQ = self.get_samples_gq()
        SAMPLESAD = self.get_samples_ad()
        #set dp based on altered ad
        self.set_dp_by_ad()
        SAMPLESDP = self.get_samples_dp()
        #4. KEEP OR RESET GP BY MISSING GT OPTION
        self.fill_empty_pl()
        SAMPLESPL = self.get_samples_pl()
        self.set_samples_gp_by_gt(argsDICT)
        SAMPLESGP = self.get_samples_gp()
        #5. KEEP OR RESET GP USING THE DP OPTION AND GQ OPTION
        self.set_samples_gp_by_dp(argsDICT)
        self.set_samples_gp_by_gq(argsDICT)
        #write to tsv
        TSVLINE = '\t'.join([self.CHROM+'_'+self.POS,self.REF[0],self.ALT[0],'\t'.join(['\t'.join([str(np.round(y,5)) for y in x]) for x in self.get_samples_gp()])])+'\n'
        tsvouthandle.write(TSVLINE)
        filterDICT['passed']+=1
    #IVCF
    if args.ivcf is None:
        parser.print_help()
        sys.exit('\nPlease specify input vcf file')
    args.ivcf = os.path.abspath(args.ivcf)
    #TABIX
    if args.tabix:
        if args.R is None:
            parser.print_help()
            sys.exit('\nPlease specify reference fasta file, since [tabix: "true"]')
        args.R = os.path.abspath(args.R)
    #OUTPUT
    if args.o is None:
        parser.print_help()
        sys.exit('\nPlease specify output prefix')
    args.o = os.path.abspath(args.o)
    #CHROMOSOMES
    if args.chr is None:
        sys.exit('\nPlease specify chromosomes to be parsed as a comma-seperated list')
    args.chr = args.chr.split(',')
    #SAMPLES
    if args.samples is None:
        parser.print_help()
        sys.exit('\nPlease specify sample names to be parsed as comma-seperated list\ngrep "#CHROM" vcf')
    args.samples = args.samples.split(',')
    #FILTER
    args.filterarg = args.filterarg.split(',')
    #MQ OPTION
    args.missingGT = get_missing_type2(args.missingGT)
    args.ps = int(float(args.ps))
    print(args)
    #GET ARGS DICT
    argsDICT = vars(args)
    argsDICT['args'] = ['"'+str(args).replace("'","")+'"']
    #STATIC DICT
    polytypeDICT = {}
    filterDICT = {'passed':0, 'filterarg':0, 'MQ':0, 'NCF':0, 'CDP':0}
    argsDICT['samples_pos'] = get_sample_pos_from_header(argsDICT)
    if argsDICT['tabix']:
        with open(argsDICT['o']+'.tsv','w') as tsvouthandle:
            tsvouthandle.write('marker\tallele1\tallele2\t'+'\t'.join(['\t'.join(list(np.repeat(x,3))) for x in argsDICT['samples']])+'\n')
            reference_length = get_reference_len(argsDICT)
            tb = tabix.open(argsDICT['ivcf'])
            counter = 0
            for tmp_chr in argsDICT['chr']:
                tmp_chr_length = reference_length[tmp_chr]
                records = tb.query(tmp_chr,0,tmp_chr_length)
                for lines in records:
                    record = vcfRecord(lines, argsDICT['CHROM'], argsDICT['POS'], argsDICT['ID'], argsDICT['REF'], argsDICT['ALT'], argsDICT['QUAL'], argsDICT['FILTER'], argsDICT['INFO'], argsDICT['FORMAT'], argsDICT['samples_pos'], argsDICT['tabix'])
                    parse_record(record, argsDICT, polytypeDICT, filterDICT, tsvouthandle)
                    if not argsDICT['po']:
                        counter += 1
                        if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) != 0:
                            sys.stderr.write('.')
                        if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) == 0:
                            sys.stderr.write('o')
    if not argsDICT['tabix']:
        with open(argsDICT['o']+'.tsv','w') as tsvouthandle:
            tsvouthandle.write('marker\tallele1\tallele2\t'+'\t'.join(['\t'.join(list(np.repeat(x,3))) for x in argsDICT['samples']])+'\n')
            counter = 0
            with open(argsDICT['ivcf'],'rU') as vcfhandle:
                for lines in vcfhandle:
                    if lines[0]=='#':
                        continue
                    record = vcfRecord(lines, argsDICT['CHROM'], argsDICT['POS'], argsDICT['ID'], argsDICT['REF'], argsDICT['ALT'], argsDICT['QUAL'], argsDICT['FILTER'], argsDICT['INFO'], argsDICT['FORMAT'], argsDICT['samples_pos'], argsDICT['tabix'])
                    #continue if not in chromosome_ids
                    if record.CHROM not in argsDICT['chr']:
                        continue
                    #else
                    parse_record(record, argsDICT, polytypeDICT, filterDICT, tsvouthandle)
                    if not argsDICT['po']:
                        counter += 1
                        if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) != 0:
                            sys.stderr.write('.')
                        if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) == 0:
                            sys.stderr.write('o')
    sys.stderr.write('\n')
    print 'variant types:'
    print polytypeDICT
    print 'filtered variants by:'
    print filterDICT


#mvcf2consensus
def mvcf2consensus(args, parser):
    #MVCF2CONSENSUS RECORD PARSER
    def parse_record(self, argsDICT, polytypeDICT, filterDICT, tsvouthandle, vcfouthandle):
        """ Parse each record and writes into outhandle if variant is kept after filtering """
        REF_ALT = self.get_ref_alt()
        POLYTYPE = self.get_var_type()
        POLYNUMBER = self.get_var_number()
        POLYLEN = self.get_var_len()
        if POLYTYPE in polytypeDICT:
            polytypeDICT[POLYTYPE]+=1
        if POLYTYPE not in polytypeDICT:
            polytypeDICT[POLYTYPE]=1
        #0. REDUCE record TO SAMPLES and ADD mandatory FORMAT TO SAMPLES [GT:AD:DP]:GQ:PL
        self.keep_samples(argsDICT)
        if 'GQ' not in self.FORMAT:
            for s in argsDICT['samples_pos']:
                self.add_format_on_samples(s, 'GQ', 0)
            self.FORMAT.append('GQ')
        if 'PL' not in self.FORMAT:
            for s in argsDICT['samples_pos']:
                self.add_format_on_samples(s, 'PL', list(np.repeat(0,nCk(self.get_var_number()+1,2))))
            self.FORMAT.append('PL')
        #1. FILTER BY filterarg
        if self.is_filtered(argsDICT):
            if argsDICT['verbose']:
                print self
            filterDICT['filterarg']+=1
            return
        #2. FILTER BY minMQ
        MQ = self.get_info('MQ')
        if MQ == ".":
             MQ = 0.0
        if float(MQ) < argsDICT['minMQ']:
            if argsDICT['verbose']:
                print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
            filterDICT['MQ']+=1
            return
        #3. GET AD
        #fill empty AD
        self.fill_empty_ad()
        SAMPLESAD = self.get_samples_ad()
        SAMPLESADLEN = self.get_samples_ad_len()
        if not all([x==POLYNUMBER for x in SAMPLESADLEN]):
            if argsDICT['verbose']:
                print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
            sys.exit('\nSAMPLES AD length differs from total POLYNUMBER')
        #4. SET AD BY MISSING GT OPTION
        SAMPLESGT = self.get_samples_gt()
        SAMPLESGQ = self.get_samples_gq()
        self.set_samples_ad_by_gt(argsDICT)
        SAMPLESAD = self.get_samples_ad()
        #set dp based on altered ad
        self.set_dp_by_ad()
        #5. KEEP OR RESET GT USING THE DP OPTION AND AD INFORMATION
        SAMPLESDP = self.get_samples_dp()
        self.set_samples_gt_by_ad(argsDICT)
        #6. FILTER BY NotCalledFraction
        SAMPLESGTTYPE = self.get_gt_type()
        SAMPLESGTNAPOS = self.get_gt_na_pos()
        NOTCALLED = len(SAMPLESGTNAPOS)
        NOTCALLEDOUT = str(NOTCALLED)+'/'+str(len(self.SAMPLES))
        self.add_info(';NCFnumber='+str(NOTCALLEDOUT))
        NOTCALLEDFRACTION = float(NOTCALLED)/float(len(self.SAMPLES))
        self.add_info(';NCFfraction='+str(NOTCALLEDFRACTION))
        if NOTCALLEDFRACTION >= argsDICT['ncf']:
            if argsDICT['verbose']:
                print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
            filterDICT['NCF']+=1
            return
        #7. GET CONSENSUS AND EVALUATE CDP OPTION
        #fill empty PL
        self.fill_empty_pl()
        CONSENSUS_CALLDATA = self.get_consensus_calldata(argsDICT)
        if CONSENSUS_CALLDATA['DP'] < argsDICT['cdp']:
            if argsDICT['verbose']:
                print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
            filterDICT['CDP']+=1
            return
        if argsDICT['add']:
            self.SAMPLES[argsDICT['id']] = CONSENSUS_CALLDATA
        if not argsDICT['add']:
            self.SAMPLES = collections.OrderedDict()
            self.SAMPLES[argsDICT['id']] = CONSENSUS_CALLDATA
        VCFLINE = '\t'.join([self.CHROM, self.POS, self.ID, self.REF[0], ','.join(self.ALT), self.QUAL, self.FILTER, self.INFO, ':'.join(self.FORMAT)])
        SAMPLES_LINE = []
        for s in self.SAMPLES.keys():
            SAMPLE_LINE = []
            for k in self.SAMPLES[s].keys():
                if isinstance(self.SAMPLES[s][k], list):
                    SAMPLE_LINE.append(','.join([str(x) for x in self.SAMPLES[s][k]]))
                elif self.SAMPLES[s][k] is None:
                    SAMPLE_LINE.append(".")
                else:
                    SAMPLE_LINE.append(str(self.SAMPLES[s][k]))
            SAMPLES_LINE.append(':'.join(SAMPLE_LINE))
        vcfouthandle.write(VCFLINE+'\t'+'\t'.join(SAMPLES_LINE)+'\n')
        CONSENSUSOUT = SAMPLES_LINE[-1]
        if POLYTYPE != 'repSNP' and POLYTYPE != 'biSNP' and POLYTYPE != 'muSNP':
            print POLYTYPE
            sys.exit('\nPOLYTYPE not considered for CONSENSUS\nPlease reduce to SNP and MNP')
        if POLYTYPE == 'repSNP' or POLYTYPE == 'biSNP' or POLYTYPE == 'muSNP':
            CONSENSUSMAJ, CONSENSUSMIN, CONSENSUSIUPAC = self.get_alleles(POLYTYPE, 'refmajorsample')
            if argsDICT['add']:
                CONSENSUSMAJ = CONSENSUSMAJ[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
                CONSENSUSMIN = CONSENSUSMIN[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
                CONSENSUSIUPAC = CONSENSUSIUPAC[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
            if not argsDICT['add']:
                CONSENSUSMAJ = CONSENSUSMAJ[0]
                CONSENSUSMIN = CONSENSUSMIN[0]
                CONSENSUSIUPAC = CONSENSUSIUPAC[0]
        #write to tsv
        TSVLINE = '\t'.join([self.CHROM,str(self.POS),NOTCALLEDOUT,CONSENSUSOUT,','.join(REF_ALT),CONSENSUSMAJ,CONSENSUSMIN,CONSENSUSIUPAC])+'\n'
        tsvouthandle.write(TSVLINE)
        filterDICT['passed']+=1
    #IVCF
    if args.ivcf is None:
        parser.print_help()
        sys.exit('\nPlease specify input vcf file')
    args.ivcf = os.path.abspath(args.ivcf)
    #TABIX
    if args.tabix:
        if args.R is None:
            parser.print_help()
            sys.exit('\nPlease specify reference fasta file, since [tabix: "true"]')
        args.R = os.path.abspath(args.R)
    #OUTPUT
    if args.o is None:
        parser.print_help()
        sys.exit('\nPlease specify output prefix')
    args.o = os.path.abspath(args.o)
    #CHROMOSOMES
    if args.chr is None:
        sys.exit('\nPlease specify chromosomes to be parsed as a comma-seperated list')
    args.chr = args.chr.split(',')
    #SAMPLES
    if args.samples is None:
        parser.print_help()
        sys.exit('\nPlease specify sample names to be parsed as comma-seperated list\ngrep "#CHROM" vcf')
    args.samples = args.samples.split(',')
    #SAMPLES ID
    if args.id is None:
        parser.print_help()
        sys.exit('\nPlease specify CONSENSUS name')
    #FILTER
    args.filterarg = args.filterarg.split(',')
    #MQ OPTION
    args.missingGT = get_missing_type(args.missingGT)
    args.ps = int(float(args.ps))
    print(args)
    #GET ARGS DICT
    argsDICT = vars(args)
    argsDICT['args'] = ['"'+str(args).replace("'","")+'"']
    #STATIC DICT
    polytypeDICT = {}
    filterDICT = {'passed':0, 'filterarg':0, 'MQ':0, 'NCF':0, 'CDP':0}
    argsDICT['samples_pos'] = get_sample_pos_from_header(argsDICT)
    if argsDICT['tabix']:
        with open(argsDICT['o']+'.tsv','w') as tsvouthandle:
            with open(argsDICT['o']+'.vcf','w') as vcfouthandle:
                reference_length = get_reference_len(argsDICT)
                header = get_header_from_gz(argsDICT)
                tb = tabix.open(argsDICT['ivcf'])
                counter = 0
                if not argsDICT['H']:
                    for h in header:
                        vcfouthandle.write(h)
                for tmp_chr in argsDICT['chr']:
                    tmp_chr_length = reference_length[tmp_chr]
                    records = tb.query(tmp_chr,0,tmp_chr_length)
                    for lines in records:
                        record = vcfRecord(lines, argsDICT['CHROM'], argsDICT['POS'], argsDICT['ID'], argsDICT['REF'], argsDICT['ALT'], argsDICT['QUAL'], argsDICT['FILTER'], argsDICT['INFO'], argsDICT['FORMAT'], argsDICT['samples_pos'], argsDICT['tabix'])
                        parse_record(record, argsDICT, polytypeDICT, filterDICT, tsvouthandle, vcfouthandle)
                        if not argsDICT['po']:
                            counter += 1
                            if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) != 0:
                                sys.stderr.write('.')
                            if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) == 0:
                                sys.stderr.write('o')
    if not argsDICT['tabix']:
        with open(argsDICT['o']+'.tsv','w') as tsvouthandle:
            with open(argsDICT['o']+'.vcf','w') as vcfouthandle:
                header = get_header_from_vcf(argsDICT)
                counter = 0
                if not argsDICT['H']:
                    for h in header:
                        vcfouthandle.write(h)
                with open(argsDICT['ivcf'],'rU') as vcfhandle:
                    for lines in vcfhandle:
                        if lines[0]=='#':
                            continue
                        record = vcfRecord(lines, argsDICT['CHROM'], argsDICT['POS'], argsDICT['ID'], argsDICT['REF'], argsDICT['ALT'], argsDICT['QUAL'], argsDICT['FILTER'], argsDICT['INFO'], argsDICT['FORMAT'], argsDICT['samples_pos'], argsDICT['tabix'])
                        #continue if not in chromosome_ids
                        if record.CHROM not in argsDICT['chr']:
                            continue
                        #else
                        parse_record(record, argsDICT, polytypeDICT, filterDICT, tsvouthandle, vcfouthandle)
                        if not argsDICT['po']:
                            counter += 1
                            if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) != 0:
                                sys.stderr.write('.')
                            if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) == 0:
                                sys.stderr.write('o')
    sys.stderr.write('\n')
    print 'variant types:'
    print polytypeDICT
    print 'filtered variants by:'
    print filterDICT


#vcf2fasta
def vcf2fasta(args, parser):
    #VCF2FASTA RECORD PARSER
    def parse_record(self, argsDICT, polytypeDICT, filterDICT, referenceDICT, mqDICT, dpDICT, gqDICT):
        """ Parse each record and sets allele """
        REF_ALT = self.get_ref_alt()
        POLYTYPE = self.get_var_type()
        POLYNUMBER = self.get_var_number()
        POLYLEN = self.get_var_len()
        if POLYTYPE in polytypeDICT:
            polytypeDICT[POLYTYPE]+=1
        if POLYTYPE not in polytypeDICT:
            polytypeDICT[POLYTYPE]=1
        if POLYTYPE == 'noVAR':
            return
        #0. REDUCE record TO SAMPLES and ADD mandatory FORMAT TO SAMPLES [GT:AD:DP]:GQ:PL
        self.keep_samples(argsDICT)
        if 'GQ' not in self.FORMAT:
            for s in argsDICT['samples_pos']:
                self.add_format_on_samples(s, 'GQ', 0)
            self.FORMAT.append('GQ')
        if 'PL' not in self.FORMAT:
            for s in argsDICT['samples_pos']:
                self.add_format_on_samples(s, 'PL', list(np.repeat(0,nCk(self.get_var_number()+1,2))))
            self.FORMAT.append('PL')
        #1. FILTER BY filterarg
        if self.is_filtered(argsDICT):
            if argsDICT['verbose']:
                print self
            filterDICT['filterarg']+=1
            return
        #2. FILTER BY minMQ
        MQ = self.get_info('MQ')
        if MQ == ".":
             MQ = 0.0
        if float(MQ) < argsDICT['minMQ']:
            if argsDICT['verbose']:
                print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
            filterDICT['MQ']+=1
            mqDICT[self.CHROM][self.POS] = []
            return
        #3. GET AD
        #fill empty AD
        self.fill_empty_ad()
        SAMPLESAD = self.get_samples_ad()
        SAMPLESADLEN = self.get_samples_ad_len()
        if not all([x==POLYNUMBER for x in SAMPLESADLEN]):
            if argsDICT['verbose']:
                print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
            sys.exit('\nSAMPLES AD length differs from total POLYNUMBER')
        #4. FILTER BY minGQ
        SAMPLESGT = self.get_samples_gt()
        SAMPLESGQ = self.get_samples_gq()
        if SAMPLESGQ[0] < argsDICT['minGQ']:
            if argsDICT['verbose']:
                print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
            filterDICT['GQ']+=1
            gqDICT[self.CHROM][self.POS] = []
            return
        #5. SET AD BY MISSING GT OPTION
        self.set_samples_ad_by_gt(argsDICT)
        SAMPLESAD = self.get_samples_ad()
        #set dp based on altered ad
        self.set_dp_by_ad()
        #6. KEEP OR RESET GT USING THE DP OPTION AND AD INFORMATION
        SAMPLESDP = self.get_samples_dp()
        self.set_samples_gt_by_ad(argsDICT)
        #7. FILTER BY minDP
        if SAMPLESDP[0] < argsDICT['minDP']:
            if argsDICT['verbose']:
                print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
            filterDICT['DP']+=1
            dpDICT[self.CHROM][self.POS] = []
            return
        if SAMPLESDP[0] > argsDICT['maxDP']:
            if argsDICT['verbose']:
                print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
            filterDICT['DP']+=1
            dpDICT[self.CHROM][self.POS] = []
            return
        #8. GET ALLELE
        #TODO
        if POLYTYPE == 'complexDIS':
            return
        if POLYTYPE != 'complexDIS':
            REFallele = ''
            ALTallele = ''
            IUPACallele = ''
            REFallele, ALTallele, IUPACallele = self.get_alleles(POLYTYPE, argsDICT['type'])
            if argsDICT['type'] == 'iupac':
                referenceDICT[argsDICT['samples_pos'][0]]['IUPAC'][self.CHROM].seq[int(self.POS)-1:int(self.POS)+len(IUPACallele[0])-1]=IUPACallele[0]
            if argsDICT['type'] != 'iupac':
                referenceDICT[argsDICT['samples_pos'][0]]['IUPAC'][self.CHROM].seq[int(self.POS)-1:int(self.POS)+len(IUPACallele[0])-1]=IUPACallele[0]
                referenceDICT[argsDICT['samples_pos'][0]]['REF'][self.CHROM].seq[int(self.POS)-1:int(self.POS)+len(REFallele[0])-1]=REFallele[0]
                referenceDICT[argsDICT['samples_pos'][0]]['ALT'][self.CHROM].seq[int(self.POS)-1:int(self.POS)+len(REFallele[0])-1]=ALTallele[0]
        filterDICT['passed']+=1
    #MASKING
    def mask_reference_dict(referenceDICT, maskingDICT):
        print 'start setting X based on cov2N option'
        for sample_ in referenceDICT:
            for type_ in referenceDICT[sample_]:
                for chr_ in referenceDICT[sample_][type_]:
                    if chr_ in maskingDICT:
                        for mask_interval in maskingDICT[chr_]:
                            m_start, m_end = mask_interval
                            m_start = int(m_start)-1
                            m_end = int(m_end)
                            referenceDICT[sample_][type_][chr_].seq[int(m_start):int(m_end)]=''.join(list(repeat('X',int(m_end)-int(m_start))))
        print 'finished setting X based on cov2N option'
        return
    #WRITE FASTA W/O GTF
    def write_reference_dict_wo_gtf(argsDICT, referenceDICT):
        #get sequences ONLY for IUPAC
        if argsDICT['type'] == 'iupac':
            print 'start getting iupac sequences'
            if argsDICT['sw']:
                #write sequences for sliding windows
                print 'start writing iupac sequences'
                #make sliding windows
                print 'start preparing sliding windows'
                for chr_ in referenceDICT[argsDICT['samples_pos'][0]]['IUPAC'].keys():
                    tmp_seq = referenceDICT[argsDICT['samples_pos'][0]]['IUPAC'][chr_][:]
                    tmp_seq.id = argsDICT['id'][0]+'_'+tmp_seq.id+' '+argsDICT['type']+' iupac'
                    tmp_seq.name = argsDICT['id'][0]+'_'+tmp_seq.name+' '+argsDICT['type']+' iupac'
                    tmp_seq_windows = sliding_window_steps_generator(argsDICT['s'], argsDICT['j'], 1, len(tmp_seq))
                    for i in range(0, len(tmp_seq_windows[0])):
                        tmp_seq_w = tmp_seq[tmp_seq_windows[0][i]-1:tmp_seq_windows[2][i]]
                        tmp_seq_w.id = tmp_seq_w.id.split()[0]+'_'+str(tmp_seq_windows[0][i])+'_'+str(tmp_seq_windows[2][i])+' '+argsDICT['type']+' iupac'
                        tmp_seq_w.name = tmp_seq_w.name.split()[0]+'_'+str(tmp_seq_windows[0][i])+'_'+str(tmp_seq_windows[2][i])+' '+argsDICT['type']+' iupac'
                        SeqIO.write(tmp_seq_w,open(argsDICT['o']+'.'+tmp_seq_w.id.split()[0]+'.'+argsDICT['type']+'.iupac.fasta','w'),'fasta')
                print 'finished preparing sliding windows'
                print 'finished writing iupac sequences'
            if not argsDICT['sw']:
                #write sequences without sliding windows
                print 'start writing iupac sequences'
                for chr_ in referenceDICT[argsDICT['samples_pos'][0]]['IUPAC'].keys():
                    tmp_seq = referenceDICT[argsDICT['samples_pos'][0]]['IUPAC'][chr_][:]
                    tmp_seq.id = argsDICT['id'][0]+'_'+tmp_seq.id+' '+argsDICT['type']+' iupac'
                    tmp_seq.name = argsDICT['id'][0]+'_'+tmp_seq.name+' '+argsDICT['type']+' iupac'
                    SeqIO.write(tmp_seq,open(argsDICT['o']+'.'+tmp_seq.id.split()[0]+'.'+argsDICT['type']+'.iupac.fasta','w'),'fasta')
                print 'finished writing iupac sequences'
            print 'finished getting iupac sequences'
        #get sequences for IUPAC, REF, ALT
        if argsDICT['type'] != 'iupac':
            print 'start getting ref and alt sequences'
            if argsDICT['sw']:
                #write sequences for sliding windows
                print 'start writing ref and alt sequences'
                #make sliding windows
                print 'start preparing sliding windows'
                for chr_ in referenceDICT[argsDICT['samples_pos'][0]]['REF'].keys():
                    #IUPAC
                    tmp_seq_iupac = referenceDICT[argsDICT['samples_pos'][0]]['IUPAC'][chr_][:]
                    tmp_seq_iupac.id = argsDICT['id'][0]+'_'+tmp_seq_iupac.id+' '+argsDICT['type']+' iupac'
                    tmp_seq_iupac.name = argsDICT['id'][0]+'_'+tmp_seq_iupac.name+' '+argsDICT['type']+' iupac'
                    #REF
                    tmp_seq_ref = referenceDICT[argsDICT['samples_pos'][0]]['REF'][chr_][:]
                    tmp_seq_ref.id = argsDICT['id'][0]+'_'+tmp_seq_ref.id+' '+argsDICT['type']+' ref'
                    tmp_seq_ref.name = argsDICT['id'][0]+'_'+tmp_seq_ref.name+' '+argsDICT['type']+' ref'
                    #ALT
                    tmp_seq_alt = referenceDICT[argsDICT['samples_pos'][0]]['ALT'][chr_][:]
                    tmp_seq_alt.id = argsDICT['id'][0]+'_'+tmp_seq_alt.id+' '+argsDICT['type']+' alt'
                    tmp_seq_alt.name = argsDICT['id'][0]+'_'+tmp_seq_alt.name+' '+argsDICT['type']+' alt'
                    tmp_seq_windows = sliding_window_steps_generator(argsDICT['s'], argsDICT['j'], 1, len(tmp_seq_ref))
                    for i in range(0, len(tmp_seq_windows[0])):
                        #IUPAC
                        tmp_seq_iupac_w = tmp_seq_iupac[tmp_seq_windows[0][i]-1:tmp_seq_windows[2][i]]
                        tmp_seq_iupac_w.id = tmp_seq_iupac_w.id.split()[0]+'_'+str(tmp_seq_windows[0][i])+'_'+str(tmp_seq_windows[2][i])+' '+argsDICT['type']+' iupac'
                        tmp_seq_iupac_w.name = tmp_seq_iupac_w.name.split()[0]+'_'+str(tmp_seq_windows[0][i])+'_'+str(tmp_seq_windows[2][i])+' '+argsDICT['type']+' iupac'
                        SeqIO.write(tmp_seq_iupac_w,open(argsDICT['o']+'.'+tmp_seq_iupac_w.id.split()[0]+'.'+argsDICT['type']+'.iupac.fasta','w'),'fasta')
                        #REF
                        tmp_seq_ref_w = tmp_seq_ref[tmp_seq_windows[0][i]-1:tmp_seq_windows[2][i]]
                        tmp_seq_ref_w.id = tmp_seq_ref_w.id.split()[0]+'_'+str(tmp_seq_windows[0][i])+'_'+str(tmp_seq_windows[2][i])+' '+argsDICT['type']+' ref'
                        tmp_seq_ref_w.name = tmp_seq_ref_w.name.split()[0]+'_'+str(tmp_seq_windows[0][i])+'_'+str(tmp_seq_windows[2][i])+' '+argsDICT['type']+' ref'
                        SeqIO.write(tmp_seq_ref_w,open(argsDICT['o']+'.'+tmp_seq_ref_w.id.split()[0]+'.'+argsDICT['type']+'.ref.fasta','w'),'fasta')
                        #ALT
                        tmp_seq_alt_w = tmp_seq_alt[tmp_seq_windows[0][i]-1:tmp_seq_windows[2][i]]
                        tmp_seq_alt_w.id = tmp_seq_alt_w.id.split()[0]+'_'+str(tmp_seq_windows[0][i])+'_'+str(tmp_seq_windows[2][i])+' '+argsDICT['type']+' alt'
                        tmp_seq_alt_w.name = tmp_seq_alt_w.name.split()[0]+'_'+str(tmp_seq_windows[0][i])+'_'+str(tmp_seq_windows[2][i])+' '+argsDICT['type']+' alt'
                        SeqIO.write(tmp_seq_alt_w,open(argsDICT['o']+'.'+tmp_seq_alt_w.id.split()[0]+'.'+argsDICT['type']+'.alt.fasta','w'),'fasta')
                print 'finished preparing sliding windows'
                print 'finished writing ref and alt sequences'
            if not argsDICT['sw']:
                #write sequences without sliding windows
                print 'start writing ref and alt sequences'
                for chr_ in referenceDICT[argsDICT['samples_pos'][0]]['REF'].keys():
                    #IUPAC
                    tmp_seq_iupac = referenceDICT[argsDICT['samples_pos'][0]]['IUPAC'][chr_][:]
                    tmp_seq_iupac.id = argsDICT['id'][0]+'_'+tmp_seq_iupac.id+' '+argsDICT['type']+' iupac'
                    tmp_seq_iupac.name = argsDICT['id'][0]+'_'+tmp_seq_iupac.name+' '+argsDICT['type']+' iupac'
                    SeqIO.write(tmp_seq_iupac,open(argsDICT['o']+'.'+tmp_seq_iupac.id.split()[0]+'.'+argsDICT['type']+'.iupac.fasta','w'),'fasta')
                    #REF
                    tmp_seq_ref = referenceDICT[argsDICT['samples_pos'][0]]['REF'][chr_][:]
                    tmp_seq_ref.id = argsDICT['id'][0]+'_'+tmp_seq_ref.id+' '+argsDICT['type']+' ref'
                    tmp_seq_ref.name = argsDICT['id'][0]+'_'+tmp_seq_ref.name+' '+argsDICT['type']+' ref'
                    SeqIO.write(tmp_seq_ref,open(argsDICT['o']+'.'+tmp_seq_ref.id.split()[0]+'.'+argsDICT['type']+'.ref.fasta','w'),'fasta')
                    #ALT
                    tmp_seq_alt = referenceDICT[argsDICT['samples_pos'][0]]['ALT'][chr_][:]
                    tmp_seq_alt.id = argsDICT['id'][0]+'_'+tmp_seq_alt.id+' '+argsDICT['type']+' alt'
                    tmp_seq_alt.name = argsDICT['id'][0]+'_'+tmp_seq_alt.name+' '+argsDICT['type']+' alt'
                    SeqIO.write(tmp_seq_alt,open(argsDICT['o']+'.'+tmp_seq_alt.id.split()[0]+'.'+argsDICT['type']+'.alt.fasta','w'),'fasta')
                print 'finished writing ref and alt sequences'
            print 'finished getting ref and ref sequences'
        return
    #WRITE FASTA W GTF
    def write_reference_dict_w_gtf(argsDICT, referenceDICT):
        print 'start parsing gtf/gff3 file'
        gtfDICT = get_gtf_dict(argsDICT['gtf'])
        print 'finished parsing gtf/gff3 file'
        #get sequences ONLY for IUPAC
        if argsDICT['type'] == 'iupac':
            print 'start getting iupac sequences'
            if argsDICT['sw']:
                #write sequences for sliding windows
                print 'start writing iupac sequences'
                #make sliding windows
                print 'start preparing sliding windows'
                for chr_ in referenceDICT[argsDICT['samples_pos'][0]]['IUPAC'].keys():
                    tmp_records = write_feature2fasta(gtfDICT, argsDICT['id'][0], argsDICT['feature'], chr_, referenceDICT[argsDICT['samples_pos'][0]]['IUPAC'])
                    for tmp_rec in tmp_records:
                        tmp_rec.id = tmp_rec.id+' '+argsDICT['type']+' iuapc'
                        tmp_rec.name = tmp_rec.name+' '+argsDICT['type']+' iuapc'
                        tmp_seq_windows = sliding_window_steps_generator(argsDICT['s'], argsDICT['j'], 1, len(tmp_rec))
                        for i in range(0, len(tmp_seq_windows[0])):
                            tmp_seq_w = tmp_rec[tmp_seq_windows[0][i]-1:tmp_seq_windows[2][i]]
                            tmp_seq_w.id = tmp_seq_w.id.split()[0]+'_'+str(tmp_seq_windows[0][i])+'_'+str(tmp_seq_windows[2][i])+' '+argsDICT['type']+' iupac'
                            tmp_seq_w.name = tmp_seq_w.name.split()[0]+'_'+str(tmp_seq_windows[0][i])+'_'+str(tmp_seq_windows[2][i])+' '+argsDICT['type']+' iupac'
                            SeqIO.write(tmp_seq_w,open(argsDICT['o']+'.'+tmp_seq_w.id.split()[0]+'.'+argsDICT['type']+'.iupac.fasta','w'),'fasta')
                print 'finished preparing sliding windows'
                print 'finished writing iupac sequences'
            if not argsDICT['sw']:
                #write sequences
                print 'start writing iupac sequences'
                for chr_ in referenceDICT[argsDICT['samples_pos'][0]]['IUPAC'].keys():
                    tmp_records = write_feature2fasta(gtfDICT, argsDICT['id'][0], argsDICT['feature'], chr_, referenceDICT[argsDICT['samples_pos'][0]]['IUPAC'])
                    for tmp_rec in tmp_records:
                        tmp_rec.id = tmp_rec.id+' '+argsDICT['type']+' iuapc'
                        tmp_rec.name = tmp_rec.name+' '+argsDICT['type']+' iuapc'
                        SeqIO.write(tmp_rec,open(argsDICT['o']+'.'+tmp_rec.id.split()[0]+'.'+argsDICT['type']+'.iupac.fasta','w'),'fasta')
                print 'finished writing iupac sequences'
            print 'finished getting iupac sequences'
        if argsDICT['type'] != 'iupac':
            print 'start getting ref and alt sequences'
            if argsDICT['sw']:
                #write sequences for sliding windows
                print 'start writing ref and alt sequences'
                #make sliding windows
                print 'start preparing sliding windows'
                for chr_ in referenceDICT[argsDICT['samples_pos'][0]]['REF'].keys():
                    tmp_iupac_records = write_feature2fasta(gtfDICT, argsDICT['id'][0], argsDICT['feature'], chr_, referenceDICT[argsDICT['samples_pos'][0]]['IUPAC'])
                    tmp_ref_records = write_feature2fasta(gtfDICT, argsDICT['id'][0], argsDICT['feature'], chr_, referenceDICT[argsDICT['samples_pos'][0]]['REF'])
                    tmp_alt_records = write_feature2fasta(gtfDICT, argsDICT['id'][0], argsDICT['feature'], chr_, referenceDICT[argsDICT['samples_pos'][0]]['ALT'])
                    for tmp_iupac_rec in tmp_iupac_records:
                        tmp_seq_iupac_windows = sliding_window_steps_generator(argsDICT['s'], argsDICT['j'], 1, len(tmp_iupac_rec))
                        for i in range(0, len(tmp_seq_iupac_windows[0])):
                            tmp_seq_iupac_w = tmp_iupac_rec[tmp_seq_iupac_windows[0][i]-1:tmp_seq_iupac_windows[2][i]]
                            tmp_seq_iupac_w.id = tmp_seq_iupac_w.id.split()[0]+'_'+str(tmp_seq_iupac_windows[0][i])+'_'+str(tmp_seq_iupac_windows[2][i])+' '+argsDICT['type']+' iuapc'
                            tmp_seq_iupac_w.name = tmp_seq_iupac_w.name.split()[0]+'_'+str(tmp_seq_iupac_windows[0][i])+'_'+str(tmp_seq_iupac_windows[2][i])+' '+argsDICT['type']+' iuapc'
                            SeqIO.write(tmp_seq_iupac_w,open(argsDICT['o']+'.'+tmp_seq_iupac_w.id.split()[0]+'.'+argsDICT['type']+'.iupac.fasta','w'),'fasta')
                    for tmp_ref_rec in tmp_ref_records:
                        tmp_seq_ref_windows = sliding_window_steps_generator(argsDICT['s'], argsDICT['j'], 1, len(tmp_ref_rec))
                        for i in range(0, len(tmp_seq_ref_windows[0])):
                            tmp_seq_ref_w = tmp_ref_rec[tmp_seq_ref_windows[0][i]-1:tmp_seq_ref_windows[2][i]]
                            tmp_seq_ref_w.id = tmp_seq_ref_w.id.split()[0]+'_'+str(tmp_seq_ref_windows[0][i])+'_'+str(tmp_seq_ref_windows[2][i])+' '+args.type+' ref'
                            tmp_seq_ref_w.name = tmp_seq_ref_w.name.split()[0]+'_'+str(tmp_seq_ref_windows[0][i])+'_'+str(tmp_seq_ref_windows[2][i])+' '+args.type+' ref'
                            SeqIO.write(tmp_seq_ref_w,open(argsDICT['o']+'.'+tmp_seq_ref_w.id.split()[0]+'.'+argsDICT['type']+'.ref.fasta','w'),'fasta')
                    for tmp_alt_rec in tmp_alt_records:
                        tmp_seq_alt_windows = sliding_window_steps_generator(argsDICT['s'], argsDICT['j'], 1, len(tmp_alt_rec))
                        for i in range(0, len(tmp_seq_alt_windows[0])):
                            tmp_seq_alt_w = tmp_alt_rec[tmp_seq_alt_windows[0][i]-1:tmp_seq_alt_windows[2][i]]
                            tmp_seq_alt_w.id = tmp_seq_alt_w.id.split()[0]+'_'+str(tmp_seq_alt_windows[0][i])+'_'+str(tmp_seq_alt_windows[2][i])+' '+args.type+' alt'
                            tmp_seq_alt_w.name = tmp_seq_alt_w.name.split()[0]+'_'+str(tmp_seq_alt_windows[0][i])+'_'+str(tmp_seq_alt_windows[2][i])+' '+args.type+' alt'
                            SeqIO.write(tmp_seq_alt_w,open(argsDICT['o']+'.'+tmp_seq_alt_w.id.split()[0]+'.'+argsDICT['type']+'.alt.fasta','w'),'fasta')
                print 'finished preparing sliding windows'
                print 'finished writing ref and alt sequences'
            if not argsDICT['sw']:
                #write sequences without sliding windows
                print 'start writing ref and alt sequences'
                for chr_ in referenceDICT[argsDICT['samples_pos'][0]]['REF'].keys():
                    tmp_iupac_records = write_feature2fasta(gtfDICT, argsDICT['id'][0], argsDICT['feature'], chr_, referenceDICT[argsDICT['samples_pos'][0]]['IUPAC'])
                    tmp_ref_records = write_feature2fasta(gtfDICT, argsDICT['id'][0], argsDICT['feature'], chr_, referenceDICT[argsDICT['samples_pos'][0]]['REF'])
                    tmp_alt_records = write_feature2fasta(gtfDICT, argsDICT['id'][0], argsDICT['feature'], chr_, referenceDICT[argsDICT['samples_pos'][0]]['ALT'])
                    for tmp_iupac_rec in tmp_iupac_records:
                        tmp_iupac_rec.id = tmp_iupac_rec.id+' '+argsDICT['type']+' iuapc'
                        tmp_iupac_rec.name = tmp_iupac_rec.name+' '+argsDICT['type']+' iuapc'
                        SeqIO.write(tmp_iupac_rec,open(argsDICT['o']+'.'+tmp_iupac_rec.id.split()[0]+'.'+argsDICT['type']+'.iupac.fasta','w'),'fasta')
                    for tmp_ref_rec in tmp_ref_records:
                        tmp_ref_rec.id = tmp_ref_rec.id+' '+args.type+' ref'
                        tmp_ref_rec.name = tmp_ref_rec.name+' '+args.type+' ref'
                        SeqIO.write(tmp_ref_rec,open(argsDICT['o']+'.'+tmp_ref_rec.id.split()[0]+'.'+argsDICT['type']+'.ref.fasta','w'),'fasta')
                    for tmp_alt_rec in tmp_alt_records:
                        tmp_alt_rec.id = tmp_alt_rec.id+' '+args.type+' alt'
                        tmp_alt_rec.name = tmp_alt_rec.name+' '+args.type+' alt'
                        SeqIO.write(tmp_alt_rec,open(argsDICT['o']+'.'+tmp_alt_rec.id.split()[0]+'.'+argsDICT['type']+'.alt.fasta','w'),'fasta')
                print 'finished writing ref and alt sequences'
            print 'finished getting ref and alt sequences'
    #IVCF
    if args.ivcf is None:
        parser.print_help()
        sys.exit('\nPlease specify input vcf file')
    args.ivcf = os.path.abspath(args.ivcf)
    #R
    if args.R is None:
        parser.print_help()
        sys.exit('\nPlease specify reference fasta file')
    args.R = os.path.abspath(args.R)
    #OUTPUT
    if args.o is None:
        parser.print_help()
        sys.exit('\nPlease specify output prefix')
    args.o = os.path.abspath(args.o)
    #CHROMOSOMES
    if args.chr is None:
        sys.exit('\nPlease specify chromosomes to be parsed as a comma-seperated list')
    args.chr = args.chr.split(',')
    #SAMPLES
    if args.samples is None:
        parser.print_help()
        sys.exit('\nPlease specify sample name to be parsed\ngrep "#CHROM" vcf')
    args.samples = args.samples.split(',')
    if len(args.samples) != 1:
        parser.print_help()
        sys.exit('\nPlease specify sample ONLY ONE name to be parsed\ngrep "#CHROM" vcf')
    #SAMPLES ID
    if args.id is not None:
        args.id = [args.id]
    if args.id is None:
        print '\nID used in FASTA will be set to SAMPLES name'
        args.id = args.samples
    #IBGA
    if args.ibga is not None:
        args.ibga = os.path.abspath(args.ibga)
    #GTF
    if args.gtf is not None:
        args.gtf = os.path.abspath(args.gtf)    
    #FILTER
    args.filterarg = args.filterarg.split(',')
    #MQ OPTION
    args.missingGT = get_missing_type(args.missingGT)
    args.ps = int(float(args.ps))
    print(args)
    #GET ARGS DICT
    argsDICT = vars(args)
    argsDICT['args'] = ['"'+str(args).replace("'","")+'"']
    #STATIC DICT
    polytypeDICT = {}
    filterDICT = {'passed':0, 'filterarg':0, 'MQ':0, 'DP':0, 'GQ':0}
    # prepare pseudo fasta(s) based on reference fasta file for each sample, only using subset of chromosomes as indicated by args.ct
    print 'start loading reference'
    referenceDICT = {}
    mqDICT = {}
    dpDICT = {}
    gqDICT = {}
    argsDICT['samples_pos'] = get_sample_pos_from_header(argsDICT)
    for s in argsDICT['samples_pos']:
        referenceDICT[s]={'REF':{},'ALT':{}, 'IUPAC':{}}
        for chr_ in argsDICT['chr']:
            mqDICT[chr_] = {}
            dpDICT[chr_] = {}
            gqDICT[chr_] = {}
            if argsDICT['type'] == 'iupac':
                referenceDICT[s]['IUPAC'][chr_] = duplicate_sequence(argsDICT['R'], chr_)
            if argsDICT['type'] != 'iupac':
                referenceDICT[s]['IUPAC'][chr_] = duplicate_sequence(argsDICT['R'], chr_)
                referenceDICT[s]['REF'][chr_] = duplicate_sequence(argsDICT['R'], chr_)
                referenceDICT[s]['ALT'][chr_] = duplicate_sequence(argsDICT['R'], chr_)
    reference_length = get_reference_len(argsDICT)
    print 'finished loading reference'
    print 'start parsing vcf'
    if argsDICT['tabix']:
        tb = tabix.open(argsDICT['ivcf'])
        counter = 0
        for tmp_chr in argsDICT['chr']:
            tmp_chr_length = reference_length[tmp_chr]
            records = tb.query(tmp_chr,0,tmp_chr_length)
            for lines in records:
                record = vcfRecord(lines, argsDICT['CHROM'], argsDICT['POS'], argsDICT['ID'], argsDICT['REF'], argsDICT['ALT'], argsDICT['QUAL'], argsDICT['FILTER'], argsDICT['INFO'], argsDICT['FORMAT'], argsDICT['samples_pos'], argsDICT['tabix'])
                parse_record(record, argsDICT, polytypeDICT, filterDICT, referenceDICT, mqDICT, dpDICT, gqDICT)
                if not argsDICT['po']:
                    counter += 1
                    if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) != 0:
                        sys.stderr.write('.')
                    if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) == 0:
                        sys.stderr.write('o')
    if not argsDICT['tabix']:
        counter = 0
        with open(argsDICT['ivcf'],'rU') as vcfhandle:
            for lines in vcfhandle:
                if lines[0]=='#':
                    continue
                record = vcfRecord(lines, argsDICT['CHROM'], argsDICT['POS'], argsDICT['ID'], argsDICT['REF'], argsDICT['ALT'], argsDICT['QUAL'], argsDICT['FILTER'], argsDICT['INFO'], argsDICT['FORMAT'], argsDICT['samples_pos'], argsDICT['tabix'])
                #continue if not in chromosome_ids
                if record.CHROM not in argsDICT['chr']:
                    continue
                #else
                parse_record(record, argsDICT, polytypeDICT, filterDICT, referenceDICT, mqDICT, dpDICT, gqDICT)
                if not argsDICT['po']:
                    counter += 1
                    if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) != 0:
                        sys.stderr.write('.')
                    if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) == 0:
                        sys.stderr.write('o')
    print 'finished parsing vcf'
    if argsDICT['ibga'] is not None:
        maskingDICT = get_mask_dict(argsDICT)
        #overwrite all positions in the reference with X if cov2N option applies
        mask_reference_dict(referenceDICT, maskingDICT)
    if argsDICT['gtf'] is None:
        #WRITE FASTA W/O GTF
        write_reference_dict_wo_gtf(argsDICT, referenceDICT)
    if argsDICT['gtf'] is not None:
        #WRITE FASTA W GTF
        write_reference_dict_w_gtf(argsDICT, referenceDICT)
    sys.stderr.write('\n')
    print 'variant types:'
    print polytypeDICT
    print 'filtered variants by:'
    print filterDICT


#mvcf2PoMo
def mvcf2PoMo(args, parser):
    #MVCF2POMO RECORD PARSER
    def parse_record(self, argsDICT, polytypeDICT, filterDICT, referenceDICT):
        """ Parse each record and writes allele counts into dictonary if variant is kept after filtering """
        REF_ALT = self.get_ref_alt()
        POLYTYPE = self.get_var_type()
        POLYNUMBER = self.get_var_number()
        POLYLEN = self.get_var_len()
        if POLYTYPE in polytypeDICT:
            polytypeDICT[POLYTYPE]+=1
        if POLYTYPE not in polytypeDICT:
            polytypeDICT[POLYTYPE]=1
        if POLYTYPE != 'repSNP' and POLYTYPE != 'biSNP' and POLYTYPE != 'muSNP':
            filterDICT['TYPE']+=1
            return
            #TODO include other variant types
        #0. REDUCE record TO SAMPLES and ADD mandatory FORMAT TO SAMPLES [GT:AD:DP]:GQ:PL
        self.keep_samples(argsDICT)
        if 'GQ' not in self.FORMAT:
            for s in argsDICT['samples_pos']:
                self.add_format_on_samples(s, 'GQ', 0)
            self.FORMAT.append('GQ')
        #1. FILTER BY filterarg
        if self.is_filtered(argsDICT):
            if argsDICT['verbose']:
                print self
            filterDICT['filterarg']+=1
            return
        #2. FILTER BY minMQ
        MQ = self.get_info('MQ')
        if MQ == ".":
             MQ = 0.0
        if float(MQ) < argsDICT['minMQ']:
            if argsDICT['verbose']:
                print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
            filterDICT['MQ']+=1
            return
        #3. GET DP
        #fill empty AD
        self.fill_empty_ad()
        SAMPLESAD = self.get_samples_ad()
        SAMPLESADLEN = self.get_samples_ad_len()
        if not all([x==POLYNUMBER for x in SAMPLESADLEN]):
            if argsDICT['verbose']:
                print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
            sys.exit('\nSAMPLES AD length differs from total POLYNUMBER')
        #4. SET AD BY MISSING GT OPTION
        SAMPLESGT = self.get_samples_gt()
        SAMPLESGQ = self.get_samples_gq()
        self.set_samples_ad_by_gt(argsDICT)
        SAMPLESAD = self.get_samples_ad()
        #set dp based on altered ad
        self.set_dp_by_ad()
        SAMPLESDP = self.get_samples_dp()
        #4. GET ALLELE COUNTS
        self.get_ACGT_allele_counts(POLYTYPE)
        for sample_ in self.SAMPLES.keys():
            if POLYTYPE != 'repSNP':
                referenceDICT[sample_][self.CHROM]['POS'][int(self.POS)] = self.SAMPLES[sample_]['ACGT_count']
            if POLYTYPE == 'repSNP':
                for i_ in range(np.max(POLYLEN)):
                    referenceDICT[sample_][self.CHROM]['POS'][int(self.POS)+i_] = self.SAMPLES[sample_]['ACGT_count']
        filterDICT['passed']+=1
    def get_reference_count_dict(SNP_pos_, referenceSEQ):
        if referenceSEQ[SNP_pos_-1] == 'A':
            return get_ACGT_count_dict(1,0,0,0) 
        if referenceSEQ[SNP_pos_-1] == 'C':
            return get_ACGT_count_dict(0,1,0,0) 
        if referenceSEQ[SNP_pos_-1] == 'G':
            return get_ACGT_count_dict(0,0,1,0) 
        if referenceSEQ[SNP_pos_-1] == 'T':
            return get_ACGT_count_dict(0,0,0,1) 
        if referenceSEQ[SNP_pos_-1] not in ['A','C','G','T']:
            return get_ACGT_count_dict(0,0,0,0) 
    #set tabix
    args.tabix = True
    #IVCF
    if args.ivcf is None:
        parser.print_help()
        sys.exit('\nPlease specify input vcf file')
    args.ivcf = os.path.abspath(args.ivcf)
    #R
    if args.R is None:
        parser.print_help()
        sys.exit('\nPlease specify reference fasta file')
    args.R = os.path.abspath(args.R)
    #OUTPUT
    if args.o is None:
        parser.print_help()
        sys.exit('\nPlease specify output prefix')
    args.o = os.path.abspath(args.o)
    #REGION
    if args.region is None:
        parser.print_help()
        sys.exit('\nPlease specify region to be parsed as e.g. chr1:1-15000')
    args.chr = args.region.split(':')[0]
    args.chr_start = int(args.region.split(':')[1].split('-')[0].replace(',',''))
    args.chr_end = int(args.region.split(':')[1].split('-')[1].replace(',',''))
    if args.chr_end <= args.chr_start:
        parser.print_help()
        sys.exit('\nRegion end needs to be larger than region start position')
    #SAMPLES
    if args.samples is None:
        parser.print_help()
        sys.exit('\nPlease specify sample name to be parsed\ngrep "#CHROM" vcf')
    args.samples = args.samples.split(',')
    #IBGA
    if args.ibga is not None:
        args.ibga = args.ibga.split(',')
        if len(args.ibga) != len(args.samples):
            parser.print_help()
            sys.exit('\nNumber of BGA coverage files needs to equal amount of samples, empty files can be provided\n')
        args.ibga = [os.path.abspath(x) for x in args.ibga]
    #FILTER
    args.filterarg = args.filterarg.split(',')
    #MQ OPTION
    args.missingGT = get_missing_type(args.missingGT)
    args.ps = int(float(args.ps))
    print(args)
    #GET ARGS DICT
    argsDICT = vars(args)
    argsDICT['args'] = ['"'+str(args).replace("'","")+'"']
    #STATIC DICT
    polytypeDICT = {}
    filterDICT = {'passed':0, 'filterarg':0, 'MQ':0, 'TYPE':0}
    argsDICT['samples_pos'] = get_sample_pos_from_header(argsDICT)
    referenceDICT = {}
    referenceSEQ = duplicate_sequence(argsDICT['R'], argsDICT['chr'])
    for s in argsDICT['samples_pos']:
        referenceDICT[s]={argsDICT['chr']:{'POS':{},'BGA':{}}}
    print 'start parsing vcf'
    tb = tabix.open(argsDICT['ivcf'])
    counter = 0
    records = tb.query(argsDICT['chr'],argsDICT['chr_start']-1,argsDICT['chr_end'])
    for lines in records:
        record = vcfRecord(lines, argsDICT['CHROM'], argsDICT['POS'], argsDICT['ID'], argsDICT['REF'], argsDICT['ALT'], argsDICT['QUAL'], argsDICT['FILTER'], argsDICT['INFO'], argsDICT['FORMAT'], argsDICT['samples_pos'], argsDICT['tabix'])
        parse_record(record, argsDICT, polytypeDICT, filterDICT, referenceDICT)
        if not argsDICT['po']:
            counter += 1
            if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) != 0:
                sys.stderr.write('.')
            if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) == 0:
                sys.stderr.write('o')
    print 'finished parsing vcf'
    if argsDICT['ibga'] is not None:
        print 'start parsing bga files'
        get_mask_dict_multiple_by_region(argsDICT, referenceDICT)
        for sample_ in argsDICT['samples_pos']:
            if len(referenceDICT[sample_][argsDICT['chr']]['BGA']) != 0:
                bga_positions = list(np.hstack([range(int(x),int(y)+1) for x,y in referenceDICT[sample_][argsDICT['chr']]['BGA']]))
                for bga_pos in bga_positions:
                    referenceDICT[sample_][argsDICT['chr']]['POS'][bga_pos] =get_ACGT_count_dict(0,0,0,0) 
        print 'finished parsing bga files'
    #FILL REFERENCE POSITIONS DUE TO BGA
    print 'start setting missing BGA positions to reference'
    all_SNP_positions = {}
    for sample_ in argsDICT['samples_pos']:
        all_SNP_positions = dict.fromkeys(referenceDICT[sample_][argsDICT['chr']]['POS'].keys()+all_SNP_positions.keys(), 0)
    all_SNP_positions = list(np.sort(all_SNP_positions.keys()))
    for SNP_pos_ in all_SNP_positions:
        for sample_ in argsDICT['samples_pos']:
            if SNP_pos_ not in referenceDICT[sample_][argsDICT['chr']]['POS']:
                referenceDICT[sample_][argsDICT['chr']]['POS'][SNP_pos_] = get_reference_count_dict(SNP_pos_, referenceSEQ)
    print 'finished setting missing BGA positions to reference'
    print 'start writing count file'
    with open(argsDICT['o']+'.tsv','w') as tsvouthandle:
        tsvouthandle.write('COUNTSFILE\tNPOP\t'+str(len(argsDICT['samples']))+'\tNSITES\t'+str(int(filterDICT['passed']))+'\n')
        tsvouthandle.write('CHROM\tPOS\t'+'\t'.join(argsDICT['samples'])+'\n')
        for SNP_pos_ in all_SNP_positions:
            tsvouthandle.write(argsDICT['chr']+'\t'+str(SNP_pos_)+'\t')
            for sample_ in argsDICT['samples_pos']:
                tsvouthandle.write(','.join([str(x) for x in referenceDICT[sample_][argsDICT['chr']]['POS'][SNP_pos_].values()]))
                tsvouthandle.write('\t')
            tsvouthandle.write('\n')
    print 'finished writing count file'
    print 'variant types:'
    print polytypeDICT
    print 'filtered variants by:'
    print filterDICT


#subparser
def subparser(subparsers):
    # mvcf2consensus; parser
    parser_mvcf2consensus = subparsers.add_parser('mvcf2consensus', help='mvcf2consensus help'
                                                                  , description='Given a multiple sample vcf file reduced to SNP and MNP and the reference fasta file, ' \
                                                                                'this script merges the samples into a consensus. ' \
                                                                                'Major, minor and IUPAC code of the consensus will be written to a tab separated file.'
                                                                                '\nTo extract a subpopulations from e.g. GATK multiple vcf file use the following command from GATK: ' \
                                                                                'java -jar GenomeAnalysisTK.jar -T SelectVariants -sn $SAMPLE1 -sn $SAMPLE2 -V $MULTISAMPLEVCF -R $REFERENCE ' \
                                                                                '[-trimAlternates : removeUnusedAlternates] ' \
                                                                                '[-env : excludeNonVariants] ' \
                                                                                '[-ef : excludeFiltered] ' \
                                                                                '-selectType SNP -selectType MNP -o $OUTPUT ' \
                                                                                '\nor vcftools: ' \
                                                                                'vcftools --vcf $MULTISAMPLEVCF [--gvcf] --out $OUTPUT --recode --indv $SAMPLE1 --indc $SAMPLE2 ' \
                                                                                '[--minGQ : filter for genotype quality] ' \
                                                                                '[--minDP : filter for depth] ' \
                                                                                '--remove-indels ')
    parser_mvcf2consensus.add_argument('-v', '--verbose', help='increase output verbosity', action='store_true')
    parser_mvcf2consensus.add_argument('-tabix', help='specify if input is tabix file. ' \
                                                      'NOTE: if set to true reference file needs to be specified to get length of each entry [default: False]'
                                               , action='store_true')
    parser_mvcf2consensus.add_argument('-R', help='specify reference fasta file. Only needed if [tabix] option')
    parser_mvcf2consensus.add_argument('-ivcf', help='specify input vcf file')
    parser_mvcf2consensus.add_argument('-o', help='specify output prefix')
    parser_mvcf2consensus.add_argument('-samples', help='specify sample names to be merged as a comma-seperated list, they need to exactly match sample ids from vcf input')
    parser_mvcf2consensus.add_argument('-id', help='specify CONSENSUS name to be added to HEADER')
    parser_mvcf2consensus.add_argument('-chr', help='secify chromosomes to be parsed as a comma-seperated list')
    parser_mvcf2consensus.add_argument('-add', help='specify if CONSENSUS should be added as a new column to existing vcf file [default: False]', action='store_true')
    parser_mvcf2consensus.add_argument('-filterarg', help='specify FILTER arguments to keep variants, all variants without these will be discarded. ' \
                                                          'use a comma-seperated list e.g. PASS,VQSRTTranchesSNP90. ' \
                                                          'By default all variants are retained [default: ""]', default='')
    parser_mvcf2consensus.add_argument('-cdp', help='specify consensus depth threshold necessary to exceeded to retain SNP/MNP [default: "5"]' \
                                                    'NOTE: only Called samples will be considered after GT reset', type=int, default=5)
    parser_mvcf2consensus.add_argument('-ncf', help='specify NotCalledFraction to exclude SNP/MNP [default: "1.0"] for consensus calculation', type=float, default=1.0)
    parser_mvcf2consensus.add_argument('-minMQ', help='specify mapping quality threshold necessary to exceeded to retain variant [default: "0.0"]', type=float, default=0.0)
    parser_mvcf2consensus.add_argument('-minDP', help='specify minimal sample depth threshold to set position to be NotCalled during GT reset [default: "0"]', type=int, default=0)
    parser_mvcf2consensus.add_argument('-maxDP', help='specify maximal sample depth threshold to set position to be NotCalled during GT reset [default: "9999999"]', type=int, default=9999999)
    parser_mvcf2consensus.add_argument('-minGQ', help='specify minimal genotype quality threshold to set position to Called during GT reset [default: "0"]', type=int, default=0)
    parser_mvcf2consensus.add_argument('-missingGT', help='specify how missing GT data (based on e.g. calling SNPs with GATK on multiple samples simultaneously; GT: ./.) ' \
                                                          'should be handled, either keep AD coverage [default: "keep"], set AD to 0 for all samples which have GT: ./. for all alleles ["zero"], ' \
                                                          'set REF allele to certain value and ALT alleles to certain value for all samples that have GT: ./. ["set:value:value"] e.g. ref:10:10 ' \
                                                          'NOTE: there might be cases where the VariantCaller has defined GT: ./. however AD still shows counts AD: 3,0 ' \
                                                          'here one might not want to keep this variant as NotCalled (GT: ./.) and use the AD information to reset GT based on AD', default='keep')
    parser_mvcf2consensus.add_argument('-resetGT', help='specify how GT should be reset, either reset using AD information [default: "AD"] or keep as defined by VariantCaller ["keep"] ' \
                                                        'NOTE: the DP value needs to be exceeded otherwise the sample will be set to ./. ' \
                                                        'NOTE: this will influence the calculation of the NotCalledFraction', default='AD', choices=['AD','keep'])
    parser_mvcf2consensus.add_argument('-H', help='specify if HEADER of vcf file should be ommitted [default: False]', action='store_true')
    parser_mvcf2consensus.add_argument('-po', help='specify if progress should not be printed based on variant number processed [default: False]', action='store_true')
    parser_mvcf2consensus.add_argument('-ps', help='specify progress steps [default: "1e5"]', default='1e5')
    parser_mvcf2consensus.add_argument('-CHROM', help='specify CHROM column [default: 0]', type=int, default=0)
    parser_mvcf2consensus.add_argument('-POS', help='specify POS column [default: 1]', type=int, default=1)
    parser_mvcf2consensus.add_argument('-ID', help='specify ID column [default: 2]', type=int, default=2)
    parser_mvcf2consensus.add_argument('-REF', help='specify REF column [default: 3]', type=int, default=3)
    parser_mvcf2consensus.add_argument('-ALT', help='specify ALT column [default: 4]', type=int, default=4)
    parser_mvcf2consensus.add_argument('-QUAL', help='specify ALT column [default: 5]', type=int, default=5)
    parser_mvcf2consensus.add_argument('-FILTER', help='specify FILTER column [default: 6]', type=int, default=6)
    parser_mvcf2consensus.add_argument('-INFO', help='specify INFO column [default: 7]', type=int, default=7)
    parser_mvcf2consensus.add_argument('-FORMAT', help='specify FORMAT column [default: 8]', type=int, default=8)
    parser_mvcf2consensus.set_defaults(func=mvcf2consensus)
    # vcf2fasta; parser
    parser_vcf2fasta = subparsers.add_parser('vcf2fasta', help='vcf2fasta help'
                                                        , description='Given a single sample vcf or vcf tabix file, the reference fasta ' \
                                                                      'file and an optional GTF file, this script extracts REF and ALT ' \
                                                                      'alleles as either two sample specific fasta files as a pseudo-reference ' \
                                                                      'or one sample specific fasta file using IUPAC code. The user can choice how deletions ' \
                                                                      'and missing data should be handled, insertions will not be parsed. Additional specific ' \
                                                                      'filter arguments can be set to pre-process the vcf file and given the optional GFF file ' \
                                                                      'the pseudo-reference can be splitted into subfeatures like "gene"; "mRNA"; "transcript" ' \
                                                                      'indicating if the Parent or Children features should be extracted e.g. "transcript:CDS" ' \
                                                                      'for coding sequence or transcript:exon for cDNA. It is also possible to extract sliding windows giving a window ' \
                                                                      'and jump parameter, which will be applied to either the complete chromsomes or individual features. ' \
                                                                      'An optional coverage file will set all not covered bases to Z.' \
                                                                      '\nTo extract a single sample excluding non-variant sites (env) from e.g. GATK vcf file use the following command from GATK: ' \
                                                                      'java -jar GenomeAnalysisTK.jar -T SelectVariants -sn $SAMPLE -V $MULTISAMPLEVCF -R $REFERENCE ' \
                                                                      '[-trimAlternates : removeUnusedAlternates] ' \
                                                                      '[-env : excludeNonVariants] ' \
                                                                      '[-ef : excludeFiltered] ' \
                                                                      '-selectType SNP -selectType MNP -o $OUTPUT ')
    parser_vcf2fasta.add_argument('-v', '--verbose', help='increase output verbosity', action='store_true')
    parser_vcf2fasta.add_argument('-type', help='specify which type of parsing should be used, either producing two fasta files ' \
                                                ' extracting REF and ALT allele where in case of a heterozygous (REF: A; ALT: T/C) ' \
                                                ' with AD:1,1,1 or AD:0,1,1 the position is set to N [refaltN], ' \
                                                ' where in case of a heterozygous position (REF: A; ALT: C/T) with AD:1,1,1 or AD:1,2,1 ' \
                                                ' the REF output and ALT output allele is set arbitrary to A/C/T [refaltsample], ' \
                                                ' where in case of heterozygous position (REF: A; ALT: C/T) with AD:1,2,1 ' \
                                                ' the REF output is set to major C and ALT output allele is set to arbitrary minor allele A/T [default: refmajorsample] ' \
                                                ' where in case of heterozygous position (REF: A; ALT: C/T) with AD:2,2,1 ' \
                                                ' the REF output is set to minor T and ALT output allele is set to arbitrary major allele A/C [refminorsample] ' \
                                                ' or producing only one fasta file using ambigous IUPAC code [iupac]'
                                         , choices=['refalt2N', 'refaltsample', 'refmajorsample', 'refminorsample', 'iupac'], default='refmajorsample')
    parser_vcf2fasta.add_argument('-tabix', help='specify if input is tabix file. ' \
                                                 'NOTE: if set to true reference file needs to be specified to get length of each entry [default: False]'
                                       , action='store_true')
    parser_vcf2fasta.add_argument('-R', help='specify reference fasta file. Only needed if [tabix] option')
    parser_vcf2fasta.add_argument('-ivcf', help='specify input vcf file')
    parser_vcf2fasta.add_argument('-o', help='specify output prefix')
    parser_vcf2fasta.add_argument('-samples', help='specify sample name to be used to generate FASTA file, needs to exactly match sample id from vcf input')
    parser_vcf2fasta.add_argument('-id', help='specify name to be used in FASTA')
    parser_vcf2fasta.add_argument('-chr', help='secify chromosomes to be parsed as a comma-seperated list')
    parser_vcf2fasta.add_argument('-ibga', help='specify input bedGraph file which contains coverage information of the bam files ' \
                                                'used to call SNP/Indels. ' \
                                                'e.g. genomeCoverageBed -ibam $INPUT -bga -g $REFERENCE > $OUTPUT')
    parser_vcf2fasta.add_argument('-cov2N', help='specify coverage threshold to set positions from bga file to be not called ' \
                                                 'coded as X in the resulting fasta file [default: 0]', type=int, default=0)
    parser_vcf2fasta.add_argument('-filterarg', help='specify FILTER arguments to keep variants, all variants without these will be discarded. ' \
                                                     'use a comma-seperated list e.g. PASS,VQSRTTranchesSNP90. ' \
                                                     'By default all variants are retained [default: ""]', default='')
    parser_vcf2fasta.add_argument('-minMQ', help='specify mapping quality threshold necessary to exceeded to retain variant [default: "0.0"]', type=float, default=0.0)
    parser_vcf2fasta.add_argument('-minDP', help='specify minimal sample depth threshold to set variant [default: "1"]', type=int, default=1)
    parser_vcf2fasta.add_argument('-maxDP', help='specify maximal sample depth threshold to set variant [default: "9999999"]', type=int, default=9999999)
    parser_vcf2fasta.add_argument('-minGQ', help='specify minimal genotype quality threshold to set position [default: "0"]', type=int, default=0)
    parser_vcf2fasta.add_argument('-missingGT', help='specify how missing GT data (based on e.g. calling SNPs with GATK on multiple samples simultaneously; GT: ./.) ' \
                                                     'should be handled, either keep AD coverage [default: "keep"], set AD to 0 for all samples which have GT: ./. for all alleles ["zero"], ' \
                                                     'set REF allele to certain value and ALT alleles to certain value for all samples that have GT: ./. ["set:value:value"] e.g. ref:10:10 ' \
                                                     'NOTE: there might be cases where the VariantCaller has defined GT: ./. however AD still shows counts AD: 3,0 ' \
                                                     'here one might not want to keep this variant as NotCalled (GT: ./.) and use the AD information to reset GT based on AD', default='keep')
    parser_vcf2fasta.add_argument('-resetGT', help='specify how GT should be reset, either reset using AD information [default: "AD"] or keep as defined by VariantCaller ["keep"] ' \
                                                   'NOTE: the DP value needs to be exceeded otherwise the sample will be set to ./. ' \
                                                   'NOTE: this will influence the calculation of the NotCalledFraction', default='AD', choices=['AD','keep'])
    parser_vcf2fasta.add_argument('-gtf', help='specify input gtf/gff3 file')
    parser_vcf2fasta.add_argument('-feature', help='specify feature to be extracted, e.g. chromosome [chromosome]; cDNA [transcript:exon]; ' \
                                                   'CDS [transcript:CDS]; pre-mRNA [transcript:transcript]; gene [gene]; [default: chromosome]'
                                            , default = 'chromosome', choices = ['chromosome','gene','transcript:exon', 'transcript:CDS', 'transcript:transcript', 'transcript:intron'])
    parser_vcf2fasta.add_argument('-sw', help='specify if sliding windows should be prepared for each sequence [default: False]', action = 'store_true')
    parser_vcf2fasta.add_argument('-s', help='specify sliding windonw length [default:5000]', type = int, default = 5000)
    parser_vcf2fasta.add_argument('-j', help='specify sliding windonw jump length [default:5000]', type = int, default = 5000)
    parser_vcf2fasta.add_argument('-po', help='specify if progress should not be printed based on variant number processed [default: False]', action='store_true')
    parser_vcf2fasta.add_argument('-ps', help='specify progress steps [default: 1e5]', default='1e5')
    parser_vcf2fasta.add_argument('-CHROM', help='specify CHROM column [default: 0]', type=int, default=0)
    parser_vcf2fasta.add_argument('-POS', help='specify POS column [default: 1]', type=int, default=1)
    parser_vcf2fasta.add_argument('-ID', help='specify ID column [default: 2]', type=int, default=2)
    parser_vcf2fasta.add_argument('-REF', help='specify REF column [default: 3]', type=int, default=3)
    parser_vcf2fasta.add_argument('-ALT', help='specify ALT column [default: 4]', type=int, default=4)
    parser_vcf2fasta.add_argument('-QUAL', help='specify ALT column [default: 5]', type=int, default=5)
    parser_vcf2fasta.add_argument('-FILTER', help='specify FILTER column [default: 6]', type=int, default=6)
    parser_vcf2fasta.add_argument('-INFO', help='specify INFO column [default: 7]', type=int, default=7)
    parser_vcf2fasta.add_argument('-FORMAT', help='specify FORMAT column [default: 8]', type=int, default=8)
    parser_vcf2fasta.set_defaults(func=vcf2fasta)
    # mvcfPL2GP; parser
    parser_mvcfPL2GP = subparsers.add_parser('mvcfPL2GP', help='mvcfPL2GP help'
                                                        , description='Given a multiple sample vcf file containing the PL flag reduced to BIALLELIC SNP and MNP and the reference fasta file, ' \
                                                                      'this script converts phred-scale PL values into GP for NgsAdmix input.')
    parser_mvcfPL2GP.add_argument('-v', '--verbose', help='increase output verbosity', action='store_true')
    parser_mvcfPL2GP.add_argument('-tabix', help='specify if input is tabix file. ' \
                                                 'NOTE: if set to true reference file needs to be specified to get length of each entry [default: False]'
                                       , action='store_true')
    parser_mvcfPL2GP.add_argument('-R', help='specify reference fasta file. Only needed if [tabix] option')
    parser_mvcfPL2GP.add_argument('-ivcf', help='specify input vcf file')
    parser_mvcfPL2GP.add_argument('-o', help='specify output prefix')
    parser_mvcfPL2GP.add_argument('-samples', help='specify sample name to be used to generate FASTA file, needs to exactly match sample id from vcf input')
    parser_mvcfPL2GP.add_argument('-chr', help='secify chromosomes to be parsed as a comma-seperated list')
    parser_mvcfPL2GP.add_argument('-filterarg', help='specify FILTER arguments to keep variants, all variants without these will be discarded. ' \
                                                     'use a comma-seperated list e.g. PASS,VQSRTTranchesSNP90. ' \
                                                     'By default all variants are retained [default: ""]', default='')
    parser_mvcfPL2GP.add_argument('-minMQ', help='specify mapping quality threshold necessary to exceeded to retain variant [default: "0.0"]', type=float, default=0.0)
    parser_mvcfPL2GP.add_argument('-minDP', help='specify minimal sample depth threshold to keep GP [default: "1"]', type=int, default=1)
    parser_mvcfPL2GP.add_argument('-maxDP', help='specify maximal sample depth threshold to keep GP [default: "9999999"]', type=int, default=9999999)
    parser_mvcfPL2GP.add_argument('-minGQ', help='specify minimal genotype quality threshold to keep GP [default: "0"]', type=int, default=0)
    parser_mvcfPL2GP.add_argument('-missingGT', help='specify how missing GT data (based on e.g. calling SNPs with GATK on multiple samples simultaneously; GT: ./.) ' \
                                                     'should be handled, either keep PL values ["keep"], set GP to 0.333 for all samples which have GT: ./. for all alleles [default: "equal"], ' \
                                                     'set GP to certain values for all samples that have GT: ./. ["set:value:value:value"] e.g. ref:1.0:0.0:0.0 ', default='equal')
    parser_mvcfPL2GP.add_argument('-po', help='specify if progress should not be printed based on variant number processed [default: False]', action='store_true')
    parser_mvcfPL2GP.add_argument('-ps', help='specify progress steps [default: 1e5]', default='1e5')
    parser_mvcfPL2GP.add_argument('-CHROM', help='specify CHROM column [default: 0]', type=int, default=0)
    parser_mvcfPL2GP.add_argument('-POS', help='specify POS column [default: 1]', type=int, default=1)
    parser_mvcfPL2GP.add_argument('-ID', help='specify ID column [default: 2]', type=int, default=2)
    parser_mvcfPL2GP.add_argument('-REF', help='specify REF column [default: 3]', type=int, default=3)
    parser_mvcfPL2GP.add_argument('-ALT', help='specify ALT column [default: 4]', type=int, default=4)
    parser_mvcfPL2GP.add_argument('-QUAL', help='specify ALT column [default: 5]', type=int, default=5)
    parser_mvcfPL2GP.add_argument('-FILTER', help='specify FILTER column [default: 6]', type=int, default=6)
    parser_mvcfPL2GP.add_argument('-INFO', help='specify INFO column [default: 7]', type=int, default=7)
    parser_mvcfPL2GP.add_argument('-FORMAT', help='specify FORMAT column [default: 8]', type=int, default=8)
    parser_mvcfPL2GP.set_defaults(func=mvcfPL2GP)
    # mvcf2PoMo; parser
    parser_mvcf2PoMo = subparsers.add_parser('mvcf2PoMo', help='mvcf2PoMo help'
                                                        , description='Given a multiple sample vcf tabix file, the reference fasta ' \
                                                                      'and optional coverage files for each sample, this script extracts A,C,G,T ' \
                                                                      'counts for SNP and MNP for a given chromosome and region to produce an input file ' \
                                                                      'for the IQ-TREE Polymorphism-Aware-Model. In case of low coverage ' \
                                                                      'all allele counts will be set to zero. NOTE that the mvcf file is not filtered for e.g. minDP ' \
                                                                      'or minGQ so one would need to filter the mvcf file with e.g. vcftools or bcftools first: ' \
                                                                      'bcftools filter -e \'FORMAT/DP < 5 | FORMAT/GQ < 30\' --set-GT . --SnpGap 3 $INPUT -O u | bcftools view -v snps,mnps -O z > $OUTPUT ' \
                                                                      'NOTE that all samples with GT: ./. on default settings are treated as missing where allelic depth (AD)' \
                                                                      'will be set to zero which has an influence on unchanged bases ' \
                                                                      'as compared to the reference and is depedant on how the mvcf file was created (see missingGT and resetGT option).')
    parser_mvcf2PoMo.add_argument('-v', '--verbose', help='increase output verbosity', action='store_true')
    parser_mvcf2PoMo.add_argument('-R', help='specify reference fasta file. Only needed if [tabix] option')
    parser_mvcf2PoMo.add_argument('-ivcf', help='specify input vcf file')
    parser_mvcf2PoMo.add_argument('-o', help='specify output prefix')
    parser_mvcf2PoMo.add_argument('-type', help='specify type of extraction, either [default: GTpop] or [ADpop] or [GTsingle] or [ADsingle], ' \
                                                'if populations should be grouped one needs to additionally specify popids and group sample names according to their population groups e.g. ' \
                                                '-samples "SAMPLE1,SAMPLE2;SAMPLE3,SAMPLE4;SAMPLE5"')
    parser_mvcf2PoMo.add_argument('-samples', help='specify sample name to be used to generate FASTA file, needs to exactly match sample id from vcf input')
    parser_mvcf2PoMo.add_argument('-ibga', help='specify input bedGraph files for each sample which contains coverage information of the bam files ' \
                                                'used to call SNP/Indels. ' \
                                                'e.g. genomeCoverageBed -ibam $INPUT -bga -g $REFERENCE > $OUTPUT' \
                                                'NOTE that each sample needs to have an coverage file if this option is used and that it needs to have the same order like the samples option. '\
                                                'NOTE that the coverage file can be empty.')
    parser_mvcf2PoMo.add_argument('-cov2N', help='specify coverage threshold to set positions from bga file to be not called ' \
                                                 'resulting in zero allelic depth [default: 0]', type=int, default=0)
    parser_mvcf2PoMo.add_argument('-region', help='secify region to be parsed as e.g. chr1:1-15000')
    parser_mvcf2PoMo.add_argument('-filterarg', help='specify FILTER arguments to keep variants, all variants without these will be discarded. ' \
                                                     'use a comma-seperated list e.g. PASS,VQSRTTranchesSNP90. ' \
                                                     'By default all variants are retained [default: ""]', default='')
    parser_mvcf2PoMo.add_argument('-minMQ', help='specify mapping quality threshold necessary to exceeded to retain variant [default: "0.0"]', type=float, default=0.0)
    parser_mvcf2PoMo.add_argument('-missingGT', help='specify how missing GT data (based on e.g. calling SNPs with GATK on multiple samples simultaneously; GT: ./.) ' \
                                                     'should be handled, either keep AD coverage ["keep"], set AD to 0 for all samples which have GT: ./. for all alleles [default: "zero"], ' \
                                                     'set REF allele to certain value and ALT alleles to certain value for all samples that have GT: ./. ["set:value:value"] e.g. ref:10:10 ' \
                                                     'NOTE: there might be cases where the VariantCaller has defined GT: ./. however AD still shows counts AD: 3,0 ' \
                                                     'here one might not want to keep this variant as NotCalled (GT: ./.) and use the AD information to reset GT based on AD', default='zero')
    parser_mvcf2PoMo.add_argument('-resetGT', help='specify how GT should be reset, either reset using AD information [default: "AD"] or keep as defined by VariantCaller ["keep"] ' \
                                                   'NOTE: the DP value needs to be exceeded otherwise the sample will be set to ./. ' \
                                                   'NOTE: this will influence the calculation of the NotCalledFraction', default='AD', choices=['AD','keep'])
    parser_mvcf2PoMo.add_argument('-po', help='specify if progress should not be printed based on variant number processed [default: False]', action='store_true')
    parser_mvcf2PoMo.add_argument('-ps', help='specify progress steps [default: 1e5]', default='1e5')
    parser_mvcf2PoMo.add_argument('-CHROM', help='specify CHROM column [default: 0]', type=int, default=0)
    parser_mvcf2PoMo.add_argument('-POS', help='specify POS column [default: 1]', type=int, default=1)
    parser_mvcf2PoMo.add_argument('-ID', help='specify ID column [default: 2]', type=int, default=2)
    parser_mvcf2PoMo.add_argument('-REF', help='specify REF column [default: 3]', type=int, default=3)
    parser_mvcf2PoMo.add_argument('-ALT', help='specify ALT column [default: 4]', type=int, default=4)
    parser_mvcf2PoMo.add_argument('-QUAL', help='specify ALT column [default: 5]', type=int, default=5)
    parser_mvcf2PoMo.add_argument('-FILTER', help='specify FILTER column [default: 6]', type=int, default=6)
    parser_mvcf2PoMo.add_argument('-INFO', help='specify INFO column [default: 7]', type=int, default=7)
    parser_mvcf2PoMo.add_argument('-FORMAT', help='specify FORMAT column [default: 8]', type=int, default=8)
    parser_mvcf2PoMo.set_defaults(func=mvcf2PoMo)
    

#main
def main():
    # top-level parser
    parser = argparse.ArgumentParser(prog='vcfparser', usage='%(prog)s <sub-script> [options] [<arguments>...]',
                                     description='scripts for VCFPARSER file handling')
    subparsers = parser.add_subparsers(title='sub-scripts', description='valid sub-scripts', help='sub-scripts help',
                                       dest='cmd')
    # sub-level parser
    subparser(subparsers)

    # get args
    args = parser.parse_args()

    # call function
    args.func(args, parser)


if __name__ == '__main__':
    main()