Gitlab Community Edition Instance

Skip to content
Snippets Groups Projects
Commit 6c7c63d9 authored by Kristian Ullrich's avatar Kristian Ullrich
Browse files

add mvcf2consensus.py

parent e62b5dd9
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''
Author: Krisian Ullrich
date: January 2017
email: ullrich@evolbio.mpg.de
License: MIT
The MIT License (MIT)
Copyright (c) 2017 Kristian Ullrich
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
This software relies on the following python packages:
sys
os
re
numpy https://pypi.python.org/pypi/numpy
random
argparse
tabix https://pypi.python.org/pypi/pytabix
gzip
itertools
Bio https://github.com/biopython/DIST
collections
'''
import sys
import os
import re
import numpy as np
import random
import argparse
import tabix
import gzip
import itertools
from itertools import repeat
from Bio import SeqIO
import collections
IUPACdict = {'A':'A','C':'C','G':'G','T':'T','-':'-','N':'N','.':'.','AG':'R','GA':'R','CT':'Y','TC':'Y','CG':'S',
'GC':'S','AT':'W', 'TA': 'W', 'GT': 'K', 'TG': 'K', 'AC': 'M', 'CA': 'M','CGT':'B','CTG':'B','GCT':'B','GTC':'B',
'TCG':'B','TGC':'B','AGT':'D','ATG':'D','GAT':'D','GTA':'D','TAG':'D','TGA':'D','ACT':'H','ATC':'H','CAT':'H',
'CTA':'H','TAC':'H','TCA':'H','ACG':'V','AGC':'V','CAG':'V','CGA':'V','GAC':'V','GCA':'V','ACGT':'N','ACTG':'N',
'AGCT':'N','AGTC':'N','ATCG':'N','ATGC':'N','CAGT':'N','CATG':'N','CGAT':'N','CGTA':'N','CTAG':'N','CTGA':'N',
'GACT':'N','GATC':'N','GCAT':'N','GCTA':'N','GTAC':'N','GTCA':'N','TACG':'N','TAGC':'N','TCAG':'N','TCGA':'N',
'TGAC':'N','TGCA':'N','A*':'N','C*':'N','G*':'N','T*':'N'}
def getIUPAC(x):
return IUPACdict[x]
def POINT2zero(x):
if x == '.':
return '0'
else:
return x
def POINTNA2zero(x):
if x == '.':
return '0'
if x == 'NA':
return '0'
else:
return x
def NA2zero(x):
if x == 'NA':
return int(0)
else:
return int(x)
def nCk(n, k):
return len(list(itertools.combinations(range(n), k)))
def evaluate_ref_alt_max(x,y):
if x[0]==0 and x[1]==0:
return './.'
if x[0]>0 and x[1]==0:
return str(y[0])+'/'+str(y[0])
if x[0]==0 and x[1]>0:
return str(y[1])+'/'+str(y[1])
if x[0]>0 and x[1]>0:
return str(y[0])+'/'+str(y[1])
def get_header_from_vcf( argsDICT ):
HEADER = []
with open(argsDICT['ivcf'],'rb') as f:
for flines in f:
if flines[:2]=='##':
HEADER.append(flines)
if flines[:4]=='#CHR':
HEADER.append('##INFO=<ID=NCFnumber,Number=1,Type=String,Description="NotCalledFraction number">\n')
HEADER.append('##INFO=<ID=NCFfraction,Number=1,Type=Float,Description="NotCalledFraction fraction">\n')
HEADER.append('##mvcf2consensus.py '+argsDICT['args'][0]+'\n')
HEADER.append(changeHEADER(flines, argsDICT['add'], argsDICT['id'], argsDICT['samples']))
if flines[0]!='#':
break
return HEADER
def get_header_from_gz( argsDICT ):
HEADER = []
with gzip.open(argsDICT['ivcf'],'rb') as f:
for flines in f:
if flines[:2]=='##':
HEADER.append(flines)
if flines[:4]=='#CHR':
HEADER.append('##INFO=<ID=NCFnumber,Number=1,Type=String,Description="NotCalledFraction number">\n')
HEADER.append('##INFO=<ID=NCFfraction,Number=1,Type=Float,Description="NotCalledFraction fraction">\n')
HEADER.append('##mvcf2consensus.py '+argsDICT['args'][0]+'\n')
HEADER.append(changeHEADER(flines, argsDICT['add'], argsDICT['id'], argsDICT['samples']))
if flines[0]!='#':
break
return HEADER
def get_sample_pos_from_header( argsDICT ):
SAMPLE_POS = []
if argsDICT['tabix']:
with gzip.open(argsDICT['ivcf'],'rb') as f:
for flines in f:
if flines[:4]=='#CHR':
flines_stripped = flines.strip().split('\t')
if flines[0]!='#':
break
if not argsDICT['tabix']:
with open(argsDICT['ivcf'],'rb') as f:
for flines in f:
if flines[:4]=='#CHR':
flines_stripped = flines.strip().split('\t')
if flines[0]!='#':
break
for s in argsDICT['samples']:
SAMPLE_POS.append([x for x,y in enumerate(flines_stripped) if y == s][0])
return SAMPLE_POS
def get_sample_pos_dict_from_header( argsDICT ):
sampleposDICT = {}
if argsDICT['tabix']:
with gzip.open(argsDICT['ivcf'],'rb') as f:
for flines in f:
if flines[:4]=='#CHR':
flines_stripped = flines.strip().split('\t')
if flines[0]!='#':
break
if not argsDICT['tabix']:
with open(argsDICT['ivcf'],'rb') as f:
for flines in f:
if flines[:4]=='#CHR':
flines_stripped = flines.strip().split('\t')
if flines[0]!='#':
break
for s in argsDICT['samples']:
sampleposDICT[s]=[x for x,y in enumerate(flines_stripped) if y == s][0]
return sampleposDICT
def changeHEADER(header, add, name, samples):
if add:
headersplit = header.strip().split('\tFORMAT')
return('\t'.join([headersplit[0]]+['FORMAT']+samples+[name])+'\n')
if not add:
headersplit = header.strip().split('\tFORMAT')
return('\t'.join([headersplit[0]]+['FORMAT']+[name])+'\n')
def get_missing_type(x):
if x == 'keep':
return ['keep', 0, 0]
if x == 'zero':
return ['zero', 0, 0]
if 'set' in x:
return ['set', int(x.split(':')[1]), int(x.split(':')[2])]
def get_reference_len( argsDICT ):
print 'start obtaining reference length'
length_dict = {}
for seq in SeqIO.parse( argsDICT['R'], 'fasta' ):
if argsDICT['verbose']:
print seq.id
if seq.id not in argsDICT['chr']:
continue
if seq.id in argsDICT['chr']:
length_dict[seq.id] = len( seq )
print 'finished obtaining reference length'
return length_dict
class vcfRecord:
def __init__(self, record, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLES, tabix):
if not tabix:
self.splitted = record.strip().split('\t')
if tabix:
self.splitted = record
self.CHROM = self.splitted[CHROM]
self.POS = self.splitted[POS]
self.ID = self.splitted[ID]
self.REF = [self.splitted[REF]]
self.ALT = self.splitted[ALT].split(',')
self.QUAL = self.splitted[QUAL]
self.FILTER = self.splitted[FILTER]
self.INFO = self.splitted[INFO]
self.FORMAT = self.splitted[FORMAT].split(':')
self.SAMPLES = collections.OrderedDict()
for s in SAMPLES:
self.SAMPLES[s] = collections.OrderedDict()
if len(self.splitted[s].split(':')) != len(self.FORMAT):
self.splitted[s]+=':'+':'.join(list(np.repeat("0",len(self.FORMAT)-len(self.splitted[s].split(':')))))
for p,f in enumerate(self.FORMAT):
self.SAMPLES[s][f] = self.splitted[s].split(':')[p].split(',')
def get_var_len(self):
"""" Return length of alleles """
var_len = [len(self.REF)]
var_len += [len(x) for x in self.ALT]
return var_len
def get_var_number(self):
""" Return total number of alleles """
return len([self.REF])+len(self.ALT)
def get_ref_alt(self):
""" Return alleles """
REF_ALT = self.REF+self.ALT
return REF_ALT
def is_filtered(self, argsDICT):
""" Return True if FILTER of record is in filterarg """
if len(argsDICT['filterarg'])!=1:
if self.FILTER not in argsDICT['filterarg']:
return True
if self.FILTER in argsDICT['filterarg']:
return False
if len(argsDICT['filterarg']) == 1 and argsDICT['filterarg'][0]!='':
if self.FILTER not in argsDICT['filterarg']:
return True
if self.FILTER in argsDICT['filterarg']:
return False
if len(argsDICT['filterarg']) == 1 and argsDICT['filterarg'][0] == '':
return False
def get_var_type(self):
""" Return variant type according to GATK """
var_type = 'NA'
var_len = self.get_var_len()
var_number = self.get_var_number()
if len(self.ALT) == 1 and self.ALT[0] == '.':
var_type = 'noVAR'
if len(self.ALT) == 1 and self.ALT[0] == '*':
var_type = 'complexVAR'
if var_number == 2:
if var_len[0] == 1 and var_len[1] == 1:
#equals GATK SNP
var_type = 'biSNP'
if var_len[0] > 1 and var_len[1] > 1 and var_len[0] == var_len[1]:
#equals GATK MNP
var_type = 'repSNP'
if var_len[0] > var_len[1]:
var_type = 'deletion'
if var_len[0] < var_len[1]:
var_type = 'insertion'
#equals GATK MULTI-ALLELIC
if var_number != 2:
if all( [x == var_len[0] for x in var_len[1::]] ) and all( [x == 1 for x in var_len] ):
#equals GATK SNP
var_type = 'muSNP'
#TODO 'murepSNP'
if all( [x < var_len[0] for x in var_len[1::]] ):
var_type = 'mudeletion'
if all( [x > var_len[0] for x in var_len[1::]] ):
var_type = 'muinsertion'
if any( [x < var_len[0] for x in var_len[1::]] ) and any( [x == var_len[0] for x in var_len[1::]] ):
var_type = 'complexDS'
if any( [x > var_len[0] for x in var_len[1::]] ) and any( [x == var_len[0] for x in var_len[1::]] ):
var_type = 'complexIS'
if any( [x < var_len[0] for x in var_len[1::]] ) and any( [x > var_len[0] for x in var_len[1::]] ):
var_type = 'complexDI'
if any( [x < var_len[0] for x in var_len[1::]] ) and any( [x > var_len[0] for x in var_len[1::]] ) and any( [x == var_len[0] for x in var_len[1::]] ):
var_type = 'complexDIS'
return var_type
def keep_samples(self, argsDICT):
""" Keep only samples in record which are listed """
for s in self.SAMPLES.copy():
if s not in argsDICT['samples_pos']:
self.SAMPLES.pop(s)
def get_info(self, value):
"""" Return INFO value """
return [x[1] for x in [x.split('=') for x in self.INFO.split(';')] if x[0] == value][0]
def add_info(self, value):
""" Add INFO value """
self.INFO+=str(value)
def add_format_on_samples(self, s, f, v):
""" Add new FORMAT field to sample """
self.SAMPLES[s][f] = v
def replace_format_on_samples(self, s, f, v):
""" Replace FORMAT field of sample """
self.SAMPLES[s][f] = v
def get_samples_ad(self):
""" Return samples AD as list """
return [self.SAMPLES[x]['AD'] for x in self.SAMPLES]
def get_samples_gt(self):
""" Return samples GT as list """
return [self.SAMPLES[x]['GT'] for x in self.SAMPLES]
def get_samples_gq(self):
""" Return samples GQ as list """
return [self.SAMPLES[x]['GQ'] for x in self.SAMPLES]
def get_samples_dp(self):
""" Return samples GQ as list """
return [self.SAMPLES[x]['DP'] for x in self.SAMPLES]
def get_samples_pl(self):
""" Return samples GQ as list """
return [self.SAMPLES[x]['PL'] for x in self.SAMPLES]
def get_samples_ad_len(self):
""" Return samples AD len as list """
return [len(self.SAMPLES[x]['AD']) for x in self.SAMPLES]
def get_samples_ad_ref(self):
""" Return samples AD REF as list """
return [self.SAMPLES[x]['AD'][0] for x in self.SAMPLES]
def get_samples_ad_alt(self):
""" Return samples AD ALT as list """
return [self.SAMPLES[x]['AD'][1:] for x in self.SAMPLES]
def gt_type(self, x):
if len(x)==1:
if x[0] == '.':
return 'NA'
if x[0] != '.':
return 'hap'
if x[0] == '.' and x[1] == '.':
return 'NA'
if x[0] != '.' and x[1] != '.' and x[0] == x[1]:
return 'hom'
if x[0] != '.' and x[1] != '.' and x[0] != x[1]:
return 'het'
def get_gt_type(self):
""" Return GT type """
gt_split = re.compile("[|/]")
GTs = [re.split(gt_split,x[0]) for x in self.get_samples_gt()]
return [self.gt_type(x) for x in GTs]
def get_gt_na_pos(self):
""" Return index of GT which is NA """
samplesgttype = self.get_gt_type()
samplesgtnapos = [x for x,y in enumerate(samplesgttype) if y == 'NA']
return samplesgtnapos
def fill_empty_ad(self):
""" Replaces "." with 0 and fills to var_number if less """
x_ = [x+list(np.repeat(0,self.get_var_number()-len(x))) for x in [[int(x.replace(".","0")) for x in y] for y in self.get_samples_ad()]]
for x,y in zip(x_, self.SAMPLES.keys()):
self.SAMPLES[y]['AD'] = x
def fill_empty_pl(self):
""" Replaces "." with [0,0,0] """
x_ = [x+list(np.repeat(0,nCk(self.get_var_number()+1,2)-len(x))) for x in [[int(x.replace(".","0")) for x in y] for y in self.get_samples_pl()]]
for x,y in zip(x_, self.SAMPLES.keys()):
self.SAMPLES[y]['PL'] = x
def set_samples_ad_by_gt(self, argsDICT):
if argsDICT['missingGT'][0] == 'keep':
return
if argsDICT['missingGT'][0] == 'zero':
samplesgtnapos = slef.get_gt_na_pos()
#set AD to zero for all samples with GT equals NA
for s in [x.sample for y,x in enumerate(self.samples) if y in samplesgtnapos]:
replace_format_on_samples(self, s, 'AD', list(np.repeat(0, get_var_number(self))))
if argsDICT['missingGT'][0] == 'set':
samplesgtnapos = self.get_gt_na_pos()
#set AD to zero for all samples with GT equals NA
for s in self.get_gt_na_pos():
self.replace_format_on_samples(self.SAMPLES.keys()[s], 'AD', [argsDICT['missingGT'][1]]+list(np.repeat(argsDICT['missingGT'][2], self.get_var_number()-1)))
def set_dp_by_ad(self):
samplesad = self.get_samples_ad()
samplesadsum = [np.sum([int(x) for x in x]) for x in samplesad]
#set DP to ADSUM
for i in range(0, len(self.SAMPLES)):
self.replace_format_on_samples(self.SAMPLES.keys()[i], 'DP', samplesadsum[i])
def set_samples_gt_by_ad(self, argsDICT):
if argsDICT['resetGT'] == 'keep':
return
if argsDICT['resetGT'] == 'AD':
#setGTbyAD
samplesad_ref = self.get_samples_ad_ref()
samplesad_alt = self.get_samples_ad_alt()
#NOTE np.argmax return the first element for equal values which would bias GT setting always to the first ALT entry if equal depth exist
#NOTE this is only used to determine NotCalledFraction, AD for other alleles will be considered
samplesad_alt_maxpos = [np.argmax(x) for x in samplesad_alt]
samplesad_alt_max = [np.max(x) for x in samplesad_alt]
samplesad_ref_alt_max = map(lambda a, b: [a,b], samplesad_ref, samplesad_alt_max)
samplesad_ref_alt_maxpos = map(lambda a, b: [a,b], list(np.repeat(0,len(samplesad_ref))), [x+1 for x in samplesad_alt_maxpos])
samplesgt = []
for a, b in zip(samplesad_ref_alt_max, samplesad_ref_alt_maxpos):
samplesgt.append(evaluate_ref_alt_max(a, b))
#get NotCalled positions
minDP_NCpos = [x for x,y in enumerate(self.get_samples_dp()) if y < argsDICT['minDP']]
maxDP_NCpos = [x for x,y in enumerate(self.get_samples_dp()) if y > argsDICT['maxDP']]
minGQ_NCpos = [x for x,y in enumerate(self.get_samples_gq()) if y < argsDICT['minGQ']]
#combine and unique NotCalled positions
NCpos = [x for x in set(minDP_NCpos+maxDP_NCpos+minGQ_NCpos)]
for x in NCpos:
samplesgt[x]='./.'
for i in range(0, len(self.SAMPLES)):
self.replace_format_on_samples(self.SAMPLES.keys()[i], 'GT', samplesgt[i])
def get_consensus_calldata(self, argsDICT):
""" Return consensus of existing record samples """
consensusdict = collections.OrderedDict()
for rf in self.FORMAT:
consensusdict[rf] = None
consensusdict['AD'] = self.merge_ad()
consensusdict['DP'] = np.sum(consensusdict['AD'])
consensusdict['GQ'] = self.merge_gq()
#evaluate CONSENSUSPL from SAMPLESPL AND SAMPLESGTNAPOS
consensusdict['PL'] = self.merge_pl()
consensusdict['GT'] = self.set_consensus_gt_by_ad(consensusdict)
return consensusdict
def set_consensus_gt_by_ad(self, consensusdict):
consensusad_ref = consensusdict['AD'][0]
consensusad_alt = consensusdict['AD'][1:]
#NOTE np.argmax return the first element for equal values which would bias GT setting always to the first ALT entry if equal depth exist
consensusad_alt_maxpos = np.argmax(consensusad_alt)
consensusad_alt_max = np.max(consensusad_alt)
consensusad_ref_alt_max = [consensusad_ref, consensusad_alt_max]
consensusad_ref_alt_maxpos = [0, consensusad_alt_maxpos+1]
consensusgt = evaluate_ref_alt_max(consensusad_ref_alt_max, consensusad_ref_alt_maxpos)
return consensusgt
def merge_ad(self):
ADOUT = []
for i in range(0, self.get_var_number()):
ADOUT.append(np.sum([int(y[i]) for x,y in enumerate(self.get_samples_ad()) if x not in self.get_gt_na_pos()]))
return ADOUT
def merge_gq(self):
return int(np.median([[0 if z == "." else int(z) for z in y] for x,y in enumerate(self.get_samples_gq()) if x not in self.get_gt_na_pos()]))
def merge_dp(self):
return np.sum([y for x,y in enumerate(self.get_samples_dp()) if x not in self.get_gt_na_pos()])
def merge_pl(self):
PLOUT = []
for i in range(0,nCk(self.get_var_number()+1,2)):
PLOUT.append(int(np.median([y[i] for x,y in enumerate(self.get_samples_pl()) if x not in self.get_gt_na_pos()])))
return PLOUT
def get_alleles_muSNP(self):
MAJOUT = list(np.repeat('N',len(self.SAMPLES)))
MINOUT = list(np.repeat('N',len(self.SAMPLES)))
IUPACOUT = list(np.repeat('N',len(self.SAMPLES)))
for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _muSNP(self, sample_, 'refmajorsample')
return [MAJOUT,MINOUT,IUPACOUT]
def get_alleles_biSNP(self):
MAJOUT = list(np.repeat('N',len(self.SAMPLES)))
MINOUT = list(np.repeat('N',len(self.SAMPLES)))
IUPACOUT = list(np.repeat('N',len(self.SAMPLES)))
for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _biSNP(self, sample_, 'refmajorsample')
return [MAJOUT,MINOUT,IUPACOUT]
def get_alleles_repSNP(self):
MAJOUT = list(np.repeat('N'*len(get_ref_alt(self)[0]),len(self.SAMPLES)))
MINOUT = list(np.repeat('N'*len(get_ref_alt(self)[0]),len(self.SAMPLES)))
IUPACOUT = list(np.repeat('N'*len(get_ref_alt(self)[0]),len(self.SAMPLES)))
for s_pos, sample_ in enumerate(self.SAMPLES.keys()):
MAJOUT[s_pos], MINOUT[s_pos], IUPACOUT[s_pos] = _repSNP(self, sample_, 'refmajorsample')
return [MAJOUT,MINOUT,IUPACOUT]
def _biSNP(self, sample_pos, type_):
if sample_pos not in self.SAMPLES.keys():
return
samplesad_ = self.SAMPLES[sample_pos]['AD']
ref_alt_ = self.get_ref_alt()
ref_ = ref_alt_[0]
alt_ = ref_alt_[1:]
ref_out = 'N'
alt_out = 'N'
iupac_out = 'N'
samplesad_ = [int(x) for x in samplesad_]
ref_alt_nonzero = [ref_alt_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
samplesad_nonzero = [samplesad_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
if len(samplesad_nonzero) == 0:
return [ref_out, alt_out, iupac_out]
admax = np.max([int(x) for x in samplesad_nonzero])
admin = np.min([int(x) for x in samplesad_nonzero])
admaxpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admax]
adminpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]!=admax]
if type_ == 'refaltN':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2:
return [ref_out, alt_out, iupac_out]
if type_ == 'refaltsample':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2:
ref_out = random.sample(ref_alt_nonzero,1)[0]
alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if type_ == 'refmajorsample':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
ref_out = ref_alt_nonzero[admaxpos[0]]
alt_out = ref_alt_nonzero[adminpos[0]]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
ref_out = random.sample(ref_alt_nonzero,1)[0]
alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if type_ == 'refminorsample':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
ref_out = ref_alt_nonzero[adminpos[0]]
alt_out = ref_alt_nonzero[admaxpos[0]]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
ref_out = random.sample(ref_alt_nonzero,1)[0]
alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if type_ == 'iupac':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2:
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
def _muSNP(self, sample_pos, type_):
if sample_pos not in self.SAMPLES.keys():
return
samplesad_ = self.SAMPLES[sample_pos]['AD']
ref_alt_ = self.get_ref_alt()
ref_ = ref_alt_[0]
alt_ = ref_alt_[1:]
ref_out = 'N'
alt_out = 'N'
iupac_out = 'N'
samplesad_ = [int(x) for x in samplesad_]
ref_alt_nonzero = [ref_alt_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
samplesad_nonzero = [samplesad_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
if len(samplesad_nonzero) == 0:
return [ref_out, alt_out, iupac_out]
admax = np.max([int(x) for x in samplesad_nonzero])
admin = np.min([int(x) for x in samplesad_nonzero])
admaxpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admax]
adminpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]!=admax]
if type_ == 'refaltN':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)>1:
return [ref_out, alt_out, iupac_out]
if type_ == 'refaltsample':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2:
ref_out = random.sample(ref_alt_nonzero,1)[0]
alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)>2:
ref_out = random.sample(ref_alt_nonzero,1)[0]
alt_out = random.sample([x for x in ref_alt_nonzero if x != ref_out],1)[0]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if type_ == 'refmajorsample':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
ref_out = ref_alt_nonzero[admaxpos[0]]
alt_out = ref_alt_nonzero[adminpos[0]]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
ref_out = random.sample(ref_alt_nonzero,1)[0]
alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) != 1:
max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
ref_out = random.sample(max_nonzero,1)[0]
alt_out = random.sample(min_nonzero,1)[0]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) == 1:
max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
ref_out = random.sample(max_nonzero,1)[0]
alt_out = random.sample([x for x in max_nonzero if x != ref_out],1)[0]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if type_ == 'refminorsample':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
ref_out = ref_alt_nonzero[adminpos[0]]
alt_out = ref_alt_nonzero[admaxpos[0]]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
ref_out = random.sample(ref_alt_nonzero,1)[0]
alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) != 1:
max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
ref_out = random.sample(max_nonzero,1)[0]
alt_out = random.sample(min_nonzero,1)[0]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)>2 and len(set(samplesad_nonzero)) == 1:
max_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in adminpos]
min_nonzero = [x for y,x in enumerate(ref_alt_nonzero) if y in admaxpos]
ref_out = random.sample(min_nonzero,1)[0]
alt_out = random.sample([x for x in min_nonzero if x != ref_out],1)[0]
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
if type_ == 'iupac':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)>1:
iupac_out = getIUPAC(''.join(ref_alt_nonzero))
return [ref_out, alt_out, iupac_out]
def _repSNP(self, sample_pos, type_):
if sample_pos not in self.SAMPLES.keys():
return
samplesad_ = self.SAMPLES[sample_pos]['AD']
ref_alt_ = self.get_ref_alt()
ref_ = ref_alt_[0]
alt_ = ref_alt_[1:]
ref_out = ''.join(list(repeat('N',len(ref_))))
alt_out = ''.join(list(repeat('N',len(ref_))))
iupac_out = ''.join(list(repeat('N',len(ref_))))
samplesad_ = [int(x) for x in samplesad_]
ref_alt_nonzero = [ref_alt_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
samplesad_nonzero = [samplesad_[x] for x,y in enumerate(samplesad_) if samplesad_[x]!=0]
if len(samplesad_nonzero) == 0:
return [ref_out, alt_out, iupac_out]
admax = np.max([int(x) for x in samplesad_nonzero])
admin = np.min([int(x) for x in samplesad_nonzero])
admaxpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admax]
adminpos = [x for x,y in enumerate(samplesad_nonzero) if samplesad_nonzero[x]==admin]
if type_ == 'refaltN':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2:
return [ref_out, alt_out, iupac_out]
if type_ == 'refaltsample':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2:
ref_out = random.sample(ref_alt_nonzero,1)[0]
alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
return [ref_out, alt_out, iupac_out]
if type_ == 'refmajorsample':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
ref_out = ref_alt_nonzero[admaxpos[0]]
alt_out = ref_alt_nonzero[adminpos[0]]
iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
ref_out = random.sample(ref_alt_nonzero,1)[0]
alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
return [ref_out, alt_out, iupac_out]
if type_ == 'refminorsample':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) != 1:
ref_out = ref_alt_nonzero[adminpos[0]]
alt_out = ref_alt_nonzero[admaxpos[0]]
iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2 and len(set(samplesad_nonzero)) == 1:
ref_out = random.sample(ref_alt_nonzero,1)[0]
alt_out = [x for x in ref_alt_nonzero if x != ref_out][0]
iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
return [ref_out, alt_out, iupac_out]
if type_ == 'iupac':
if len(ref_alt_nonzero)==1:
ref_out = ref_alt_nonzero[0]
alt_out = ref_alt_nonzero[0]
iupac_out = ref_alt_nonzero[0]
return [ref_out, alt_out, iupac_out]
if len(ref_alt_nonzero)==2:
iupac_out = ''.join([getIUPAC(ref_alt_nonzero[0][x]+ref_alt_nonzero[1][x]) for x,y in enumerate(ref_alt_nonzero[0])])
return [ref_out, alt_out, iupac_out]
def parse_record(self, argsDICT, polytypeDICT, filterDICT, tsvouthandle, vcfouthandle):
""" Parse each record and writes into outhandle if variant is kept after filtering """
REF_ALT = self.get_ref_alt()
POLYTYPE = self.get_var_type()
POLYNUMBER = self.get_var_number()
POLYLEN = self.get_var_len()
if POLYTYPE in polytypeDICT:
polytypeDICT[POLYTYPE]+=1
if POLYTYPE not in polytypeDICT:
polytypeDICT[POLYTYPE]=1
#0. REDUCE record TO SAMPLES and ADD mandatory FORMAT TO SAMPLES [GT:AD:DP]:GQ:PL
self.keep_samples(argsDICT)
if 'GQ' not in self.FORMAT:
for s in argsDICT['samples_pos']:
self.add_format_on_samples(s, 'GQ', 0)
self.FORMAT.append('GQ')
if 'PL' not in self.FORMAT:
for s in argsDICT['samples_pos']:
self.add_format_on_samples(s, 'PL', list(np.repeat(0,nCk(self.get_var_number()+1,2))))
self.FORMAT.append('PL')
#1. FILTER BY filterarg
if self.is_filtered(argsDICT):
if argsDICT['verbose']:
print self
filterDICT['filterarg']+=1
return
#2. FILTER BY minMQ
MQ = self.get_info('MQ')
if MQ == ".":
MQ = 0.0
if float(MQ) < argsDICT['minMQ']:
if argsDICT['verbose']:
print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
filterDICT['MQ']+=1
return
#3. GET AD
#fill empty AD
self.fill_empty_ad()
SAMPLESAD = self.get_samples_ad()
SAMPLESADLEN = self.get_samples_ad_len()
if not all([x==POLYNUMBER for x in SAMPLESADLEN]):
if argsDICT['verbose']:
print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
sys.exit('\nSAMPLES AD length differs from total POLYNUMBER')
#4. SET AD BY MISSING GT OPTION
SAMPLESGT = self.get_samples_gt()
SAMPLESGQ = self.get_samples_gq()
self.set_samples_ad_by_gt(argsDICT)
SAMPLESAD = self.get_samples_ad()
#set dp based on altered ad
self.set_dp_by_ad()
#5. KEEP OR RESET GT USING THE DP OPTION AND AD INFORMATION
SAMPLESDP = self.get_samples_dp()
self.set_samples_gt_by_ad(argsDICT)
#6. FILTER BY NotCalledFraction
SAMPLESGTTYPE = self.get_gt_type()
SAMPLESGTNAPOS = self.get_gt_na_pos()
NOTCALLED = len(SAMPLESGTNAPOS)
NOTCALLEDOUT = str(NOTCALLED)+'/'+str(len(self.SAMPLES))
self.add_info(';NCFnumber='+str(NOTCALLEDOUT))
NOTCALLEDFRACTION = float(NOTCALLED)/float(len(self.SAMPLES))
self.add_info(';NCFfraction='+str(NOTCALLEDFRACTION))
if NOTCALLEDFRACTION >= argsDICT['ncf']:
if argsDICT['verbose']:
print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
filterDICT['NCF']+=1
return
#7. GET CONSENSUS AND EVALUATE CDP OPTION
#fill empty PL
self.fill_empty_pl()
CONSENSUS_CALLDATA = self.get_consensus_calldata(argsDICT)
if CONSENSUS_CALLDATA['DP'] < argsDICT['cdp']:
if argsDICT['verbose']:
print [self.CHROM, self.POS, self.REF, self.ALT, self.INFO, self.FORMAT, self.SAMPLES]
filterDICT['CDP']+=1
return
if argsDICT['add']:
self.SAMPLES[argsDICT['id']] = CONSENSUS_CALLDATA
if not argsDICT['add']:
self.SAMPLES = collections.OrderedDict()
self.SAMPLES[argsDICT['id']] = CONSENSUS_CALLDATA
VCFLINE = '\t'.join([self.CHROM, self.POS, self.ID, self.REF[0], ','.join(self.ALT), self.QUAL, self.FILTER, self.INFO, ':'.join(self.FORMAT)])
SAMPLES_LINE = []
for s in self.SAMPLES.keys():
SAMPLE_LINE = []
for k in self.SAMPLES[s].keys():
if isinstance(self.SAMPLES[s][k], list):
SAMPLE_LINE.append(','.join([str(x) for x in self.SAMPLES[s][k]]))
elif self.SAMPLES[s][k] is None:
SAMPLE_LINE.append(".")
else:
SAMPLE_LINE.append(str(self.SAMPLES[s][k]))
SAMPLES_LINE.append(':'.join(SAMPLE_LINE))
vcfouthandle.write(VCFLINE+'\t'+'\t'.join(SAMPLES_LINE)+'\n')
CONSENSUSOUT = SAMPLES_LINE[-1]
if POLYTYPE == 'repSNP':
if argsDICT['add']:
CONSENSUSMAJ, CONSENSUSMIN, CONSENSUSIUPAC = self.get_alleles_repSNP()
CONSENSUSMAJ = CONSENSUSMAJ[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
CONSENSUSMIN = CONSENSUSMIN[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
CONSENSUSIUPAC = CONSENSUSIUPAC[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
if not argsDICT['add']:
CONSENSUSMAJ, CONSENSUSMIN, CONSENSUSIUPAC = self.get_alleles_repSNP()
CONSENSUSMAJ = CONSENSUSMAJ[0]
CONSENSUSMIN = CONSENSUSMIN[0]
CONSENSUSIUPAC = CONSENSUSIUPAC[0]
if POLYTYPE == 'biSNP':
if argsDICT['add']:
CONSENSUSMAJ, CONSENSUSMIN, CONSENSUSIUPAC = self.get_alleles_biSNP()
CONSENSUSMAJ = CONSENSUSMAJ[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
CONSENSUSMIN = CONSENSUSMIN[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
CONSENSUSIUPAC = CONSENSUSIUPAC[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
if not argsDICT['add']:
CONSENSUSMAJ, CONSENSUSMIN, CONSENSUSIUPAC = self.get_alleles_biSNP()
CONSENSUSMAJ = CONSENSUSMAJ[0]
CONSENSUSMIN = CONSENSUSMIN[0]
CONSENSUSIUPAC = CONSENSUSIUPAC[0]
if POLYTYPE == 'muSNP':
if argsDICT['add']:
CONSENSUSMAJ, CONSENSUSMIN, CONSENSUSIUPAC = self.get_alleles_muSNP()
CONSENSUSMAJ = CONSENSUSMAJ[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
CONSENSUSMIN = CONSENSUSMIN[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
CONSENSUSIUPAC = CONSENSUSIUPAC[[x for x,y in enumerate(self.SAMPLES.keys()) if y == argsDICT['id']][0]]
if not argsDICT['add']:
CONSENSUSMAJ, CONSENSUSMIN, CONSENSUSIUPAC = self.get_alleles_muSNP()
CONSENSUSMAJ = CONSENSUSMAJ[0]
CONSENSUSMIN = CONSENSUSMIN[0]
CONSENSUSIUPAC = CONSENSUSIUPAC[0]
if POLYTYPE != 'repSNP' and POLYTYPE != 'biSNP' and POLYTYPE != 'muSNP':
print POLYTYPE
sys.exit('\nPOLYTYPE not considered for CONSENSUS\nPlease reduce to SNP and MNP')
#write to tsv
TSVLINE = '\t'.join([self.CHROM,str(self.POS),NOTCALLEDOUT,CONSENSUSOUT,CONSENSUSMAJ,CONSENSUSMIN,CONSENSUSIUPAC])+'\n'
tsvouthandle.write(TSVLINE)
filterDICT['passed']+=1
def main():
""" main """
parser = argparse.ArgumentParser(usage='%(prog)s [options]',
description='Given a multiple sample vcf file reduced to SNP and MNP and the reference fasta file, ' \
'this script merges the samples into a consensus. ' \
'Major, minor and IUPAC code of the consensus will be written to a tab separated file.'
'\nTo extract a subpopulations from e.g. GATK multiple vcf file use the following command from GATK: ' \
'java -jar GenomeAnalysisTK.jar -T SelectVariants -sn $SAMPLE1 -sn $SAMPLE2 -V $MULTISAMPLEVCF -R $REFERENCE ' \
'[-trimAlternates : removeUnusedAlternates] ' \
'[-env : excludeNonVariants] ' \
'[-ef : excludeFiltered] ' \
'-selectType SNP -selectType MNP -o $OUTPUT ' \
'\nor vcftools: ' \
'vcftools --vcf $MULTISAMPLEVCF [--gvcf] --out $OUTPUT --recode --indv $SAMPLE1 --indc $SAMPLE2 ' \
'[--minGQ : filter for genotype quality] ' \
'[--minDP : filter for depth] ' \
'--remove-indels ')
parser.add_argument('-v', '--verbose', help='increase output verbosity', action='store_true')
#parser.add_argument('-tsv', help='specify if tsv should be written [default: False]', action='store_true')
parser.add_argument('-tabix', help='specify if input is tabix file. ' \
'NOTE: if set to true reference file needs to be specified to get length of each entry [default: False]'
, action='store_true')
parser.add_argument('-R', help='specify reference fasta file. Only needed if [tabix] option')
parser.add_argument('-ivcf', help='specify input vcf file')
parser.add_argument('-o', help='specify output prefix')
parser.add_argument('-H', help='specify if HEADER of vcf file should be ommitted [default: False]', action='store_true')
parser.add_argument('-chr', help='secify chromosomes to be parsed as a comma-seperated list')
parser.add_argument('-filterarg', help='specify FILTER arguments to keep variants, all variants without these will be discarded. ' \
'use a comma-seperated list e.g. PASS,VQSRTTranchesSNP90. ' \
'By default all variants are retained [default: ""]', default='')
parser.add_argument('-samples', help='specify sample names to be merged as a comma-seperated list, they need to exactly match sample ids from vcf input')
parser.add_argument('-add', help='specify if CONSENSUS should be added as a new column to existing vcf file [default: False]', action='store_true')
parser.add_argument('-id', help='specify CONSENSUS name to be added to HEADER')
parser.add_argument('-minMQ', help='specify mapping quality threshold necessary to exceeded to retain variant [default: "0.0"]', type=float, default=0.0)
parser.add_argument('-missingGT', help='specify how missing GT data (based on e.g. calling SNPs with GATK on multiple samples simultaneously; GT: ./.) ' \
'should be handled, either keep AD coverage [default: "keep"], set AD to 0 for all samples which have GT: ./. for all alleles ["zero"], ' \
'set REF allele to certain value and ALT alleles to certain value for all samples that have GT: ./. ["set:value:value"] e.g. ref:10:10 ' \
'NOTE: there might be cases where the VariantCaller has defined GT: ./. however AD still shows counts AD: 3,0 ' \
'here one might not want to keep this variant as NotCalled (GT: ./.) and use the AD information to reset GT based on AD', default='keep')
parser.add_argument('-resetGT', help='specify how GT should be reset, either reset using AD information [default: "AD"] or keep as defined by VariantCaller ["keep"] ' \
'NOTE: the DP value needs to be exceeded otherwise the sample will be set to ./. ' \
'NOTE: this will influence the calculation of the NotCalledFraction', default='AD', choices=['AD','keep'])
parser.add_argument('-minDP', help='specify minimal sample depth threshold to set position to be NotCalled during GT reset [default: "0"]', type=int, default=0)
parser.add_argument('-maxDP', help='specify maximal sample depth threshold to set position to be NotCalled during GT reset [default: "9999999"]', type=int, default=9999999)
parser.add_argument('-minGQ', help='specify minimal genotype quality threshold to set position to Called during GT reset [default: "0"]', type=int, default=0)
parser.add_argument('-ncf', help='specify NotCalledFraction to exclude SNP/MNP [default: "1.0"] for consensus calculation', type=float, default=1.0)
parser.add_argument('-cdp', help='specify consensus depth threshold necessary to exceeded to retain SNP/MNP [default: "5"]' \
'NOTE: only Called samples will be considered after GT reset', type=int, default=5)
parser.add_argument('-po', help='specify if progress should not be printed based on variant number processed [default: False]', action='store_true')
parser.add_argument('-ps', help='specify progress steps [default: "1e5"]', default='1e5')
parser.add_argument('-CHROM', help='specify CHROM column [default: 0]', type=int, default=0)
parser.add_argument('-POS', help='specify POS column [default: 1]', type=int, default=1)
parser.add_argument('-ID', help='specify ID column [default: 2]', type=int, default=2)
parser.add_argument('-REF', help='specify REF column [default: 3]', type=int, default=3)
parser.add_argument('-ALT', help='specify ALT column [default: 4]', type=int, default=4)
parser.add_argument('-QUAL', help='specify ALT column [default: 5]', type=int, default=5)
parser.add_argument('-FILTER', help='specify FILTER column [default: 6]', type=int, default=6)
parser.add_argument('-INFO', help='specify INFO column [default: 7]', type=int, default=7)
parser.add_argument('-FORMAT', help='specify FORMAT column [default: 8]', type=int, default=8)
args = parser.parse_args()
#IVCF
if args.ivcf is None:
parser.print_help()
sys.exit('\nPlease specify input vcf file')
args.ivcf = os.path.abspath(args.ivcf)
#TABIX
if args.tabix:
if args.R is None:
parser.print_help()
sys.exit('\nPlease specify reference fasta file, since [tabix: "true"]')
args.R = os.path.abspath(args.R)
#OUTPUT
if args.o is None:
parser.print_help()
sys.exit('\nPlease specify output path')
args.o = os.path.abspath(args.o)
#CHROMOSOMES
if args.chr is None:
sys.exit('\nPlease specify chromosomes to be parsed as a comma-seperated list')
args.chr = args.chr.split(',')
#SAMPLES
if args.samples is None:
parser.print_help()
sys.exit('\nPlease specify sample names to be parsed as comma-seperated list\ngrep "#CHROM" vcf')
args.samples = args.samples.split(',')
#SAMPLES ID
if args.id is None:
parser.print_help()
sys.exit('\nPlease specify CONSENSUS name')
#FILTER
args.filterarg = args.filterarg.split(',')
#MQ OPTION
args.missingGT = get_missing_type(args.missingGT)
args.ps = int(float(args.ps))
print(args)
#GET ARGS DICT
argsDICT = vars(args)
argsDICT['args'] = ['"'+str(args).replace("'","")+'"']
#STATIC DICT
polytypeDICT = {}
filterDICT = {'passed':0, 'filterarg':0, 'MQ':0, 'NCF':0, 'CDP':0}
if argsDICT['tabix']:
with open(argsDICT['o']+'.tsv','w') as tsvouthandle:
with open(argsDICT['o']+'.vcf','w') as vcfouthandle:
reference_length = get_reference_len(argsDICT)
header = get_header_from_gz(argsDICT)
argsDICT['samples_pos'] = get_sample_pos_from_header(argsDICT)
tb = tabix.open(argsDICT['ivcf'])
counter = 0
if not argsDICT['H']:
for h in header:
vcfouthandle.write(h)
for tmp_chr in argsDICT['chr']:
tmp_chr_length = reference_length[tmp_chr]
records = tb.query(tmp_chr,1,tmp_chr_length)
for lines in records:
record = vcfRecord(lines, argsDICT['CHROM'], argsDICT['POS'], argsDICT['ID'], argsDICT['REF'], argsDICT['ALT'], argsDICT['QUAL'], argsDICT['FILTER'], argsDICT['INFO'], argsDICT['FORMAT'], argsDICT['samples_pos'], argsDICT['tabix'])
parse_record(record, argsDICT, polytypeDICT, filterDICT, tsvouthandle, vcfouthandle)
if not argsDICT['po']:
counter += 1
if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) != 0:
sys.stderr.write('.')
if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) == 0:
sys.stderr.write('o')
if not argsDICT['tabix']:
with open(argsDICT['o']+'.tsv','w') as tsvouthandle:
with open(argsDICT['o']+'.vcf','w') as vcfouthandle:
header = get_header_from_vcf(argsDICT)
argsDICT['samples_pos'] = get_sample_pos_from_header(argsDICT)
counter = 0
if not argsDICT['H']:
for h in header:
vcfouthandle.write(h)
with open(argsDICT['ivcf'],'rU') as vcfhandle:
for lines in vcfhandle:
if lines[0]=='#':
continue
record = vcfRecord(lines, argsDICT['CHROM'], argsDICT['POS'], argsDICT['ID'], argsDICT['REF'], argsDICT['ALT'], argsDICT['QUAL'], argsDICT['FILTER'], argsDICT['INFO'], argsDICT['FORMAT'], argsDICT['samples_pos'], argsDICT['tabix'])
#continue if not in chromosome_ids
if record.CHROM not in argsDICT['chr']:
continue
#else
parse_record(record, argsDICT, polytypeDICT, filterDICT, tsvouthandle, vcfouthandle)
if not argsDICT['po']:
counter += 1
if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) != 0:
sys.stderr.write('.')
if counter%argsDICT['ps'] == 0 and counter%(argsDICT['ps']*5) == 0:
sys.stderr.write('o')
sys.stderr.write('\n')
print polytypeDICT
print filterDICT
if __name__ == '__main__':
main()
#TESTING
#args.ivcf='test.vcf'
#args.o='test.out'
#args.chr='chr22'
#args.id='test'
#args.samples='BLANK,NA12878,NA12892,NA19238'
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment