Newer
Older
#author: Kristian K Ullrich
#date: July 2017
#email: ullrich@evolbio.mpg.de
#chr1 - IRA_AFG
#[X]: Mmd_FRA
#[Y]: Mmd_GER
#[Z]: Mmd_IRA
#[O]: Mmm_AFG
popX <- "Mmd_FRA"
popY <- "Mmd_GER"
popZ <- "Mmd_IRA"
popO <- "Mmm_AFG"
TMP_DIR <- "/tmp"
popX.pos <- 2
popY.pos <- 3
popZ.pos <- 4
popO.pos <- 5
SEQ_FILE <- "http://wwwuser.gwdg.de/~evolbio/evolgen/wildmouse/introgression/mpileup_pop_mv/fasta/consensus/CAS_FRA_GER_IRA_AFG_SPRE.mpileup.q0Q10.chr1.bcfcall.mv.remIndels.recode.refmajorsample.ref.consensus.fasta"
chr <- "chr1"
OUT_FILE <- paste0(TMP_DIR,"/",popX,"_",popY,".",popZ,".",popO,".",chr,".tsv")
WSIZE <- 25000
WJUMP <- 25000
DISTMODEL <- "K80"
#####
library(Biostrings)
library(ape)
source("https://gitlab.gwdg.de/evolgen/tree/master/scripts/sliding_window_steps_generator.r")
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
options(scipen=22)
calc_delta_consensus_four<-function(seq,popX.pos,popY.pos,popZ.pos,popO.pos,TMP,distMODEL="K80"){
MISSING<-NA
TREELENGTH<-NA
TREE<-NA
TREETOPOLOGY<-NA
dK80.XZO<-NA
dK80.YZO<-NA
OUT<-list(MISSING,dK80.XZO,dK80.YZO,TREELENGTH,TREE,TREETOPOLOGY)
names(OUT)<-c("MISSING","dK80.XZO","dK80.YZO","TREELENGTH","TREE","TREETOPOLOGY")
tmp.seq<-seq[c(popX.pos,popY.pos,popZ.pos,popO.pos)]
names(tmp.seq)<-c("X","Y","Z","O")
tmp.seq.XX<-gregexpr("X",tmp.seq[1])
tmp.seq.XY<-gregexpr("X",tmp.seq[2])
tmp.seq.XZ<-gregexpr("X",tmp.seq[3])
tmp.seq.XO<-gregexpr("X",tmp.seq[4])
tmp.seq.NX<-gregexpr("N",tmp.seq[1])
tmp.seq.NY<-gregexpr("N",tmp.seq[2])
tmp.seq.NZ<-gregexpr("N",tmp.seq[3])
tmp.seq.NO<-gregexpr("N",tmp.seq[4])
remove.pos<-unique(c(tmp.seq.XX[[1]],tmp.seq.XY[[1]],tmp.seq.XZ[[1]],tmp.seq.XO[[1]],tmp.seq.NX[[1]],tmp.seq.NY[[1]],tmp.seq.NZ[[1]],tmp.seq.NO[[1]]))
remove.pos<-remove.pos[remove.pos!="-1"]
if(length(remove.pos)!=0){
tmp.seq.rem<-BStringSet(apply(as.matrix(tmp.seq)[,-remove.pos,drop=FALSE],1,function(x) paste(x,collapse="")))
}
if(length(remove.pos)==0){
tmp.seq.rem<-tmp.seq
}
OUT$MISSING<-length(remove.pos)
writeXStringSet(tmp.seq.rem,file=paste0(TMP,"/tmp.seq.rem.fasta"))
tmp.dna<-read.dna(paste0(TMP,"/tmp.seq.rem.fasta"),format="fasta")
if(length(tmp.dna)==0){
return(OUT)
}
if(all(is.na(base.freq(tmp.dna)))){
return(OUT)
}
tmp.dist.dna.MODEL<-dist.dna(tmp.dna,as.matrix=T,model=distMODEL)
if(any(is.na(tmp.dist.dna.MODEL))){
return(OUT)
}
dXY<-tmp.dist.dna.MODEL[1,2]
dXZ<-tmp.dist.dna.MODEL[1,3]
dXO<-tmp.dist.dna.MODEL[1,4]
dYZ<-tmp.dist.dna.MODEL[2,3]
dYO<-tmp.dist.dna.MODEL[2,4]
dZO<-tmp.dist.dna.MODEL[3,4]
dK80.XZO<-dXO-dXZ
dK80.YZO<-dYO-dYZ
OUT$dK80.XZO<-dK80.XZO
OUT$dK80.YZO<-dK80.YZO
tmp.dna.nj<-midpoint(nj(tmp.dist.dna.MODEL))
OUT$TREELENGTH<-sum(tmp.dna.nj$edge.length)
OUT$TREE<-write.tree(tmp.dna.nj)
tmp.dna.nj$edge.length<-NULL
OUT$TREETOPOLOGY<-write.tree(tmp.dna.nj)
return(OUT)
}
#####MAIN#####
sink(OUT_FILE)
cat("#[X]: ")
cat(popX)
cat("\n")
cat("#[Y]: ")
cat(popY)
cat("\n")
cat("#[Z]: ")
cat(popZ)
cat("\n")
cat("#[O]: ")
cat(popO)
cat("\n")
cat("CHR\tSTART\tEND\t")
cat("MISSING\tdK80.XZO\tdK80.YZO\tTREELENGTH\tTREE\tTREETOPOLOGY\n")
tmp.chr.seq<-readBStringSet(SEQ_FILE)
tmp.sw<-sliding_window_steps_generator(window=WSIZE,jump=WJUMP,start.by=1,end.by=unique(width(tmp.chr.seq)))
pb<-txtProgressBar(min=1,max=dim(tmp.sw)[2],style=3)
for(i in 1:dim(tmp.sw)[2]){
tmp.out<-subseq(tmp.chr.seq,tmp.sw[1,i],tmp.sw[2,i])
fourout<-calc_delta_consensus_four(tmp.out,popX.pos,popY.pos,popZ.pos,popO.pos,TMP_DIR,DISTMODEL)
cat(chr)
cat("\t")
cat(tmp.sw[1,i])
cat("\t")
cat(tmp.sw[2,i])
cat("\t")
cat(unlist(fourout),sep="\t")
cat("\n")
}
close(pb)
sink(NULL)
#####
quit()