added simdiv.py

f26825fe · Kristian Ullrich · 5b3dd851 · f26825fe
Commit f26825fe authored 8 years ago by Kristian Ullrich
--- a/scripts/simdiv.py
+++ b/scripts/simdiv.py
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+
+'''
+Author: Krisian Ullrich
+date: Dezember 2016
+email: ullrich@evolbio.mpg.de
+License: MIT
+
+The MIT License (MIT)
+
+Copyright (c) 2016 Kristian Ullrich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+'''
+
+
+import sys
+import argparse
+import numpy as np
+import random as rnd
+from Bio import Seq
+from Bio import SeqIO
+from Bio import SeqRecord
+
+
+def duplicate_sequence(rec):
+    duplicated_seq = SeqRecord.SeqRecord(rec.seq.tomutable())
+    duplicated_seq.id = rec.id
+    duplicated_seq.name = rec.name
+    duplicated_seq.description = rec.description
+    return duplicated_seq
+
+
+def mutate_pos(nuc):
+    if nuc in '.-NRYSWKMBDHV':
+        return [nuc, 0]
+    if nuc not in '.-NRYSWKMBDHV':
+        return [rnd.sample([x for x in ['A','T','C','G'] if x != nuc],1)[0], 1]
+
+
+def change_random_pos(fasta_dict_, div_, chr_, po_, ps_):
+    if chr_ == 'true':
+        sample_len = []
+        all_changed_pos = []
+        changed_sites = 0
+        for k in fasta_dict_.keys():
+            k_changed_sites = 0
+            k_changed_pos = {}
+            k_sample_len = int(np.ceil(len(fasta_dict_[k])*div_))
+            print('%s Sample len: %s' % (k,str(k_sample_len)))
+            sample_len.append(k_sample_len)
+            i = 0
+            while i != k_sample_len and k_changed_sites != k_sample_len and len(k_changed_pos) != len(fasta_dict_[k]):
+                change_pos = rnd.randint(0,len(fasta_dict_[k])-1)
+                if k+'_'+str(change_pos) in k_changed_pos:
+                    continue
+                if k+'_'+str(change_pos) not in k_changed_pos:
+                    k_changed_pos[k+'_'+str(change_pos)] = 0
+                    fasta_dict_[k].seq[change_pos], j = mutate_pos(fasta_dict_[k][change_pos])
+                    k_changed_sites += j
+                    i += j
+                if po_ == 'true' and j != 0:
+                    if k_changed_sites%ps_ == 0 and k_changed_sites%(ps_*5) != 0:
+                        sys.stderr.write('.')
+                    if k_changed_sites%ps_ == 0 and k_changed_sites%(ps_*5) == 0:
+                        sys.stderr.write('o')
+            all_changed_pos.append(k_changed_pos.keys())
+            changed_sites += k_changed_sites
+        return [sample_len, changed_sites, all_changed_pos]
+    if chr_ == 'false':
+        changed_sites = 0
+        all_changed_pos = []
+        k_changed_pos = {}
+        total_len = np.sum([len(fasta_dict_[x]) for x in fasta_dict_])
+        sample_len = int(np.ceil(total_len*div_))
+        print('Sample len: %s' % (str(sample_len)))
+        i = 0
+        while i != sample_len and changed_sites != sample_len and len(k_changed_pos) != total_len:
+            k = rnd.sample(fasta_dict_.keys(),1)[0]
+            change_pos = rnd.randint(0,len(fasta_dict_[k])-1)
+            if k+'_'+str(change_pos) in k_changed_pos:
+                continue
+            if k+'_'+str(change_pos) not in k_changed_pos:
+                k_changed_pos[k+'_'+str(change_pos)] = 0
+                fasta_dict_[k].seq[change_pos], j = mutate_pos(fasta_dict_[k][change_pos])
+                changed_sites += j
+                i += j
+            if po_ == 'true' and j != 0:
+                if changed_sites%ps_ == 0 and changed_sites%(ps_*5) != 0:
+                    sys.stderr.write('.')
+                if changed_sites%ps_ == 0 and changed_sites%(ps_*5) == 0:
+                    sys.stderr.write('o')
+        all_changed_pos.append(k_changed_pos.keys())
+        return [sample_len, changed_sites, all_changed_pos]
+
+
+def main():
+    parser = parser = argparse.ArgumentParser(prog='simdiv', usage='%(prog)s [options]',
+                      description = 'Takes a reference fasta file as input and produces divergent fasta file given a divergence procent cutoff.')
+    parser.add_argument('-i', help = 'input fasta file')
+    parser.add_argument('-o', help = 'output fasta file')
+    parser.add_argument('-d', help = 'specify divergence procent [default: 0.01]', type = float, default = 0.01)
+    parser.add_argument('-chr', help = 'specify if each chromosome should be sampled indivdually [default: false]', choices = ['true','false'], default = 'false')
+    parser.add_argument('-po', help='specify if progress should be printed based on changed sites processed [default: true]', choices = ['true','false'], default = 'true')
+    parser.add_argument('-ps', help='specify progress steps [default: 1e5]', default='1e5')
+    parser.add_argument('-v', '--verbose', help='increase output verbosity', action='store_true')
+    args = parser.parse_args()
+    if args.i is None:
+        parser.print_help()
+        sys.exit('\n#####\nExit program: Please specify fasta input file.')
+    original_fasta = SeqIO.parse(args.i, "fasta")
+    if args.o is None:
+        parser.print_help()
+        sys.exit('\n#####\nExit program: Please specify fasta output file.')
+    outfile = args.o
+    option_div = args.d
+    option_chr = args.chr
+    option_po = args.po
+    option_ps = int(float(args.ps))
+    print("command arguments used:")
+    print(args)
+    fasta_dict = {}
+    fasta_len = []
+    print('Start parsing reference fasta files')
+    for record in original_fasta:
+        fasta_dict[record.id] = duplicate_sequence(record)
+        fasta_len.append(len(record))
+    print('Finished parsing reference fasta files')
+    print('Start sampling')
+    sample_len, changed_sites, all_changed_pos = change_random_pos(fasta_dict, option_div, option_chr, option_po, option_ps)
+    print('Finished sampling')
+    print('Sampled sites: %s' % (str(sample_len)))
+    print('Changed sites: %s' % (str(changed_sites)))
+    if args.verbose:
+        print('All changed pos: %s' % (str(all_changed_pos)))
+    print('Start writing fasta output')
+    SeqIO.write(fasta_dict.values(), outfile, "fasta")
+    print('Finshed writing fasta output')
+
+
+if __name__ == '__main__':
+    main()
+