-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDCSsim.py
More file actions
1647 lines (1319 loc) · 78 KB
/
DCSsim.py
File metadata and controls
1647 lines (1319 loc) · 78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# coding=utf8
"""
%prog <FASTA> <BED> <INT> [options]
Differential peaks (DPs) are simulated on the provided <FASTA> sequence; the specified <BED> file either restricts simulation to its regions (if <--is_white_list> is set) or excludes them as blacklisted.
<INT> defines the number of simulated replicates of the two samples.
@author: Thomas Eder
"""
from __future__ import print_function
from __future__ import division
import sys
import os
import time
import datetime
import HTSeq
import numpy as np
import scipy as sp
import multiprocessing as mp
import matplotlib.pyplot as plt
from builtins import range
from future.utils import lrange
from matplotlib.backends.backend_pdf import PdfPages
from optparse import OptionParser
from pybedtools import BedTool, Interval
from random import random, randint
from scipy.stats import beta, laplace
from numpy.random import random_sample, rand, multivariate_normal, choice, gamma, dirichlet
from multiprocessing.managers import BaseManager
MAX_TEST = 10000 #number of maximal test iterations, can influence runtime
#classes
class HelpfulOptionParser(OptionParser):
    """An OptionParser that prints full help on errors."""

    def error(self, msg):
        """Print the complete help text to stderr, then exit with status 2."""
        self.print_help(sys.stderr)
        error_text = "\n%s: error: %s\n" % (self.get_prog_name(), msg)
        self.exit(2, error_text)
class Timer:
    """Simple wall-clock timer tracking elapsed time since creation or restart."""

    def __init__(self):
        """Start the timer at construction time."""
        self.start = time.time()

    def restart(self):
        """Reset the reference point to now."""
        self.start = time.time()

    def get_time_hhmmss(self):
        """Return the elapsed time formatted as HH:MM:SS."""
        elapsed = time.time() - self.start
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        return "%02d:%02d:%02d" % (hours, minutes, seconds)
class Parameters:
    """Container bundling every simulation parameter that is handed to worker processes."""

    # attribute names, in constructor (and get_all) order
    _FIELDS = ("sequence", "valid_regions", "bins", "n_reps_sample1", "n_reps_sample2",
               "prot_count_n", "prot_count_p", "protein_size", "frag_len_max", "frag_dist_on",
               "frag_dist_mn_mean", "frag_dist_mn_cov", "frag_dist_prob", "prot_dist_mn_mean",
               "prot_dist_mn_cov", "frag_count_sh", "frag_count_sc", "frag_count_op",
               "frag_count_om", "frag_count_os", "frag_count_scaling", "frag_count_lp_sc",
               "frag_count_ex_lo", "frag_count_ex_sc", "beta_values", "frag_len_mean",
               "frag_len_dev", "skewness", "valid_regions_array")

    def __init__(self, sequence, valid_regions, bins, n_reps_sample1, n_reps_sample2, prot_count_n, prot_count_p, protein_size,
                 frag_len_max, frag_dist_on, frag_dist_mn_mean, frag_dist_mn_cov, frag_dist_prob, prot_dist_mn_mean, prot_dist_mn_cov,
                 frag_count_sh, frag_count_sc, frag_count_op, frag_count_om, frag_count_os, frag_count_scaling, frag_count_lp_sc,
                 frag_count_ex_lo, frag_count_ex_sc, beta_values, frag_len_mean, frag_len_dev, skewness, valid_regions_array):
        """Store every argument as an identically named attribute."""
        passed = locals()
        for field in self._FIELDS:
            setattr(self, field, passed[field])

    def get_all(self):
        """Return all stored parameters as a tuple, in constructor order."""
        return tuple(getattr(self, field) for field in self._FIELDS)
class Report_data:
    """Accumulates per-domain statistics used for the final simulation report.

    Counts add up and lists grow via _extend() as worker chunks are merged in.
    """

    def __init__(self, n_reps_sample1, n_reps_sample2):
        """Create empty report containers for the given replicate counts."""
        self.r_prot_pos = []               # protein positions
        self.r_dom_nprot = []              # number of proteins per domain
        self.r_prot_offset_positions = []  # protein offsets from domain start
        self.r_nfrag = []                  # fragment counts per protein
        self.r_n_frag1 = []                # fragment counts, sample 1
        self.r_n_frag2 = []                # fragment counts, sample 2
        self.r_fragshift = []              # fragment shifts inside proteins
        self.r_fraglen = []                # fragment lengths
        # one sub-list per replicate
        self.r_n_frag1_reps = [[] for _ in range(n_reps_sample1)]
        self.r_n_frag2_reps = [[] for _ in range(n_reps_sample2)]
        self.db_peaks_sample1 = 0          # differentially bound peaks, sample 1
        self.db_peaks_sample2 = 0          # differentially bound peaks, sample 2
        self.n_domains = 0                 # total number of domains reported

    def _extend(self, n_domains, r_prot_pos, r_dom_nprot, r_prot_offset_positions, r_nfrag, r_n_frag1, r_n_frag2, r_fragshift,
                r_fraglen, r_n_frag1_reps, r_n_frag2_reps, db_peaks_sample1, db_peaks_sample2):
        """Merge another report chunk into this one (counts add, lists extend)."""
        self.n_domains += n_domains
        self.r_prot_pos.extend(r_prot_pos)
        self.r_dom_nprot.extend(r_dom_nprot)
        self.r_prot_offset_positions.extend(r_prot_offset_positions)
        self.r_nfrag.extend(r_nfrag)
        self.r_n_frag1.extend(r_n_frag1)
        self.r_n_frag2.extend(r_n_frag2)
        self.r_fragshift.extend(r_fragshift)
        self.r_fraglen.extend(r_fraglen)
        # pair up replicate lists instead of tracking a manual index
        for own_rep, new_rep in zip(self.r_n_frag1_reps, r_n_frag1_reps):
            own_rep.extend(new_rep)
        for own_rep, new_rep in zip(self.r_n_frag2_reps, r_n_frag2_reps):
            own_rep.extend(new_rep)
        self.db_peaks_sample1 += db_peaks_sample1
        self.db_peaks_sample2 += db_peaks_sample2
class Counter(object):
    """Process-safe counter used to hand out domain numbers up to a maximum."""

    def __init__(self, initval=0, maxval=50):
        """Create a shared integer starting at initval, capped at maxval."""
        self.val = mp.Value('i', initval)
        self.lock = mp.Lock()
        self.maxval = maxval

    def increment(self):
        """Atomically increment and return the new value, or -1 once maxval is reached."""
        with self.lock:
            if self.val.value < self.maxval:
                self.val.value += 1
                return self.val.value
            return -1

    def get_value(self):
        """Return the current counter value under the lock."""
        with self.lock:
            return self.val.value

    def is_max(self):
        """Return True once the counter has reached maxval."""
        with self.lock:
            return self.val.value >= self.maxval
class Chromosome(object):
    """The chromosome we simulate differential binding peaks on.

    Attributes:
        sequence: str of the genomic sequence
        name: str of the chromosome name
        length: integer of sequence length
        domain_list: list of domains
        valid_regions: regions which are not blacklisted
        bins: accumulated probabilities of the valid_regions based on their length
        n_reps_sample1: number of replicates in sample1
        n_reps_sample2: number of replicates in sample2
        is_white_list: if the supplied bed file is treated as white-list
        threads: number of processes used to create domains, proteins and fragments
    """

    def __init__(self, sequence, chrom_name, bedfile_path, n_reps_sample1, n_reps_sample2, is_white_list, threads):
        """Return a chromosome object; valid regions are derived from the provided bed file."""
        self.sequence = sequence
        self.name = chrom_name
        self.length = len(sequence)
        self.domain_list = []
        # define valid regions based on provided bed file
        self.valid_regions, self.bins = self._get_valid_regions(bedfile_path, self.length, is_white_list)
        self.n_reps_sample1 = n_reps_sample1
        self.n_reps_sample2 = n_reps_sample2
        self.is_white_list = is_white_list
        self.threads = threads

    def init_domains(self, n_domains_start, n_domains, prot_count_n, prot_count_p, protein_size, frag_len_max, frag_dist_on, frag_dist_mn_mean,
            frag_dist_mn_cov, frag_dist_prob, prot_dist_mn_mean, prot_dist_mn_cov, frag_count_sh, frag_count_sc, frag_count_op, frag_count_om, frag_count_os,
            frag_count_scaling, frag_count_lp_sc, frag_count_ex_lo, frag_count_ex_sc, beta_values, frag_len_mean, frag_len_dev, skewness):
        """Create n_domains domains on the sequence.

        Uses self.threads worker processes when more than one is configured;
        worker results are collected through unidirectional pipes.
        """
        if len(self.domain_list) != 0:
            print("Domains already initialized")
            return
        # boolean lookup array for quick position-validity checks (True = valid)
        valid_regions_array = np.zeros(self.length + 1, bool)
        for region in self.valid_regions:
            valid_regions_array[region.start:region.stop + 1] = True  # mark whole region valid in one slice
        # bundle all parameters into one object for the workers
        parameterobj = Parameters(self.sequence, self.valid_regions, self.bins, self.n_reps_sample1, self.n_reps_sample2, prot_count_n, prot_count_p,
            protein_size, frag_len_max, frag_dist_on, frag_dist_mn_mean, frag_dist_mn_cov, frag_dist_prob, prot_dist_mn_mean, prot_dist_mn_cov,
            frag_count_sh, frag_count_sc, frag_count_op, frag_count_om, frag_count_os, frag_count_scaling, frag_count_lp_sc,
            frag_count_ex_lo, frag_count_ex_sc, beta_values, frag_len_mean, frag_len_dev, skewness, valid_regions_array)
        if self.threads > 1:  # multi-process path
            print("Initializing domains %s to %s in %s processes" % (n_domains_start + 1, n_domains_start + n_domains, self.threads))
            # process-safe counter from n_domains_start up to the target; it also names the domains
            counter = Counter(n_domains_start, n_domains_start + n_domains)
            process_list = []
            pipe_list = []
            for _ in range(self.threads):  # each worker creates new domains
                recv_end, send_end = mp.Pipe(False)  # unidirectional pipe
                p = mp.Process(target=_init_add_domains_mp, args=(counter, parameterobj, send_end))
                pipe_list.append(recv_end)
                p.daemon = True
                p.start()
                process_list.append(p)
            while True:  # fill progress bar until all domains have been claimed
                complete_count = counter.get_value() - n_domains_start
                _progress_bar(complete_count, n_domains)
                if complete_count == n_domains:  # wait till all workers are done...
                    break
                time.sleep(0.1)
            print("\n\nCollecting results from %s threads" % self.threads)
            collect_count = 0
            for pipe in pipe_list:  # drain each worker's result pipe
                domain_list = pipe.recv()
                self.domain_list.extend(domain_list)
                collect_count += len(domain_list)
                _progress_bar(collect_count, n_domains)
                pipe.close()
            for p in process_list:
                p.join()
            print("\n")
        else:  # single-process path
            self.domain_list = _init_add_domains(parameterobj, n_domains_start, n_domains)

    def _get_valid_regions(self, bedfile_path, chromosome_length, is_white_list):
        """Return valid regions and accumulated probabilities for picking one region."""
        if is_white_list:
            regions = BedTool(bedfile_path)
            return self.__get_valid_region(regions)
        # blacklist mode: start from the whole chromosome and subtract the bed regions
        black_regions = BedTool(bedfile_path)
        valid_regions = BedTool('%s 0 %s' % (self.name, chromosome_length), from_string=True)  # whole chromosome region
        return self.__subtract_from_valid_region(valid_regions, black_regions)

    def __subtract_from_valid_region(self, valid_regions, black_regions):
        """Subtract black_regions from valid_regions; return remaining regions and probability bins."""
        remaining_regions = valid_regions.subtract(black_regions)  # keep only parts without blacklisted overlap
        if len(remaining_regions) == 0:
            # NOTE(review): relies on the module-level option parser being initialized
            parser.error("No valid sequence region left, please check provided bed-file (blacklist/whitelist) and add valid/remove blacklisted regions or reduce the number of domains '-d'.")
        # same length-weighted binning as the white-list path
        return self.__get_valid_region(remaining_regions)

    def __get_valid_region(self, valid_regions):
        """Return valid_regions plus cumulative selection probabilities proportional to region length."""
        total_length = 0
        relative_lengths = []
        for region in valid_regions:
            total_length += len(region)
            relative_lengths.append(len(region))
        prob = [length / total_length for length in relative_lengths]  # probability based on region length
        bins = np.add.accumulate(prob)
        return valid_regions, bins

    def wipe_data(self):
        """Removes all Domains (and with them their Proteins and Fragments)."""
        for domain in self.domain_list:
            domain.wipe_data()
        self.domain_list = []
class SpikeInChromosome(Chromosome):
    """Minimal chromosome of the spike-in species.

    Attributes:
        sequence: str of the genomic sequence
        name: str of the chromosome name
        length: integer of sequence length
    """

    def __init__(self, sequence, chrom_name):
        """Return a spike-in chromosome object (does not call Chromosome.__init__, so no bed-file handling)."""
        self.sequence = sequence
        self.name = chrom_name
        self.length = len(sequence)
class Domain(object):
    """A domain in the genome sequence.

    Attributes:
        name: name or number of the domain
        protein_list: list of proteins
        n_proteins: integer number of proteins
        protein_positions: list of protein positions
        protein_offset_positions: list of the protein offset positions from domain start
        n_reps_sample1: number of replicates in sample1
        n_reps_sample2: number of replicates in sample2
    """

    def __init__(self, n_reps_sample1, n_reps_sample2, name):
        """Return an empty domain object."""
        self.name = name
        self.protein_list = []
        self.n_proteins = 0
        self.protein_positions = []
        self.protein_offset_positions = []
        self.n_reps_sample1 = n_reps_sample1
        self.n_reps_sample2 = n_reps_sample2

    def init_proteins(self, genome_sequence, valid_regions, bins, prot_count_n, prot_count_p, protein_size, frag_len_max, frag_dist_on, frag_dist_mn_mean,
            frag_dist_mn_cov, frag_dist_prob, prot_dist_mn_mean, prot_dist_mn_cov, frag_count_sh, frag_count_sc, frag_count_op, frag_count_om, frag_count_os,
            frag_count_scaling, frag_count_lp_sc, frag_count_ex_lo, frag_count_ex_sc, beta_values, frag_len_mean, frag_len_dev,
            skewness, valid_regions_array):
        """Create the proteins (each with its fragments) at valid positions inside this domain."""
        if len(self.protein_list) != 0:
            print("Proteins already initialized")
            return
        # desired number of proteins; may shrink if positions hit a blacklisted region
        tmp_n_proteins = self._sample_neg_bin(prot_count_n, prot_count_p)
        # find positions inside valid regions
        self.protein_positions, self.n_proteins, self.protein_offset_positions = self._get_protein_positions(tmp_n_proteins, protein_size, prot_dist_mn_mean,
            prot_dist_mn_cov, genome_sequence, valid_regions, bins, frag_len_mean, frag_len_max, valid_regions_array)
        for prot_count, pos in enumerate(self.protein_positions, start=1):
            # init one protein and all fragments in it
            proteinobj = Protein(pos, self.n_reps_sample1, self.n_reps_sample2, self.name, prot_count)
            proteinobj.init_fragments(genome_sequence, frag_count_sh, frag_count_sc, frag_count_op, frag_count_om, frag_count_os,
                frag_count_scaling, frag_count_lp_sc, frag_count_ex_lo, frag_count_ex_sc, beta_values,
                frag_len_mean, frag_len_dev, frag_len_max, frag_dist_on, frag_dist_mn_mean, frag_dist_mn_cov, frag_dist_prob, protein_size, skewness)
            self.protein_list.append(proteinobj)

    def _get_protein_positions(self, tmp_n_proteins, protein_size, prot_dist_mn_mean, prot_dist_mn_cov, genome_sequence, valid_regions, bins, frag_len_mean,
            frag_len_max, valid_regions_array):
        """Find positions for the proteins inside valid regions; retries up to MAX_TEST times."""
        count = 0
        while count < MAX_TEST:
            count += 1
            # sample distances between neighbouring proteins in this domain
            first_protein_offset_positions = self._sample_multivariate_normal(tmp_n_proteins, protein_size, prot_dist_mn_mean, prot_dist_mn_cov)
            # place the leftmost protein somewhere with a valid surrounding sequence
            protein_left_position = self._get_left_protein_position(genome_sequence, tmp_n_proteins, int(max(first_protein_offset_positions)), valid_regions, bins, frag_len_mean, frag_len_max, protein_size)
            # check positions 1..n for valid regions; invalid ones are simply omitted
            protein_positions, n_proteins, protein_offset_positions = self._get_check_offset_positions(protein_left_position, first_protein_offset_positions, valid_regions_array, protein_size, frag_len_mean)
            if n_proteins > 0:  # we found at least one valid position
                return protein_positions, n_proteins, protein_offset_positions
        # NOTE(review): relies on the module-level option parser being initialized
        parser.error("Could not find at least one valid protein position please check the bed-file (blacklist/whitelist) and the parameters regarding protein size and number.")

    def _sample_multivariate_normal(self, k, protein_size, prot_dist_mn_mean, prot_dist_mn_cov):
        """Return k samples (protein-to-protein offsets) from a multivariate normal distribution."""
        res = [max(1, int(choice(list(row)))) for row in multivariate_normal(prot_dist_mn_mean, prot_dist_mn_cov, k)]
        res.insert(0, int(protein_size / 2))  # make sure the first offset is at least half the protein size
        res.pop()  # drop the last element so the number of proteins is unchanged
        return res

    def _get_left_protein_position(self, genome_seq, number_proteins, max_offset, valid_regions, bins, frag_len_mean, frag_len_max, protein_size):
        """Return a position for the leftmost protein whose surrounding sequence is pure ACGT."""
        offset = (number_proteins - 1) * max_offset  # theoretical maximal span of all proteins
        count = 0
        while count < MAX_TEST:  # test at most MAX_TEST candidate positions
            count += 1
            pos = self._get_random_position(valid_regions, bins)  # one position in a randomly selected valid region
            # reject the candidate if the covered sequence contains anything but A/C/G/T (e.g. N)
            tmp_seq = genome_seq[pos - frag_len_max: pos + frag_len_max + offset]
            if tmp_seq.count('A') + tmp_seq.count('C') + tmp_seq.count('G') + tmp_seq.count('T') == len(tmp_seq):
                return pos
        parser.error("Could not find a sequence with valid nucleotides, please check the provided fasta file and or bed-file (blacklist/whitelist).")

    def _sample_neg_bin(self, n, p):
        """Sample the negative binomial distribution (at least 1)."""
        return int(max(np.random.negative_binomial(n, p), 1))

    def _get_random_position(self, valid_regions, bins):
        """Return a random position inside one region, chosen with length-proportional probability via bins."""
        chosen_region = np.digitize(random_sample(1), bins)[0].item()  # pick one random bin, convert numpy int to int
        return randint(valid_regions[chosen_region].start, valid_regions[chosen_region].stop)

    def _get_check_offset_positions(self, protein_left_position, protein_offset_positions, valid_regions_array, protein_size, frag_len_mean):
        """Check whether the offset positions relative to the left position lie in valid regions; drop those that do not."""
        new_protein_offset_positions = []
        approx_dif = abs(frag_len_mean - protein_size) // 2  # estimated shift of fragments around the protein
        n_proteins = len(protein_offset_positions)
        protein_absolute_positions = protein_left_position + np.cumsum(protein_offset_positions)  # offsets -> absolute positions
        rm_list = []
        rm_index = 0
        for offset_pos in protein_absolute_positions:
            isgood = True
            start = int(offset_pos - approx_dif)
            stop = int(offset_pos + approx_dif)
            for i in range(start, stop + 1):
                if valid_regions_array[i] == False:  # overlap with a blacklisted position
                    isgood = False
                    n_proteins -= 1
                    rm_list.append(rm_index)
                    break
            if isgood:  # the whole span is valid
                new_protein_offset_positions.append(offset_pos)
            rm_index += 1
        protein_offset_positions = np.delete(protein_offset_positions, rm_list)  # remove dropped proteins from the offset list too
        return new_protein_offset_positions, n_proteins, protein_offset_positions

    def wipe_data(self):
        """Removes all Proteins and Fragments."""
        for protein in self.protein_list:
            protein.wipe_data()
class Protein(object):
    """A protein in a domain.

    Attributes:
        start: start position (most left (5') fragment)
        stop: stop position (most right (3') fragment)
        number_frags: number of fragments
        sample1_mean_fragment_count: mean fragment count in sample1
        sample2_mean_fragment_count: mean fragment count in sample2
        fragment_count: fragment count in this object
        name: name or id of the protein
        domain_name: name or id of its domain
        position: position of the protein (summit of the fragments)
        n_replicates_sample1: number of replicates in sample1
        n_replicates_sample2: number of replicates in sample2
        sample1_replicate_counts: counts of fragments per replicate in sample1
        sample2_replicate_counts: counts of fragments per replicate in sample2
        sample1_replicate_list: list of the fragment objects in the respective replicates of sample1
        sample2_replicate_list: list of the fragment objects in the respective replicates of sample2
    """

    def __init__(self, position, n_reps_sample1, n_reps_sample2, domain_name, name):
        """Return an empty protein object at the given position."""
        self.start = 0
        self.stop = 0
        self.number_frags = 0
        self.sample1_mean_fragment_count = 0
        self.sample2_mean_fragment_count = 0
        self.fragment_count = 0
        self.name = name
        self.domain_name = domain_name
        self.position = position
        self.n_replicates_sample1 = n_reps_sample1
        self.n_replicates_sample2 = n_reps_sample2
        self.sample1_replicate_counts = [0] * self.n_replicates_sample1
        self.sample2_replicate_counts = [0] * self.n_replicates_sample2
        self.sample1_replicate_list = [[] for _ in range(self.n_replicates_sample1)]
        self.sample2_replicate_list = [[] for _ in range(self.n_replicates_sample2)]

    def init_fragments(self, genome_sequence, frag_count_sh, frag_count_sc, frag_count_op, frag_count_om, frag_count_os,
            frag_count_scaling, frag_count_lp_sc, frag_count_ex_lo, frag_count_ex_sc, beta_values,
            frag_len_mean, frag_len_dev, frag_len_max, frag_dist_on, frag_dist_mn_mean, frag_dist_mn_cov, frag_dist_prob, protein_size, skewness):
        """Create all fragments of this protein and distribute them over samples and replicates."""
        if self.fragment_count != 0:
            print("Fragments already initialized")
            return
        min_protein_pos = 0  # outermost fragment edges, to derive peak start/stop
        max_protein_pos = 0
        sample1_list = []
        sample2_list = []
        sample1_count = 0
        sample2_count = 0
        # number of fragments per protein: mixture of lognormal (probability frag_count_op) and gamma
        if np.random.random() <= frag_count_op:
            self.number_frags = np.random.lognormal(mean=frag_count_om, sigma=frag_count_os, size=1)  # lognormal branch
        else:
            self.number_frags = np.random.gamma(shape=frag_count_sh, scale=frag_count_sc, size=1)  # gamma branch
        self.number_frags = max(1, int(self.number_frags))  # at least one fragment
        # read fraction for sample1 (sample2 gets 1 - x)
        # bug fix: use the beta_values parameter instead of the global options object
        read_frac = np.random.beta(beta_values[0], beta_values[1])
        # choose how the number of fragments / read fraction are scaled
        if frag_count_scaling == "none":
            pass  # no scaling: neither number_frags nor read_frac change
        elif frag_count_scaling == "beta":  # number_frags scaled via Laplace pdf of read_frac
            self.number_frags = self._scale_laplace(self.number_frags, read_frac, frag_count_lp_sc)
        elif frag_count_scaling == "frag":  # read_frac scaled via exponential pdf of number_frags
            read_frac = self._scale_exp(self.number_frags, read_frac, frag_count_ex_lo, frag_count_ex_sc)
        else:
            print("Unknown scaling method, %s, please choose 'none','frag' or 'beta', exiting now" % (frag_count_scaling))
            exit(1)
        # distribute the fragments over the two samples with all their replicates
        self.number_frags = self.number_frags * (self.n_replicates_sample1 + self.n_replicates_sample2)
        for _ in range(self.number_frags):
            # init one fragment with position, sequence and type
            fragmentobj = Fragment(self.position, genome_sequence, "p", frag_len_mean, frag_len_dev, frag_len_max, frag_dist_on, frag_dist_mn_mean,
                frag_dist_mn_cov, frag_dist_prob, protein_size, self.domain_name, self.name)
            # track min and max fragment positions to get peak start and stop
            if min_protein_pos == 0 or (fragmentobj.position - fragmentobj.frag_length // 2) < min_protein_pos:
                min_protein_pos = fragmentobj.position - fragmentobj.frag_length // 2
            if max_protein_pos == 0 or (fragmentobj.position + fragmentobj.frag_length // 2) > max_protein_pos:
                max_protein_pos = fragmentobj.position + fragmentobj.frag_length // 2
            if random() <= read_frac:  # assign to sample1 with probability read_frac
                sample1_count += 1
                sample1_list.append(fragmentobj)
            else:
                sample2_count += 1
                sample2_list.append(fragmentobj)
            self.fragment_count += 1
        # save peak positions
        self.start = min_protein_pos
        self.stop = max_protein_pos
        # distribute each sample's fragments over its replicates
        self.sample1_replicate_list, self.sample1_replicate_counts = self._fragments_to_replicates(sample1_list, skewness, self.n_replicates_sample1)
        self.sample2_replicate_list, self.sample2_replicate_counts = self._fragments_to_replicates(sample2_list, skewness, self.n_replicates_sample2)
        # mean counts per sample
        self.sample1_mean_fragment_count = sample1_count / self.n_replicates_sample1
        self.sample2_mean_fragment_count = sample2_count / self.n_replicates_sample2

    def _fragments_to_replicates(self, fragment_list, skewness, n_replicates):
        """Put each fragment into a randomly selected replicate (Dirichlet-weighted)."""
        # sample a Dirichlet distribution and build cumulative probability bins starting at 0 and summing to 1
        bins = np.cumsum([0] + list(dirichlet(np.array([skewness] * n_replicates), 1)[0]))
        replicate_counts = [0] * n_replicates
        replicate_list = [[] for _ in range(n_replicates)]
        for fragmentobj in fragment_list:
            index = np.digitize([random()], bins)[0].item() - 1  # bin index of a uniform random number
            replicate_list[index].append(fragmentobj)
            replicate_counts[index] += 1
        return replicate_list, replicate_counts  # fragments and their counts per replicate

    def _scale_laplace(self, nfrags, read_frac, scale):
        """Scale the fragment count by a Laplace pdf of read_frac (normalized to 1.0 at 0.5)."""
        loc = 0.5  # fixed centre
        scale_factor = 1 / laplace.pdf(0.5, loc, scale)  # normalize so pdf(0.5) corresponds to 100%
        nfrags_scale = laplace.pdf(read_frac, loc, scale) * scale_factor  # larger deviation from 0.5 -> smaller count
        if nfrags_scale != 0:
            nfrags = int(nfrags * nfrags_scale)  # new fragment count scaled by the beta result
        return nfrags

    def _scale_lognorm(self, nfrags, read_frac, sigma, loc, scale):
        """Scale read_frac toward 0.5 by a lognormal pdf of nfrags (normalized to 1.0 at nfrags = 1)."""
        scale_factor = 1 / sp.stats.lognorm.pdf(1, s=sigma, loc=loc, scale=scale)  # normalize pdf(1) to 100%
        beta_scale = sp.stats.lognorm.pdf(nfrags, s=sigma, loc=loc, scale=scale) * scale_factor
        if beta_scale != 0:
            # centre on 0.5 and shrink the deviation, pulling read_frac toward 0.5 for high counts
            read_frac = ((read_frac - 0.5) * beta_scale) + 0.5
        return read_frac

    def _scale_exp(self, nfrags, read_frac, loc, scale):
        """Scale read_frac toward 0.5 by an exponential pdf of nfrags (normalized to 1.0 at nfrags = loc)."""
        scale_factor = 1 / sp.stats.expon.pdf(loc, loc=loc, scale=scale)  # normalize pdf(loc) to 100%
        beta_scale = sp.stats.expon.pdf(nfrags, loc=loc, scale=scale) * scale_factor
        if beta_scale != 0:
            # centre on 0.5 and shrink the deviation, pulling read_frac toward 0.5 for high fragment counts
            read_frac = ((read_frac - 0.5) * beta_scale) + 0.5
        return read_frac

    def _get_factor(self, x, loc=0.5, scale=0.2):
        """Laplace pdf helper for the distribution into replicates."""
        return laplace.pdf(x, loc, scale)

    def __lt__(self, other):
        """Order proteins by start position (for sorting)."""
        return self.start < other.start

    def wipe_data(self):
        """Removes all Fragments."""
        self.sample1_replicate_counts = []
        self.sample2_replicate_counts = []
        self.sample1_replicate_list = []
        self.sample2_replicate_list = []
class Fragment(object):
    """A single sequenced fragment belonging to a protein.

    Attributes:
        domain_name: name or id of the domain
        protein_name: name or id of the protein
        frag_length: int of the length of the fragment
        rand_shift: int of the shift inside the protein
        position: int of the final position of the fragment
        sequence: string of the sequence of the fragment
        frag_type: string of the fragment type
    """

    def __init__(self, prot_position, genome_seq, frag_type, frag_len_mean, frag_len_dev, frag_len_max, frag_dist_on, frag_dist_mn_mean, frag_dist_mn_cov,
            frag_dist_prob, protein_size, domain_name, protein_name):
        """Return a fragment object placed relative to its protein position."""
        self.domain_name = domain_name
        self.protein_name = protein_name
        # draw the fragment length first
        self.frag_length = self._sample_frag_length(frag_len_mean, frag_len_dev, frag_len_max)
        span = abs(frag_len_mean - protein_size)
        if frag_dist_on:
            # shift drawn from the user-defined multivariate distribution,
            # applied starting from the leftmost possible position
            drawn_shift = self._sample_multivariate_normal(protein_size, frag_dist_mn_mean, frag_dist_mn_cov, frag_dist_prob)
            self.rand_shift = int(-span // 2) + drawn_shift
        else:
            # uniformly random shift within the protein span
            self.rand_shift = randint(-span // 2, span // 2)
        self.position = prot_position + self.rand_shift  # final position of the fragment
        half_len = self.frag_length // 2
        self.sequence = genome_seq[self.position - half_len: self.position + half_len]
        self.frag_type = frag_type

    def _sample_frag_length(self, mean, dev, frag_len_max):
        """Draw a fragment length from a normal distribution, capped at frag_len_max."""
        drawn = np.random.normal(loc=mean, scale=dev)
        return min(abs(int(drawn)), frag_len_max)

    def _sample_multivariate_normal(self, protein_size, frag_dist_mn_mean, frag_dist_mn_cov, frag_dist_prob):
        """Draw one value from the multivariate normal, clamped to [1, protein_size]."""
        samples = multivariate_normal(frag_dist_mn_mean, frag_dist_mn_cov)
        picked = int(choice(samples, p=frag_dist_prob))
        # choose a random element from the n distributions; at least 1, at most the protein size
        return int(min(max(picked, 1), protein_size))
class NoiseFragment(Fragment):
    """A noise fragment it has the following attributes:
    Attributes:
        position: integer of the final position of the fragment
        frag_length: integer of the length of the fragment
        sequence: string of the sequence of the fragment
        frag_type: string of the fragment type
    """
    def __init__(self, position, genome_seq, frag_type, frag_len_mean, frag_len_dev, frag_len_max):
        """Return a noise fragment at a fixed position (no protein shift is applied)."""
        self.position = position
        #length sampling is inherited from Fragment
        self.frag_length = self._sample_frag_length(frag_len_mean, frag_len_dev, frag_len_max)
        half = self.frag_length//2
        self.sequence = genome_seq[self.position - half : self.position + half]
        self.frag_type = frag_type
class Sample(object):
    """A sample which holds all replicates with fragments per replicate, it has the following attributes:
    Attributes:
        replicate_list: list of replicate-objects
        n_replicates: number of replicates
        sample_id: string of the id/name
        chromosome: str of the chromosome name
        weighted_bins: list of weights of the bins of the chromosome
        noise_regions: list of the regions the weighted bins are there for
        myinput: input-object for this sample
    """
    def __init__(self, chromosome, n_replicates, sample_id, weighted_bins, noise_regions):
        """Return a sample object"""
        self.n_replicates = n_replicates
        self.sample_id = sample_id
        self.chromosome = chromosome
        #previously weighted_bins/noise_regions were assigned an empty list and then
        #immediately overwritten with the parameters; only the meaningful assignment is kept
        self.weighted_bins = weighted_bins
        self.noise_regions = noise_regions
        self.myinput = Input()
        #replicate ids start with 1 and increase per replicate
        self.replicate_list = [Replicate(rep_id) for rep_id in range(1, n_replicates + 1)]
    def get_fragment_number(self):
        """Returns the total number of fragments over all replicates of this sample"""
        return sum(replicate.get_fragment_number() for replicate in self.replicate_list)
    def add_db_fragments(self, fragment_list, index):
        """Adds a fragment-list from a db peak to the chosen index of the replicates"""
        self.replicate_list[index].add_fragments(fragment_list) #add fragment
    def add_noise(self, back_avg, read_length, frag_len_mean, frag_len_dev, frag_len_max):
        """Adds noise to all replicates of this sample"""
        for rep in self.replicate_list:
            print("Adding noise fragments to sample %s replicate %s" % (self.sample_id, rep.replicate_id))
            rep.add_noise(self.chromosome, self.weighted_bins, self.noise_regions, back_avg, read_length, frag_len_mean, frag_len_dev, frag_len_max)
    def write_fasta(self, prefix, read_length, read_count, append = False):
        """Write reads of fragments in the replicates to a fasta file; returns the updated running read_count"""
        for rep in self.replicate_list: #write fasta per replicate
            read_count = rep.write_fasta(self.sample_id, prefix, read_length, read_count, append)
        self.myinput.write_fasta(self.sample_id, prefix, read_length) #write input fasta
        return read_count
    def add_input(self, frag_len_mean, frag_len_dev, frag_len_max, back_avg, read_length):
        """Adds input to this sample"""
        print("Adding Input fragments to sample %s " % (self.sample_id))
        self.myinput.create_fragments(self.chromosome, self.weighted_bins, self.noise_regions, frag_len_mean, frag_len_dev, frag_len_max, back_avg, read_length)
    def add_spike_in(self, spike_in_chrom, si_weighted_bins, si_regions, spike_in_cov, read_length, frag_len_mean, frag_len_dev, frag_len_max):
        """Adds spike-in fragments to all replicates of this sample and to its input"""
        for rep in self.replicate_list:
            print("Adding spike-in fragments to sample %s replicate %s" % (self.sample_id, rep.replicate_id))
            rep.add_spike_in(spike_in_chrom, si_weighted_bins, si_regions, spike_in_cov, read_length, frag_len_mean, frag_len_dev, frag_len_max)
        print("Adding spike-in fragments to Input of sample %s " % (self.sample_id))
        self.myinput.add_spike_in(spike_in_chrom, si_weighted_bins, si_regions, spike_in_cov, read_length, frag_len_mean, frag_len_dev, frag_len_max)
class Replicate(object):
    """A replicate with its fragments, it has the following attributes:
    Attributes:
        fragment_list: list of fragment-objects
        n_peak_fragments: integer of number fragments from peaks
        n_noise_fragments: integer of number fragments from noise
        n_spike_in_fragments: integer of number of spike-in fragments
        replicate_id: str of the id/name
    """
    def __init__(self, replicate_id):
        """Return a replicate object"""
        self.fragment_list = []
        self.n_peak_fragments = 0
        self.n_noise_fragments = 0
        self.n_spike_in_fragments = 0
        self.replicate_id = replicate_id
    def get_fragment_number(self):
        """Return number of fragments in this replicate"""
        return len(self.fragment_list)
    def add_fragments(self, fragment_list):
        """Add a list of (peak) fragments to this replicate"""
        self.fragment_list.extend(fragment_list)
        self.n_peak_fragments += len(fragment_list)
    def add_noise(self, chrom, bins, noise_regions, back_avg, read_length, frag_len_mean, frag_len_dev, frag_len_max):
        """Add noise fragments to this replicate, drawn from the weighted noise regions"""
        number_noise_reads = int(back_avg * chrom.length // read_length) #only the provided genome coverage is taken into account
        s = (int(max(bins))-1) * np.random.random_sample(number_noise_reads) #random draw per noise-fragment
        chosen_regions = np.digitize(s, bins) #get indices of bins where the reads belong
        for cri in chosen_regions:
            pos = randint(noise_regions[cri].start, noise_regions[cri].stop) #random position inside the region
            #create new noise fragment with position, sequence and type
            self.fragment_list.append(NoiseFragment(pos, chrom.sequence, "n", frag_len_mean, frag_len_dev, frag_len_max))
            self.n_noise_fragments += 1
    def add_spike_in(self, spike_in_chrom, si_weighted_bins, si_regions, spike_in_cov, read_length, frag_len_mean, frag_len_dev, frag_len_max):
        """Add spike-in fragments to this replicate, drawn from the spike-in chromosome"""
        number_spike_in_reads = int(spike_in_cov * spike_in_chrom.length // read_length) #only the provided genome coverage is taken into account
        s = (int(max(si_weighted_bins))-1) * np.random.random_sample(number_spike_in_reads) #random draw per spike-in fragment
        chosen_regions = np.digitize(s, si_weighted_bins) #get indices of bins where the reads belong
        for cri in chosen_regions:
            pos = randint(si_regions[cri].start, si_regions[cri].stop) #random position inside the region
            #create spike-in fragment, with position, sequence and type
            self.fragment_list.append(NoiseFragment(pos, spike_in_chrom.sequence, "s", frag_len_mean, frag_len_dev, frag_len_max))
            self.n_spike_in_fragments += 1
    def write_fasta(self, sample_id, prefix, read_length, read_count, append = False):
        """Write reads of the fragments to a fasta file; always returns the updated read_count"""
        if not self.fragment_list:
            print("No fragments in replicate to write")
            #BUG FIX: previously fell through returning None, which clobbered the caller's running count
            return read_count
        name = prefix + "_sample%s-rep%s.fasta" %(sample_id, self.replicate_id)
        print("Writing reads to fasta file: %s" % (name))
        #use a context manager so the file handle is always closed (was leaked before)
        with open(name, "a" if append else "w") as fastafile:
            for frag in self.fragment_list: #randomly choose if forward or reverse
                if random() < 0.5: #forward
                    seq = frag.sequence[:read_length]
                    if len(seq) != read_length:
                        seq = seq.rjust(read_length,'N') #if it is shorter than the read length fill it up with N´s
                else: #reverse
                    seq = frag.sequence[len(frag.sequence) - read_length:]
                    if len(seq) != read_length:
                        seq = seq.rjust(read_length,'N')
                    seq = self._reverse_complement(seq)
                #write
                if(frag.frag_type == "p"): #add more domain and protein infos
                    fastafile.write(">read%s_%s_d%sp%s\n%s\n" %(read_count, frag.frag_type, frag.domain_name, frag.protein_name, seq))
                else:
                    fastafile.write(">read%s_%s\n%s\n" %(read_count, frag.frag_type, seq))
                read_count += 1
        return read_count
    def _reverse_complement(self, sequence, rev=True):
        """Return the reverse complement of a DNA sequence; on unknown bases return the input unchanged"""
        _complement = dict(A="T", T="A", C="G", G="C", N="N")
        reverse = reversed(sequence) if rev else sequence
        try:
            #BUG FIX: the join must happen inside the try; the original wrapped only the lazy
            #generator creation, so the KeyError escaped uncaught at join time
            return "".join(_complement[x] for x in reverse)
        except KeyError:
            print("Sequence %s could not be converted to its reverse complement" % sequence)
            return sequence
class Input(Replicate):
    """The input with fragments, it has the following attributes:
    Attributes:
        fragment_list: list of fragment-objects
        n_fragments: integer of number fragments
        n_spike_in_fragments: integer of spike-in fragments
    """
    def __init__(self):
        """Return an input object"""
        self.fragment_list = []
        self.n_fragments = 0
        self.n_spike_in_fragments = 0
    def create_fragments(self, chrom, bins, noise_regions, frag_len_mean, frag_len_dev, frag_len_max, back_avg, read_length):
        """Add background fragments to this input, drawn from the weighted noise regions"""
        number_noise_reads = int(back_avg * chrom.length // read_length) #only the provided genome coverage is taken into account
        s = (int(max(bins))-1) * np.random.random_sample(number_noise_reads) #random draw per fragment
        chosen_regions = np.digitize(s, bins) #get indices of bins where the reads belong
        for cri in chosen_regions:
            pos = randint(noise_regions[cri].start, noise_regions[cri].stop) #random position inside the region
            #create new input fragment with position, sequence and type
            self.fragment_list.append(NoiseFragment(pos, chrom.sequence, "i", frag_len_mean, frag_len_dev, frag_len_max))
            self.n_fragments += 1
    def write_fasta(self, sample_id, prefix, read_length):
        """Write reads of the fragments to a fasta file"""
        if not self.fragment_list: #if no input is required there are no fragments so do not write a fasta
            return
        name = prefix + "_sample%s-INPUT.fasta" %(sample_id)
        print("Writing reads to fasta file: %s" % (name))
        #use a context manager so the file handle is always closed (was leaked before)
        with open(name, "w") as fastafile:
            entry_count = 1
            for frag in self.fragment_list:
                if random() < 0.5: #randomly choose if forward or reverse
                    seq = frag.sequence[:read_length]
                    if len(seq) != read_length:
                        seq = seq.rjust(read_length,'N') #if it is shorter than the read length fill it up with N´s
                else:
                    seq = frag.sequence[len(frag.sequence) - read_length:]
                    if len(seq) != read_length:
                        seq = seq.rjust(read_length,'N')
                    seq = self._reverse_complement(seq)
                #the forward/reverse branches shared identical write statements; merged into one
                fastafile.write(">read%s_%s\n%s\n" %(entry_count, frag.frag_type, seq))
                entry_count += 1
#helper functions
def _progress_bar(count, total, suffix=''):
    """Simple in-place progress bar: takes the number of completed tasks and the total task count"""
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))
    percents = round(100.0 * count / float(total), 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)
    #build the whole line first, then emit it with a single write
    if(suffix == ''):
        line = '[%s] %s%s \r' % (bar, percents, '%')
    else:
        line = '[%s] %s%s ...%s\r' % (bar, percents, '%', suffix)
    sys.stdout.write(line)
    sys.stdout.flush()
def _init_add_domains_mp(counter, parametersobj, results_list):
"""Initialize domains for multiprocessing"""
tmp_results_list = [] #store created object in this list
while True:
domain_number = counter.increment() #add 1 to the counter, one more domain done...
if(domain_number == -1): #reached maximum end here...
break
sequence, valid_regions, bins, n_reps_sample1, n_reps_sample2, prot_count_n, prot_count_p, protein_size, frag_len_max, frag_dist_on, frag_dist_mn_mean, \
frag_dist_mn_cov, frag_dist_prob, prot_dist_mn_mean, prot_dist_mn_cov, frag_count_sh, frag_count_sc, frag_count_op, frag_count_om, frag_count_os, \
frag_count_scaling, frag_count_lp_sc, frag_count_ex_lo, frag_count_ex_sc, beta_values, frag_len_mean, frag_len_dev, skewness, \