-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDCSsim.py
More file actions
1647 lines (1319 loc) · 78 KB
/
DCSsim.py
File metadata and controls
1647 lines (1319 loc) · 78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# coding=utf8
"""
%prog <FASTA> <BED> <INT> [options]
Differential peaks (DPs) are simulated on the provided <FASTA> sequence; the specified <BED> file either restricts simulation to its regions (if <--is_white_list> is set) or excludes them as blacklisted.
<INT> defines the number of simulated replicates of the two samples.
@author: Thomas Eder
"""
from __future__ import print_function
from __future__ import division
import sys
import os
import time
import datetime
import HTSeq
import numpy as np
import scipy as sp
import multiprocessing as mp
import matplotlib.pyplot as plt
from builtins import range
from future.utils import lrange
from matplotlib.backends.backend_pdf import PdfPages
from optparse import OptionParser
from pybedtools import BedTool, Interval
from random import random, randint
from scipy.stats import beta, laplace
from numpy.random import random_sample, rand, multivariate_normal, choice, gamma, dirichlet
from multiprocessing.managers import BaseManager
MAX_TEST = 10000 #number of maximal test iterations, can influence runtime
#classes
class HelpfulOptionParser(OptionParser):
    """An OptionParser that prints full help on errors."""

    def error(self, msg):
        """Print the complete help text to stderr, then exit with status 2."""
        self.print_help(sys.stderr)
        error_text = "\n%s: error: %s\n" % (self.get_prog_name(), msg)
        self.exit(2, error_text)
class Timer:
    """Simple wall-clock timer tracking elapsed time since creation or restart."""

    def __init__(self):
        """Start the timer at construction time."""
        self.start = time.time()

    def restart(self):
        """Reset the reference point to now."""
        self.start = time.time()

    def get_time_hhmmss(self):
        """Return the elapsed time formatted as HH:MM:SS."""
        elapsed = time.time() - self.start
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        return "%02d:%02d:%02d" % (hours, minutes, seconds)
class Parameters:
    """Container bundling every simulation parameter that is handed to worker processes."""

    # attribute names, in constructor (and get_all) order
    _FIELDS = ("sequence", "valid_regions", "bins", "n_reps_sample1", "n_reps_sample2",
               "prot_count_n", "prot_count_p", "protein_size", "frag_len_max", "frag_dist_on",
               "frag_dist_mn_mean", "frag_dist_mn_cov", "frag_dist_prob", "prot_dist_mn_mean",
               "prot_dist_mn_cov", "frag_count_sh", "frag_count_sc", "frag_count_op",
               "frag_count_om", "frag_count_os", "frag_count_scaling", "frag_count_lp_sc",
               "frag_count_ex_lo", "frag_count_ex_sc", "beta_values", "frag_len_mean",
               "frag_len_dev", "skewness", "valid_regions_array")

    def __init__(self, sequence, valid_regions, bins, n_reps_sample1, n_reps_sample2, prot_count_n, prot_count_p, protein_size,
                 frag_len_max, frag_dist_on, frag_dist_mn_mean, frag_dist_mn_cov, frag_dist_prob, prot_dist_mn_mean, prot_dist_mn_cov,
                 frag_count_sh, frag_count_sc, frag_count_op, frag_count_om, frag_count_os, frag_count_scaling, frag_count_lp_sc,
                 frag_count_ex_lo, frag_count_ex_sc, beta_values, frag_len_mean, frag_len_dev, skewness, valid_regions_array):
        """Store every argument as an identically named attribute."""
        passed = locals()
        for field in self._FIELDS:
            setattr(self, field, passed[field])

    def get_all(self):
        """Return all stored parameters as a tuple, in constructor order."""
        return tuple(getattr(self, field) for field in self._FIELDS)
class Report_data:
    """Accumulates per-domain statistics used for the final simulation report.

    Counts add up and lists grow via _extend() as worker chunks are merged in.
    """

    def __init__(self, n_reps_sample1, n_reps_sample2):
        """Create empty report containers for the given replicate counts."""
        self.r_prot_pos = []               # protein positions
        self.r_dom_nprot = []              # number of proteins per domain
        self.r_prot_offset_positions = []  # protein offsets from domain start
        self.r_nfrag = []                  # fragment counts per protein
        self.r_n_frag1 = []                # fragment counts, sample 1
        self.r_n_frag2 = []                # fragment counts, sample 2
        self.r_fragshift = []              # fragment shifts inside proteins
        self.r_fraglen = []                # fragment lengths
        # one sub-list per replicate
        self.r_n_frag1_reps = [[] for _ in range(n_reps_sample1)]
        self.r_n_frag2_reps = [[] for _ in range(n_reps_sample2)]
        self.db_peaks_sample1 = 0          # differentially bound peaks, sample 1
        self.db_peaks_sample2 = 0          # differentially bound peaks, sample 2
        self.n_domains = 0                 # total number of domains reported

    def _extend(self, n_domains, r_prot_pos, r_dom_nprot, r_prot_offset_positions, r_nfrag, r_n_frag1, r_n_frag2, r_fragshift,
                r_fraglen, r_n_frag1_reps, r_n_frag2_reps, db_peaks_sample1, db_peaks_sample2):
        """Merge another report chunk into this one (counts add, lists extend)."""
        self.n_domains += n_domains
        self.r_prot_pos.extend(r_prot_pos)
        self.r_dom_nprot.extend(r_dom_nprot)
        self.r_prot_offset_positions.extend(r_prot_offset_positions)
        self.r_nfrag.extend(r_nfrag)
        self.r_n_frag1.extend(r_n_frag1)
        self.r_n_frag2.extend(r_n_frag2)
        self.r_fragshift.extend(r_fragshift)
        self.r_fraglen.extend(r_fraglen)
        # pair up replicate lists instead of tracking a manual index
        for own_rep, new_rep in zip(self.r_n_frag1_reps, r_n_frag1_reps):
            own_rep.extend(new_rep)
        for own_rep, new_rep in zip(self.r_n_frag2_reps, r_n_frag2_reps):
            own_rep.extend(new_rep)
        self.db_peaks_sample1 += db_peaks_sample1
        self.db_peaks_sample2 += db_peaks_sample2
class Counter(object):
    """Process-safe counter used to hand out domain numbers up to a maximum."""

    def __init__(self, initval=0, maxval=50):
        """Create a shared integer starting at initval, capped at maxval."""
        self.val = mp.Value('i', initval)
        self.lock = mp.Lock()
        self.maxval = maxval

    def increment(self):
        """Atomically increment and return the new value, or -1 once maxval is reached."""
        with self.lock:
            if self.val.value < self.maxval:
                self.val.value += 1
                return self.val.value
            return -1

    def get_value(self):
        """Return the current counter value under the lock."""
        with self.lock:
            return self.val.value

    def is_max(self):
        """Return True once the counter has reached maxval."""
        with self.lock:
            return self.val.value >= self.maxval
class Chromosome(object):
    """The chromosome we simulate differential binding peaks on.

    Attributes:
        sequence: str of the genomic sequence
        name: str of the chromosome name
        length: integer of sequence length
        domain_list: list of domains
        valid_regions: regions which are not blacklisted
        bins: accumulated probabilities of the valid_regions based on their length
        n_reps_sample1: number of replicates in sample1
        n_reps_sample2: number of replicates in sample2
        is_white_list: if the supplied bed file is treated as white-list
        threads: number of processes used to create domains, proteins and fragments
    """

    def __init__(self, sequence, chrom_name, bedfile_path, n_reps_sample1, n_reps_sample2, is_white_list, threads):
        """Return a chromosome object; valid regions are derived from the provided bed file."""
        self.sequence = sequence
        self.name = chrom_name
        self.length = len(sequence)
        self.domain_list = []
        # define valid regions based on provided bed file
        self.valid_regions, self.bins = self._get_valid_regions(bedfile_path, self.length, is_white_list)
        self.n_reps_sample1 = n_reps_sample1
        self.n_reps_sample2 = n_reps_sample2
        self.is_white_list = is_white_list
        self.threads = threads

    def init_domains(self, n_domains_start, n_domains, prot_count_n, prot_count_p, protein_size, frag_len_max, frag_dist_on, frag_dist_mn_mean,
            frag_dist_mn_cov, frag_dist_prob, prot_dist_mn_mean, prot_dist_mn_cov, frag_count_sh, frag_count_sc, frag_count_op, frag_count_om, frag_count_os,
            frag_count_scaling, frag_count_lp_sc, frag_count_ex_lo, frag_count_ex_sc, beta_values, frag_len_mean, frag_len_dev, skewness):
        """Create n_domains domains on the sequence.

        Uses self.threads worker processes when more than one is configured;
        worker results are collected through unidirectional pipes.
        """
        if len(self.domain_list) != 0:
            print("Domains already initialized")
            return
        # boolean lookup array for quick position-validity checks (True = valid)
        valid_regions_array = np.zeros(self.length + 1, bool)
        for region in self.valid_regions:
            valid_regions_array[region.start:region.stop + 1] = True  # mark whole region valid in one slice
        # bundle all parameters into one object for the workers
        parameterobj = Parameters(self.sequence, self.valid_regions, self.bins, self.n_reps_sample1, self.n_reps_sample2, prot_count_n, prot_count_p,
            protein_size, frag_len_max, frag_dist_on, frag_dist_mn_mean, frag_dist_mn_cov, frag_dist_prob, prot_dist_mn_mean, prot_dist_mn_cov,
            frag_count_sh, frag_count_sc, frag_count_op, frag_count_om, frag_count_os, frag_count_scaling, frag_count_lp_sc,
            frag_count_ex_lo, frag_count_ex_sc, beta_values, frag_len_mean, frag_len_dev, skewness, valid_regions_array)
        if self.threads > 1:  # multi-process path
            print("Initializing domains %s to %s in %s processes" % (n_domains_start + 1, n_domains_start + n_domains, self.threads))
            # process-safe counter from n_domains_start up to the target; it also names the domains
            counter = Counter(n_domains_start, n_domains_start + n_domains)
            process_list = []
            pipe_list = []
            for _ in range(self.threads):  # each worker creates new domains
                recv_end, send_end = mp.Pipe(False)  # unidirectional pipe
                p = mp.Process(target=_init_add_domains_mp, args=(counter, parameterobj, send_end))
                pipe_list.append(recv_end)
                p.daemon = True
                p.start()
                process_list.append(p)
            while True:  # fill progress bar until all domains have been claimed
                complete_count = counter.get_value() - n_domains_start
                _progress_bar(complete_count, n_domains)
                if complete_count == n_domains:  # wait till all workers are done...
                    break
                time.sleep(0.1)
            print("\n\nCollecting results from %s threads" % self.threads)
            collect_count = 0
            for pipe in pipe_list:  # drain each worker's result pipe
                domain_list = pipe.recv()
                self.domain_list.extend(domain_list)
                collect_count += len(domain_list)
                _progress_bar(collect_count, n_domains)
                pipe.close()
            for p in process_list:
                p.join()
            print("\n")
        else:  # single-process path
            self.domain_list = _init_add_domains(parameterobj, n_domains_start, n_domains)

    def _get_valid_regions(self, bedfile_path, chromosome_length, is_white_list):
        """Return valid regions and accumulated probabilities for picking one region."""
        if is_white_list:
            regions = BedTool(bedfile_path)
            return self.__get_valid_region(regions)
        # blacklist mode: start from the whole chromosome and subtract the bed regions
        black_regions = BedTool(bedfile_path)
        valid_regions = BedTool('%s 0 %s' % (self.name, chromosome_length), from_string=True)  # whole chromosome region
        return self.__subtract_from_valid_region(valid_regions, black_regions)

    def __subtract_from_valid_region(self, valid_regions, black_regions):
        """Subtract black_regions from valid_regions; return remaining regions and probability bins."""
        remaining_regions = valid_regions.subtract(black_regions)  # keep only parts without blacklisted overlap
        if len(remaining_regions) == 0:
            # NOTE(review): relies on the module-level option parser being initialized
            parser.error("No valid sequence region left, please check provided bed-file (blacklist/whitelist) and add valid/remove blacklisted regions or reduce the number of domains '-d'.")
        # same length-weighted binning as the white-list path
        return self.__get_valid_region(remaining_regions)

    def __get_valid_region(self, valid_regions):
        """Return valid_regions plus cumulative selection probabilities proportional to region length."""
        total_length = 0
        relative_lengths = []
        for region in valid_regions:
            total_length += len(region)
            relative_lengths.append(len(region))
        prob = [length / total_length for length in relative_lengths]  # probability based on region length
        bins = np.add.accumulate(prob)
        return valid_regions, bins

    def wipe_data(self):
        """Removes all Domains (and with them their Proteins and Fragments)."""
        for domain in self.domain_list:
            domain.wipe_data()
        self.domain_list = []
class SpikeInChromosome(Chromosome):
    """Minimal chromosome of the spike-in species.

    Attributes:
        sequence: str of the genomic sequence
        name: str of the chromosome name
        length: integer of sequence length
    """

    def __init__(self, sequence, chrom_name):
        """Return a spike-in chromosome object (does not call Chromosome.__init__, so no bed-file handling)."""
        self.sequence = sequence
        self.name = chrom_name
        self.length = len(sequence)
class Domain(object):
    """A domain in the genome sequence.

    Attributes:
        name: name or number of the domain
        protein_list: list of proteins
        n_proteins: integer number of proteins
        protein_positions: list of protein positions
        protein_offset_positions: list of the protein offset positions from domain start
        n_reps_sample1: number of replicates in sample1
        n_reps_sample2: number of replicates in sample2
    """

    def __init__(self, n_reps_sample1, n_reps_sample2, name):
        """Return an empty domain object."""
        self.name = name
        self.protein_list = []
        self.n_proteins = 0
        self.protein_positions = []
        self.protein_offset_positions = []
        self.n_reps_sample1 = n_reps_sample1
        self.n_reps_sample2 = n_reps_sample2

    def init_proteins(self, genome_sequence, valid_regions, bins, prot_count_n, prot_count_p, protein_size, frag_len_max, frag_dist_on, frag_dist_mn_mean,
            frag_dist_mn_cov, frag_dist_prob, prot_dist_mn_mean, prot_dist_mn_cov, frag_count_sh, frag_count_sc, frag_count_op, frag_count_om, frag_count_os,
            frag_count_scaling, frag_count_lp_sc, frag_count_ex_lo, frag_count_ex_sc, beta_values, frag_len_mean, frag_len_dev,
            skewness, valid_regions_array):
        """Create the proteins (each with its fragments) at valid positions inside this domain."""
        if len(self.protein_list) != 0:
            print("Proteins already initialized")
            return
        # desired number of proteins; may shrink if positions hit a blacklisted region
        tmp_n_proteins = self._sample_neg_bin(prot_count_n, prot_count_p)
        # find positions inside valid regions
        self.protein_positions, self.n_proteins, self.protein_offset_positions = self._get_protein_positions(tmp_n_proteins, protein_size, prot_dist_mn_mean,
            prot_dist_mn_cov, genome_sequence, valid_regions, bins, frag_len_mean, frag_len_max, valid_regions_array)
        for prot_count, pos in enumerate(self.protein_positions, start=1):
            # init one protein and all fragments in it
            proteinobj = Protein(pos, self.n_reps_sample1, self.n_reps_sample2, self.name, prot_count)
            proteinobj.init_fragments(genome_sequence, frag_count_sh, frag_count_sc, frag_count_op, frag_count_om, frag_count_os,
                frag_count_scaling, frag_count_lp_sc, frag_count_ex_lo, frag_count_ex_sc, beta_values,
                frag_len_mean, frag_len_dev, frag_len_max, frag_dist_on, frag_dist_mn_mean, frag_dist_mn_cov, frag_dist_prob, protein_size, skewness)
            self.protein_list.append(proteinobj)

    def _get_protein_positions(self, tmp_n_proteins, protein_size, prot_dist_mn_mean, prot_dist_mn_cov, genome_sequence, valid_regions, bins, frag_len_mean,
            frag_len_max, valid_regions_array):
        """Find positions for the proteins inside valid regions; retries up to MAX_TEST times."""
        count = 0
        while count < MAX_TEST:
            count += 1
            # sample distances between neighbouring proteins in this domain
            first_protein_offset_positions = self._sample_multivariate_normal(tmp_n_proteins, protein_size, prot_dist_mn_mean, prot_dist_mn_cov)
            # place the leftmost protein somewhere with a valid surrounding sequence
            protein_left_position = self._get_left_protein_position(genome_sequence, tmp_n_proteins, int(max(first_protein_offset_positions)), valid_regions, bins, frag_len_mean, frag_len_max, protein_size)
            # check positions 1..n for valid regions; invalid ones are simply omitted
            protein_positions, n_proteins, protein_offset_positions = self._get_check_offset_positions(protein_left_position, first_protein_offset_positions, valid_regions_array, protein_size, frag_len_mean)
            if n_proteins > 0:  # we found at least one valid position
                return protein_positions, n_proteins, protein_offset_positions
        # NOTE(review): relies on the module-level option parser being initialized
        parser.error("Could not find at least one valid protein position please check the bed-file (blacklist/whitelist) and the parameters regarding protein size and number.")

    def _sample_multivariate_normal(self, k, protein_size, prot_dist_mn_mean, prot_dist_mn_cov):
        """Return k samples (protein-to-protein offsets) from a multivariate normal distribution."""
        res = [max(1, int(choice(list(row)))) for row in multivariate_normal(prot_dist_mn_mean, prot_dist_mn_cov, k)]
        res.insert(0, int(protein_size / 2))  # make sure the first offset is at least half the protein size
        res.pop()  # drop the last element so the number of proteins is unchanged
        return res

    def _get_left_protein_position(self, genome_seq, number_proteins, max_offset, valid_regions, bins, frag_len_mean, frag_len_max, protein_size):
        """Return a position for the leftmost protein whose surrounding sequence is pure ACGT."""
        offset = (number_proteins - 1) * max_offset  # theoretical maximal span of all proteins
        count = 0
        while count < MAX_TEST:  # test at most MAX_TEST candidate positions
            count += 1
            pos = self._get_random_position(valid_regions, bins)  # one position in a randomly selected valid region
            # reject the candidate if the covered sequence contains anything but A/C/G/T (e.g. N)
            tmp_seq = genome_seq[pos - frag_len_max: pos + frag_len_max + offset]
            if tmp_seq.count('A') + tmp_seq.count('C') + tmp_seq.count('G') + tmp_seq.count('T') == len(tmp_seq):
                return pos
        parser.error("Could not find a sequence with valid nucleotides, please check the provided fasta file and or bed-file (blacklist/whitelist).")

    def _sample_neg_bin(self, n, p):
        """Sample the negative binomial distribution (at least 1)."""
        return int(max(np.random.negative_binomial(n, p), 1))

    def _get_random_position(self, valid_regions, bins):
        """Return a random position inside one region, chosen with length-proportional probability via bins."""
        chosen_region = np.digitize(random_sample(1), bins)[0].item()  # pick one random bin, convert numpy int to int
        return randint(valid_regions[chosen_region].start, valid_regions[chosen_region].stop)

    def _get_check_offset_positions(self, protein_left_position, protein_offset_positions, valid_regions_array, protein_size, frag_len_mean):
        """Check whether the offset positions relative to the left position lie in valid regions; drop those that do not."""
        new_protein_offset_positions = []
        approx_dif = abs(frag_len_mean - protein_size) // 2  # estimated shift of fragments around the protein
        n_proteins = len(protein_offset_positions)
        protein_absolute_positions = protein_left_position + np.cumsum(protein_offset_positions)  # offsets -> absolute positions
        rm_list = []
        rm_index = 0
        for offset_pos in protein_absolute_positions:
            isgood = True
            start = int(offset_pos - approx_dif)
            stop = int(offset_pos + approx_dif)
            for i in range(start, stop + 1):
                if valid_regions_array[i] == False:  # overlap with a blacklisted position
                    isgood = False
                    n_proteins -= 1
                    rm_list.append(rm_index)
                    break
            if isgood:  # the whole span is valid
                new_protein_offset_positions.append(offset_pos)
            rm_index += 1
        protein_offset_positions = np.delete(protein_offset_positions, rm_list)  # remove dropped proteins from the offset list too
        return new_protein_offset_positions, n_proteins, protein_offset_positions

    def wipe_data(self):
        """Removes all Proteins and Fragments."""
        for protein in self.protein_list:
            protein.wipe_data()
class Protein(object):
    """A protein in a domain.

    Attributes:
        start: start position (most left (5') fragment)
        stop: stop position (most right (3') fragment)
        number_frags: number of fragments
        sample1_mean_fragment_count: mean fragment count in sample1
        sample2_mean_fragment_count: mean fragment count in sample2
        fragment_count: fragment count in this object
        name: name or id of the protein
        domain_name: name or id of its domain
        position: position of the protein (summit of the fragments)
        n_replicates_sample1: number of replicates in sample1
        n_replicates_sample2: number of replicates in sample2
        sample1_replicate_counts: counts of fragments per replicate in sample1
        sample2_replicate_counts: counts of fragments per replicate in sample2
        sample1_replicate_list: list of the fragment objects in the respective replicates of sample1
        sample2_replicate_list: list of the fragment objects in the respective replicates of sample2
    """

    def __init__(self, position, n_reps_sample1, n_reps_sample2, domain_name, name):
        """Return an empty protein object at the given position."""
        self.start = 0
        self.stop = 0
        self.number_frags = 0
        self.sample1_mean_fragment_count = 0
        self.sample2_mean_fragment_count = 0
        self.fragment_count = 0
        self.name = name
        self.domain_name = domain_name
        self.position = position
        self.n_replicates_sample1 = n_reps_sample1
        self.n_replicates_sample2 = n_reps_sample2
        self.sample1_replicate_counts = [0] * self.n_replicates_sample1
        self.sample2_replicate_counts = [0] * self.n_replicates_sample2
        self.sample1_replicate_list = [[] for _ in range(self.n_replicates_sample1)]
        self.sample2_replicate_list = [[] for _ in range(self.n_replicates_sample2)]

    def init_fragments(self, genome_sequence, frag_count_sh, frag_count_sc, frag_count_op, frag_count_om, frag_count_os,
            frag_count_scaling, frag_count_lp_sc, frag_count_ex_lo, frag_count_ex_sc, beta_values,
            frag_len_mean, frag_len_dev, frag_len_max, frag_dist_on, frag_dist_mn_mean, frag_dist_mn_cov, frag_dist_prob, protein_size, skewness):
        """Create all fragments of this protein and distribute them over samples and replicates."""
        if self.fragment_count != 0:
            print("Fragments already initialized")
            return
        min_protein_pos = 0  # outermost fragment edges, to derive peak start/stop
        max_protein_pos = 0
        sample1_list = []
        sample2_list = []
        sample1_count = 0
        sample2_count = 0
        # number of fragments per protein: mixture of lognormal (probability frag_count_op) and gamma
        if np.random.random() <= frag_count_op:
            self.number_frags = np.random.lognormal(mean=frag_count_om, sigma=frag_count_os, size=1)  # lognormal branch
        else:
            self.number_frags = np.random.gamma(shape=frag_count_sh, scale=frag_count_sc, size=1)  # gamma branch
        self.number_frags = max(1, int(self.number_frags))  # at least one fragment
        # read fraction for sample1 (sample2 gets 1 - x)
        # bug fix: use the beta_values parameter instead of the global options object
        read_frac = np.random.beta(beta_values[0], beta_values[1])
        # choose how the number of fragments / read fraction are scaled
        if frag_count_scaling == "none":
            pass  # no scaling: neither number_frags nor read_frac change
        elif frag_count_scaling == "beta":  # number_frags scaled via Laplace pdf of read_frac
            self.number_frags = self._scale_laplace(self.number_frags, read_frac, frag_count_lp_sc)
        elif frag_count_scaling == "frag":  # read_frac scaled via exponential pdf of number_frags
            read_frac = self._scale_exp(self.number_frags, read_frac, frag_count_ex_lo, frag_count_ex_sc)
        else:
            print("Unknown scaling method, %s, please choose 'none','frag' or 'beta', exiting now" % (frag_count_scaling))
            exit(1)
        # distribute the fragments over the two samples with all their replicates
        self.number_frags = self.number_frags * (self.n_replicates_sample1 + self.n_replicates_sample2)
        for _ in range(self.number_frags):
            # init one fragment with position, sequence and type
            fragmentobj = Fragment(self.position, genome_sequence, "p", frag_len_mean, frag_len_dev, frag_len_max, frag_dist_on, frag_dist_mn_mean,
                frag_dist_mn_cov, frag_dist_prob, protein_size, self.domain_name, self.name)
            # track min and max fragment positions to get peak start and stop
            if min_protein_pos == 0 or (fragmentobj.position - fragmentobj.frag_length // 2) < min_protein_pos:
                min_protein_pos = fragmentobj.position - fragmentobj.frag_length // 2
            if max_protein_pos == 0 or (fragmentobj.position + fragmentobj.frag_length // 2) > max_protein_pos:
                max_protein_pos = fragmentobj.position + fragmentobj.frag_length // 2
            if random() <= read_frac:  # assign to sample1 with probability read_frac
                sample1_count += 1
                sample1_list.append(fragmentobj)
            else:
                sample2_count += 1
                sample2_list.append(fragmentobj)
            self.fragment_count += 1
        # save peak positions
        self.start = min_protein_pos
        self.stop = max_protein_pos
        # distribute each sample's fragments over its replicates
        self.sample1_replicate_list, self.sample1_replicate_counts = self._fragments_to_replicates(sample1_list, skewness, self.n_replicates_sample1)
        self.sample2_replicate_list, self.sample2_replicate_counts = self._fragments_to_replicates(sample2_list, skewness, self.n_replicates_sample2)
        # mean counts per sample
        self.sample1_mean_fragment_count = sample1_count / self.n_replicates_sample1
        self.sample2_mean_fragment_count = sample2_count / self.n_replicates_sample2

    def _fragments_to_replicates(self, fragment_list, skewness, n_replicates):
        """Put each fragment into a randomly selected replicate (Dirichlet-weighted)."""
        # sample a Dirichlet distribution and build cumulative probability bins starting at 0 and summing to 1
        bins = np.cumsum([0] + list(dirichlet(np.array([skewness] * n_replicates), 1)[0]))
        replicate_counts = [0] * n_replicates
        replicate_list = [[] for _ in range(n_replicates)]
        for fragmentobj in fragment_list:
            index = np.digitize([random()], bins)[0].item() - 1  # bin index of a uniform random number
            replicate_list[index].append(fragmentobj)
            replicate_counts[index] += 1
        return replicate_list, replicate_counts  # fragments and their counts per replicate

    def _scale_laplace(self, nfrags, read_frac, scale):
        """Scale the fragment count by a Laplace pdf of read_frac (normalized to 1.0 at 0.5)."""
        loc = 0.5  # fixed centre
        scale_factor = 1 / laplace.pdf(0.5, loc, scale)  # normalize so pdf(0.5) corresponds to 100%
        nfrags_scale = laplace.pdf(read_frac, loc, scale) * scale_factor  # larger deviation from 0.5 -> smaller count
        if nfrags_scale != 0:
            nfrags = int(nfrags * nfrags_scale)  # new fragment count scaled by the beta result
        return nfrags

    def _scale_lognorm(self, nfrags, read_frac, sigma, loc, scale):
        """Scale read_frac toward 0.5 by a lognormal pdf of nfrags (normalized to 1.0 at nfrags = 1)."""
        scale_factor = 1 / sp.stats.lognorm.pdf(1, s=sigma, loc=loc, scale=scale)  # normalize pdf(1) to 100%
        beta_scale = sp.stats.lognorm.pdf(nfrags, s=sigma, loc=loc, scale=scale) * scale_factor
        if beta_scale != 0:
            # centre on 0.5 and shrink the deviation, pulling read_frac toward 0.5 for high counts
            read_frac = ((read_frac - 0.5) * beta_scale) + 0.5
        return read_frac

    def _scale_exp(self, nfrags, read_frac, loc, scale):
        """Scale read_frac toward 0.5 by an exponential pdf of nfrags (normalized to 1.0 at nfrags = loc)."""
        scale_factor = 1 / sp.stats.expon.pdf(loc, loc=loc, scale=scale)  # normalize pdf(loc) to 100%
        beta_scale = sp.stats.expon.pdf(nfrags, loc=loc, scale=scale) * scale_factor
        if beta_scale != 0:
            # centre on 0.5 and shrink the deviation, pulling read_frac toward 0.5 for high fragment counts
            read_frac = ((read_frac - 0.5) * beta_scale) + 0.5
        return read_frac

    def _get_factor(self, x, loc=0.5, scale=0.2):
        """Laplace pdf helper for the distribution into replicates."""
        return laplace.pdf(x, loc, scale)

    def __lt__(self, other):
        """Order proteins by start position (for sorting)."""
        return self.start < other.start

    def wipe_data(self):
        """Removes all Fragments."""
        self.sample1_replicate_counts = []
        self.sample2_replicate_counts = []
        self.sample1_replicate_list = []
        self.sample2_replicate_list = []
class Fragment(object):
    """A single sequenced fragment belonging to a protein.

    Attributes:
        domain_name: name or id of the domain
        protein_name: name or id of the protein
        frag_length: int of the length of the fragment
        rand_shift: int of the shift inside the protein
        position: int of the final position of the fragment
        sequence: string of the sequence of the fragment
        frag_type: string of the fragment type
    """

    def __init__(self, prot_position, genome_seq, frag_type, frag_len_mean, frag_len_dev, frag_len_max, frag_dist_on, frag_dist_mn_mean, frag_dist_mn_cov,
            frag_dist_prob, protein_size, domain_name, protein_name):
        """Return a fragment object placed relative to its protein position."""
        self.domain_name = domain_name
        self.protein_name = protein_name
        # draw the fragment length first
        self.frag_length = self._sample_frag_length(frag_len_mean, frag_len_dev, frag_len_max)
        span = abs(frag_len_mean - protein_size)
        if frag_dist_on:
            # shift drawn from the user-defined multivariate distribution,
            # applied starting from the leftmost possible position
            drawn_shift = self._sample_multivariate_normal(protein_size, frag_dist_mn_mean, frag_dist_mn_cov, frag_dist_prob)
            self.rand_shift = int(-span // 2) + drawn_shift
        else:
            # uniformly random shift within the protein span
            self.rand_shift = randint(-span // 2, span // 2)
        self.position = prot_position + self.rand_shift  # final position of the fragment
        half_len = self.frag_length // 2
        self.sequence = genome_seq[self.position - half_len: self.position + half_len]
        self.frag_type = frag_type

    def _sample_frag_length(self, mean, dev, frag_len_max):
        """Draw a fragment length from a normal distribution, capped at frag_len_max."""
        drawn = np.random.normal(loc=mean, scale=dev)
        return min(abs(int(drawn)), frag_len_max)

    def _sample_multivariate_normal(self, protein_size, frag_dist_mn_mean, frag_dist_mn_cov, frag_dist_prob):
        """Draw one value from the multivariate normal, clamped to [1, protein_size]."""
        samples = multivariate_normal(frag_dist_mn_mean, frag_dist_mn_cov)
        picked = int(choice(samples, p=frag_dist_prob))
        # choose a random element from the n distributions; at least 1, at most the protein size
        return int(min(max(picked, 1), protein_size))
class NoiseFragment(Fragment):
    """A noise fragment it has the following attributes:
    Attributes:
        position: integer of the final position of the fragment
        frag_length: integer of the length of the fragment
        sequence: string of the sequence of the fragment
        frag_type: string of the fragment type
    """
    def __init__(self, position, genome_seq, frag_type, frag_len_mean, frag_len_dev, frag_len_max):
        """Return a noise fragment at a fixed position (no protein shift is applied)."""
        self.position = position
        #length sampling is inherited from Fragment
        self.frag_length = self._sample_frag_length(frag_len_mean, frag_len_dev, frag_len_max)
        half = self.frag_length//2
        self.sequence = genome_seq[self.position - half : self.position + half]
        self.frag_type = frag_type
class Sample(object):
    """A sample which holds all replicates with fragments per replicate, it has the following attributes:
    Attributes:
        replicate_list: list of replicate-objects
        n_replicates: number of replicates
        sample_id: string of the id/name
        chromosome: str of the chromosome name
        weighted_bins: list of weights of the bins of the chromosome
        noise_regions: list of the regions the weighted bins are there for
        myinput: input-object for this sample
    """
    def __init__(self, chromosome, n_replicates, sample_id, weighted_bins, noise_regions):
        """Return a sample object"""
        self.n_replicates = n_replicates
        self.sample_id = sample_id
        self.chromosome = chromosome
        #previously weighted_bins/noise_regions were assigned an empty list and then
        #immediately overwritten with the parameters; only the meaningful assignment is kept
        self.weighted_bins = weighted_bins
        self.noise_regions = noise_regions
        self.myinput = Input()
        #replicate ids start with 1 and increase per replicate
        self.replicate_list = [Replicate(rep_id) for rep_id in range(1, n_replicates + 1)]
    def get_fragment_number(self):
        """Returns the total number of fragments over all replicates of this sample"""
        return sum(replicate.get_fragment_number() for replicate in self.replicate_list)
    def add_db_fragments(self, fragment_list, index):
        """Adds a fragment-list from a db peak to the chosen index of the replicates"""
        self.replicate_list[index].add_fragments(fragment_list) #add fragment
    def add_noise(self, back_avg, read_length, frag_len_mean, frag_len_dev, frag_len_max):
        """Adds noise to all replicates of this sample"""
        for rep in self.replicate_list:
            print("Adding noise fragments to sample %s replicate %s" % (self.sample_id, rep.replicate_id))
            rep.add_noise(self.chromosome, self.weighted_bins, self.noise_regions, back_avg, read_length, frag_len_mean, frag_len_dev, frag_len_max)
    def write_fasta(self, prefix, read_length, read_count, append = False):
        """Write reads of fragments in the replicates to a fasta file; returns the updated running read_count"""
        for rep in self.replicate_list: #write fasta per replicate
            read_count = rep.write_fasta(self.sample_id, prefix, read_length, read_count, append)
        self.myinput.write_fasta(self.sample_id, prefix, read_length) #write input fasta
        return read_count
    def add_input(self, frag_len_mean, frag_len_dev, frag_len_max, back_avg, read_length):
        """Adds input to this sample"""
        print("Adding Input fragments to sample %s " % (self.sample_id))
        self.myinput.create_fragments(self.chromosome, self.weighted_bins, self.noise_regions, frag_len_mean, frag_len_dev, frag_len_max, back_avg, read_length)
    def add_spike_in(self, spike_in_chrom, si_weighted_bins, si_regions, spike_in_cov, read_length, frag_len_mean, frag_len_dev, frag_len_max):
        """Adds spike-in fragments to all replicates of this sample and to its input"""
        for rep in self.replicate_list:
            print("Adding spike-in fragments to sample %s replicate %s" % (self.sample_id, rep.replicate_id))
            rep.add_spike_in(spike_in_chrom, si_weighted_bins, si_regions, spike_in_cov, read_length, frag_len_mean, frag_len_dev, frag_len_max)
        print("Adding spike-in fragments to Input of sample %s " % (self.sample_id))
        self.myinput.add_spike_in(spike_in_chrom, si_weighted_bins, si_regions, spike_in_cov, read_length, frag_len_mean, frag_len_dev, frag_len_max)
class Replicate(object):
    """A replicate with its fragments, it has the following attributes:
    Attributes:
        fragment_list: list of fragment-objects
        n_peak_fragments: integer of number fragments from peaks
        n_noise_fragments: integer of number fragments from noise
        n_spike_in_fragments: integer of number of spike-in fragments
        replicate_id: str of the id/name
    """
    def __init__(self, replicate_id):
        """Return a replicate object"""
        self.fragment_list = []
        self.n_peak_fragments = 0
        self.n_noise_fragments = 0
        self.n_spike_in_fragments = 0
        self.replicate_id = replicate_id
    def get_fragment_number(self):
        """Return number of fragments in this replicate"""
        return len(self.fragment_list)
    def add_fragments(self, fragment_list):
        """Add a list of (peak) fragments to this replicate"""
        self.fragment_list.extend(fragment_list)
        self.n_peak_fragments += len(fragment_list)
    def add_noise(self, chrom, bins, noise_regions, back_avg, read_length, frag_len_mean, frag_len_dev, frag_len_max):
        """Add noise fragments to this replicate, drawn from the weighted noise regions"""
        number_noise_reads = int(back_avg * chrom.length // read_length) #only the provided genome coverage is taken into account
        s = (int(max(bins))-1) * np.random.random_sample(number_noise_reads) #random draw per noise-fragment
        chosen_regions = np.digitize(s, bins) #get indices of bins where the reads belong
        for cri in chosen_regions:
            pos = randint(noise_regions[cri].start, noise_regions[cri].stop) #random position inside the region
            #create new noise fragment with position, sequence and type
            self.fragment_list.append(NoiseFragment(pos, chrom.sequence, "n", frag_len_mean, frag_len_dev, frag_len_max))
            self.n_noise_fragments += 1
    def add_spike_in(self, spike_in_chrom, si_weighted_bins, si_regions, spike_in_cov, read_length, frag_len_mean, frag_len_dev, frag_len_max):
        """Add spike-in fragments to this replicate, drawn from the spike-in chromosome"""
        number_spike_in_reads = int(spike_in_cov * spike_in_chrom.length // read_length) #only the provided genome coverage is taken into account
        s = (int(max(si_weighted_bins))-1) * np.random.random_sample(number_spike_in_reads) #random draw per spike-in fragment
        chosen_regions = np.digitize(s, si_weighted_bins) #get indices of bins where the reads belong
        for cri in chosen_regions:
            pos = randint(si_regions[cri].start, si_regions[cri].stop) #random position inside the region
            #create spike-in fragment, with position, sequence and type
            self.fragment_list.append(NoiseFragment(pos, spike_in_chrom.sequence, "s", frag_len_mean, frag_len_dev, frag_len_max))
            self.n_spike_in_fragments += 1
    def write_fasta(self, sample_id, prefix, read_length, read_count, append = False):
        """Write reads of the fragments to a fasta file; always returns the updated read_count"""
        if not self.fragment_list:
            print("No fragments in replicate to write")
            #BUG FIX: previously fell through returning None, which clobbered the caller's running count
            return read_count
        name = prefix + "_sample%s-rep%s.fasta" %(sample_id, self.replicate_id)
        print("Writing reads to fasta file: %s" % (name))
        #use a context manager so the file handle is always closed (was leaked before)
        with open(name, "a" if append else "w") as fastafile:
            for frag in self.fragment_list: #randomly choose if forward or reverse
                if random() < 0.5: #forward
                    seq = frag.sequence[:read_length]
                    if len(seq) != read_length:
                        seq = seq.rjust(read_length,'N') #if it is shorter than the read length fill it up with N´s
                else: #reverse
                    seq = frag.sequence[len(frag.sequence) - read_length:]
                    if len(seq) != read_length:
                        seq = seq.rjust(read_length,'N')
                    seq = self._reverse_complement(seq)
                #write
                if(frag.frag_type == "p"): #add more domain and protein infos
                    fastafile.write(">read%s_%s_d%sp%s\n%s\n" %(read_count, frag.frag_type, frag.domain_name, frag.protein_name, seq))
                else:
                    fastafile.write(">read%s_%s\n%s\n" %(read_count, frag.frag_type, seq))
                read_count += 1
        return read_count
    def _reverse_complement(self, sequence, rev=True):
        """Return the reverse complement of a DNA sequence; on unknown bases return the input unchanged"""
        _complement = dict(A="T", T="A", C="G", G="C", N="N")
        reverse = reversed(sequence) if rev else sequence
        try:
            #BUG FIX: the join must happen inside the try; the original wrapped only the lazy
            #generator creation, so the KeyError escaped uncaught at join time
            return "".join(_complement[x] for x in reverse)
        except KeyError:
            print("Sequence %s could not be converted to its reverse complement" % sequence)
            return sequence
class Input(Replicate):
    """The input with fragments, it has the following attributes:
    Attributes:
        fragment_list: list of fragment-objects
        n_fragments: integer of number fragments
        n_spike_in_fragments: integer of spike-in fragments
    """
    def __init__(self):
        """Return an input object"""
        self.fragment_list = []
        self.n_fragments = 0
        self.n_spike_in_fragments = 0
    def create_fragments(self, chrom, bins, noise_regions, frag_len_mean, frag_len_dev, frag_len_max, back_avg, read_length):
        """Add background fragments to this input, drawn from the weighted noise regions"""
        number_noise_reads = int(back_avg * chrom.length // read_length) #only the provided genome coverage is taken into account
        s = (int(max(bins))-1) * np.random.random_sample(number_noise_reads) #random draw per fragment
        chosen_regions = np.digitize(s, bins) #get indices of bins where the reads belong
        for cri in chosen_regions:
            pos = randint(noise_regions[cri].start, noise_regions[cri].stop) #random position inside the region
            #create new input fragment with position, sequence and type
            self.fragment_list.append(NoiseFragment(pos, chrom.sequence, "i", frag_len_mean, frag_len_dev, frag_len_max))
            self.n_fragments += 1
    def write_fasta(self, sample_id, prefix, read_length):
        """Write reads of the fragments to a fasta file"""
        if not self.fragment_list: #if no input is required there are no fragments so do not write a fasta
            return
        name = prefix + "_sample%s-INPUT.fasta" %(sample_id)
        print("Writing reads to fasta file: %s" % (name))
        #use a context manager so the file handle is always closed (was leaked before)
        with open(name, "w") as fastafile:
            entry_count = 1
            for frag in self.fragment_list:
                if random() < 0.5: #randomly choose if forward or reverse
                    seq = frag.sequence[:read_length]
                    if len(seq) != read_length:
                        seq = seq.rjust(read_length,'N') #if it is shorter than the read length fill it up with N´s
                else:
                    seq = frag.sequence[len(frag.sequence) - read_length:]
                    if len(seq) != read_length:
                        seq = seq.rjust(read_length,'N')
                    seq = self._reverse_complement(seq)
                #the forward/reverse branches shared identical write statements; merged into one
                fastafile.write(">read%s_%s\n%s\n" %(entry_count, frag.frag_type, seq))
                entry_count += 1
#helper functions
def _progress_bar(count, total, suffix=''):
    """Simple in-place progress bar: takes the number of completed tasks and the total task count"""
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))
    percents = round(100.0 * count / float(total), 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)
    #build the whole line first, then emit it with a single write
    if(suffix == ''):
        line = '[%s] %s%s \r' % (bar, percents, '%')
    else:
        line = '[%s] %s%s ...%s\r' % (bar, percents, '%', suffix)
    sys.stdout.write(line)
    sys.stdout.flush()
def _init_add_domains_mp(counter, parametersobj, results_list):
"""Initialize domains for multiprocessing"""
tmp_results_list = [] #store created object in this list
while True:
domain_number = counter.increment() #add 1 to the counter, one more domain done...
if(domain_number == -1): #reached maximum end here...
break
sequence, valid_regions, bins, n_reps_sample1, n_reps_sample2, prot_count_n, prot_count_p, protein_size, frag_len_max, frag_dist_on, frag_dist_mn_mean, \
frag_dist_mn_cov, frag_dist_prob, prot_dist_mn_mean, prot_dist_mn_cov, frag_count_sh, frag_count_sc, frag_count_op, frag_count_om, frag_count_os, \
frag_count_scaling, frag_count_lp_sc, frag_count_ex_lo, frag_count_ex_sc, beta_values, frag_len_mean, frag_len_dev, skewness, \