Skip to content

Commit e300970

Browse files
added VCF option
1 parent 50243c5 commit e300970

9 files changed

Lines changed: 128 additions & 559 deletions

necessaryInputFiles/estimatedScalesPerMotif_1.9.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -608,7 +608,6 @@ YY2 0.36414221661192414 0.1197822205054896 0.36914221661192415 0.004550173412417
608608
ZNF263 0.3664249734204516 0.6911615070060347 0.3814249734204516 0.0009356869382043845 10.0 180347
609609
CREM 0.31219331330440686 0.036600094377248524 0.31719331330440687 0.016669345370631257 10.0 180347
610610
BNC2 0.4000797155770753 0.03603972115491639 0.3950797155770753 0.003165480716324813 9.0 180347
611-
CTCF(MA1929.1) 0.005439814961248099 352.3756457725069 0.015439814961248098 1.7979520750635232 33.0 180347
612611
CTCF(MA1930.1) 0.21864908904955682 4.5707367694603995 0.2386490890495568 0.038382574541625074 34.0 180347
613612
ELK1::HOXA1 0.36324707598009787 0.0075736777540162704 0.36324707598009787 0.0075736777540162704 14.0 180347
614613
ELK1::HOXB13 0.3268067749488819 0.0067727454898398745 0.3268067749488819 0.0067727454898398745 15.0 180347

src/HandleInOutput.hpp

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@
1313
#include <getopt.h> //einlesen der argumente
1414
#include <unordered_map>
1515
#include <random>
16+
17+
18+
//own classes
19+
#include "callBashCommand.hpp"
20+
1621
using namespace std;
1722

1823
class InOutput{
@@ -36,6 +41,7 @@ class InOutput{
3641
void readScaleValues(string scaleFile, unordered_map<string, double>& scales);
3742
void getInfoSNPs(vector<string>& leadSNPs, unordered_map<string, vector<string>>& proxySNPs);
3843
int getNumberSNPs(string inputFile);
44+
void fileFormatVCF();
3945

4046
//getter
4147
double getPvalue();
@@ -79,7 +85,7 @@ class InOutput{
7985

8086
private: //glaube das sollte nicht private sein
8187
int num_threads = 1; //-n
82-
double pvalue = 0.05; //-p
88+
double pvalue = 0.5; //-p
8389
double pvalue_diff = 0.01; //-c
8490
string frequence = ""; //-b
8591
string footprint = ""; //-f path to footprint file
@@ -263,15 +269,17 @@ void InOutput::parseInputPara(int argc, char *argv[]){
263269
}
264270

265271
PFMs = argv[optind++];
266-
cout <<"PFM dir: " << PFMs << endl;
267-
272+
cout <<"PFM dir: " << PFMs << endl;
268273
snpsNotUnique = argv[optind++];
269274
snps = outputDir + "SNPsUnique.bed";
270275
cout <<"SNP file: " << snpsNotUnique << endl;
271276
genome = argv[optind++];
272277
cout << "genome file: " << genome << endl;
273278
scaleFile = argv[optind++];
274279
cout << "scale file: " << scaleFile << endl;
280+
281+
282+
275283
}
276284

277285
ostream& operator<< (ostream& os, InOutput& io){
@@ -342,6 +350,27 @@ int InOutput::getNumberSNPs(string inputFile){
342350
}
343351

344352

353+
// check if the snp file is in vcf format and if so parse in bed-like format
354+
void InOutput::fileFormatVCF(){
355+
356+
// is snp file in bed-like format or VCF format?
357+
string formatedSNPFile = outputDir + "formatedSNPs.txt";
358+
string helper = snpsNotUnique; // copy of the file name
359+
char delim = '.';
360+
// check if file ending is vcf
361+
getToken(helper, delim); // split file name at point
362+
//cout << helper << endl;
363+
if (helper == "vcf" or helper == "VCF"){
364+
//string formatedSNPFile = outputDir + "formatedSNPs.txt";
365+
// call python script to parse vcf to bed-like format
366+
BashCommand bc_(getGenome()); //constructor bashcommand class
367+
bc_.callFormatingScript(snpsNotUnique, formatedSNPFile);
368+
snpsNotUnique = formatedSNPFile;
369+
}
370+
cout << "SNP file after VCF check: " << snpsNotUnique << endl;
371+
372+
}
373+
345374
//TODO:kommentieren
346375
void InOutput::parseSNPsBedfile(string inputFile, int entriesSNPFile){
347376

@@ -516,7 +545,7 @@ void InOutput::callHelp(){
516545
cout << "Call program with ./src/differentialBindingAffinity_multipleSNPs\noptinal parameters:\n" <<
517546
"-o outputDir (default SNEEP_output/, if you want to specific it, it must be done as first argument)\n" <<
518547
"-n number threads (default 1)\n" <<
519-
"-p pvalue for motif hits (default 0.05)\n"<<
548+
"-p pvalue for motif hits (default 0.5)\n"<<
520549
"-c pvalue differential binding (default 0.01)\n" <<
521550
"-b base frequency for PFMs -> PWMs ( /necessaryInputFiles/frequency.txt)\n" <<
522551
//"-s file where the computed scales per motif are stored ( necessaryInputFiles/estimatedScalesPerMotif_1.9.txt) \n" <<

src/callBashCommand.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ class BashCommand{
3434
void sed(string options, string input, string output);
3535
//void callHistogram(string input, string output, string sourceDir);
3636
string getGenomeFile();
37+
void callFormatingScript(string file, string formatedSNPFile);
3738

3839
private:
3940
// string path_bedtools;
@@ -178,6 +179,14 @@ void BashCommand::bedtoolsShuffle(string randomSequences, string genomesFile, st
178179
//
179180
//}
180181

182+
void BashCommand::callFormatingScript(string file,string formatedSNPFile){
183+
184+
string command = "formatVCF.py " + file + " " + formatedSNPFile;
185+
//cout << command << endl;
186+
system(command.c_str());
187+
return;
188+
}
189+
181190
void BashCommand::anyCommand(string command){
182191
system(command.c_str());
183192
//cout << command<< endl;

src/differentialBindingAffinity_multipleSNPs.cpp

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
#include <math.h>
3131

3232
//for parallelization
33-
#include <omp.h>
33+
//#include <omp.h>
3434

3535
using namespace std;
3636

@@ -70,7 +70,7 @@ double cdf_laplace_abs_max(double scale, double numberKmers, double value);
7070

7171
int main(int argc, char *argv[]){
7272

73-
// cout << "HELLO" << endl;
73+
cout << "HELLO github sneep version" << endl;
7474

7575
//to output doubles whith a total of 17 digits
7676
typedef numeric_limits< double > dbl;
@@ -85,13 +85,16 @@ int main(int argc, char *argv[]){
8585
return (0);
8686
}
8787

88-
//create instance bashCommand
88+
//create instance bashCommand -> moved to handleInOutput.hpp -> parseInputPara
8989
BashCommand bc(io.getGenome()); //constructor bashcommand class
9090
bc.mkdir(io.getOutputDir(), "-p", true); //make outputDir
9191
ofstream info = io.openFile(io.getInfoFile(), false); //open info file
9292
info << io << endl; // write settings
9393
info.close();
9494

95+
// check if snps file is bed-like format or CVF
96+
io.fileFormatVCF(); // if file is CVF format parse to bed-like format and store new file in outputDir as formatedSNPs.txt
97+
9598
io.checkIfSNPsAreUnique(); // removes SNPs from inputSNP list which are not unique and stores them in info file
9699
vector<string>leadSNPs; // a list of all lead SNPs
97100
unordered_map<string, vector<string>> proxySNPs; // for each lead SNP a list of al proxy SNPs is provided
@@ -294,18 +297,33 @@ int main(int argc, char *argv[]){
294297
//calculate probabiblity for sequences of the current TF
295298
if (probProSeq(PWMs[j], wildtypeSeq, pvalues, io.getPvalue(), posMut, firstSeq) == 1){ //first_seq contains wildtype sequence
296299
sigHit = 1;
300+
}
301+
//print out pvalues of binding score for all kmers of the current TF for wildtype seq
302+
/* cout << motif << "\twiltype";
303+
for (auto i: firstSeq){
304+
cout << '\t' << i;
297305
}
306+
cout << "\n"; */
298307
//calculate pvalue of the mutated sequence
299308
if (probProSeq(PWMs[j], mutSeq, pvalues, io.getPvalue(), posMut, secondSeq) == 1){ //second_seq contain mutated sequence
300309
sigHit = 1;
301310
}
311+
//print out pvalues of binding score for all kmers of the current TF for alterative seq
312+
/*cout << motif << "\talternative";
313+
for (auto i: firstSeq){
314+
cout << '\t' << i;
315+
}
316+
cout << "\n";*/
317+
302318
//int maxDiff_index = 0;
303319
if (sigHit == 1){ //check if there is a sig hit in one of the kmers otherwise skip diffBind score and pvalue correction
304320
for (int l = 0; l < firstSeq.size(); ++l){
305321

306322
if (firstSeq[l] <= io.getPvalue() or secondSeq[l] <= io.getPvalue()){
307323
posSigHit = pos + floor(l/2); //deterime position of the hit
308324
diffBinding = differentialBindingAffinity(firstSeq[l], secondSeq[l], log_, scales[motif], lenMotif*2);
325+
//cout << diffBinding << "\t" << lenMotif << "\t" << motif << endl;
326+
309327
if (diffBinding <= maxDiffBinding){// and (firstSeq[l] <= io.getPvalue() or secondSeq[l] <= io.getPvalue())){
310328
maxDiffBinding = diffBinding;
311329
maxPosSigHit = posSigHit;

src/findZerosEachMotif.py

100644100755
File mode changed.

src/formatVCF.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#!/usr/bin/env python
2+
3+
import sys, os
4+
5+
if (len(sys.argv) < 3):
6+
print("python3 formatVCF.py inputFile, outputFile")
7+
else:
8+
inputFile = sys.argv[1]
9+
outputFile = sys.argv[2]
10+
11+
with open(inputFile, 'r') as i, open(outputFile, 'w') as o:
12+
for line in i:
13+
if line[0] == "#":
14+
continue
15+
else:
16+
line = line.strip().split('\t')
17+
chr_ = line[0]
18+
pos = line[1]
19+
rsid = line[2]
20+
if rsid ==".": # in bed-like file not given info is given as - (not . as in vcf file)
21+
rsid = "-"
22+
ref = line[3]
23+
alt = line[4]
24+
info = line[7]
25+
maf = "-1"
26+
stop = False
27+
if chr_ == "." or pos == "." or ref == "." or alt == ".": # important info missing -> skip this snp
28+
continue
29+
elif "," in ref and "," in alt: # if for the reference and the alternative allele more than one option is provided -> skip the snp
30+
continue
31+
else:
32+
## is maf given?
33+
if "MAF" in info:
34+
info = info.split("MAF=")
35+
maf = info[1]
36+
#print(maf)
37+
if ";" in maf:
38+
maf = maf.split(';')[0]
39+
# print(maf)
40+
## is there more than one alternative allele given?
41+
ref = ref.split(',')
42+
## is there more than one reference allele given?
43+
alt = alt.split(',')
44+
45+
## write output to new file
46+
if len(ref) > 1: # more than one reference allele but only one alternative allele
47+
for elem in ref:
48+
o.write("chr" + chr_ + '\t' + str(int(pos)-1) + '\t' + pos + '\t' + elem + '\t' + alt[0] + '\t'+ rsid + '\t' + maf + '\n')
49+
50+
elif len(alt) > 1: # more than one reference allele but only one alternative allele
51+
for elem in alt:
52+
o.write("chr" + chr_ + '\t' + str(int(pos)-1) + '\t' + pos + '\t'+ ref[0] + '\t' + elem + '\t'+ rsid + '\t' + maf + '\n' )
53+
else:
54+
o.write("chr" + chr_ + '\t' + str(int(pos)-1) + '\t' + pos + '\t' + ref[0] + '\t' + alt[0] + '\t'+ rsid + '\t' + maf + '\n')
55+
56+
57+
58+
59+
60+
61+
62+
63+
64+

0 commit comments

Comments
 (0)