Skip to content

Commit 3930e72

Browse files
committed
added --bam option for stdin streaming; now skipping invalid mappings (mapped_len<10 or pos<1)
1 parent 954b58c commit 3930e72

6 files changed

Lines changed: 73 additions & 71 deletions

File tree

gclib/GBam.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -250,11 +250,12 @@ GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
250250
this->add_aux(tag, atype, alen, adata);
251251
}//add_aux()
252252

253-
void interpret_CIGAR(char cop, int cl, int aln_start) {
254-
// NB: Closed ranges
253+
int interpret_CIGAR(char cop, int cl, int aln_start) {
254+
// returns the number of bases "aligned" (matches or mismatches) from the read
255255
// gpos = current genomic position (will end up as right coordinate on the genome)
256256
// rpos = read position (will end up as the length of the read)
257257
// cop = CIGAR operation, cl = operation length
258+
int mbases = 0; //count "aligned" bases (includes mismatches)
258259
int rpos = 0;
259260
int gpos = aln_start;
260261
int num_mismatches=0; //NM tag value = edit distance
@@ -267,7 +268,9 @@ switch (cop) {
267268
//printf("[%d-%d]", gpos, gpos + cl - 1);
268269
gpos+=cl;
269270
rpos+=cl;
271+
++mbases;
270272
break;
273+
271274
case BAM_CPAD:
272275
// printf("[%d-%d]", pos, pos + cl - 1); // Spans positions, No Coverage
273276
gpos+=cl;
@@ -282,6 +285,7 @@ switch (cop) {
282285
//soft clipped bases, present in SEQ
283286
rpos+=cl;
284287
break;
288+
285289
case BAM_CINS: // I
286290
// No Coverage
287291
// adds cl bases "throughput" but not genomic position "coverage" (gap in genomic seq)
@@ -312,6 +316,7 @@ switch (cop) {
312316
fprintf(stderr, "Unhandled cigar_op %d:%d\n", cop, cl);
313317
//printf("?");
314318
}
319+
return mbases;
315320
} // interpret_CIGAR(), just a reference of CIGAR operations interpretation
316321

317322
void GBamRecord::setupCoordinates() {
@@ -323,6 +328,7 @@ switch (cop) {
323328
memcpy(cigar, p, c->n_cigar * sizeof(uint32_t));
324329
//--- UBsan protection end
325330
int l=0;
331+
mapped_len=0;
326332
start=c->pos+1; //genomic start coordinate, 1-based (BAM core.pos is 0-based)
327333
int exstart=c->pos;
328334
for (int i = 0; i < c->n_cigar; ++i) {
@@ -336,12 +342,14 @@ switch (cop) {
336342
//exon ends here
337343
GSeg exon(exstart+1,c->pos+l);
338344
exons.Add(exon);
345+
mapped_len+=exon.len();
339346
l += cigar[i]>>4;
340347
exstart=c->pos+l;
341348
}
342349
}
343350
GSeg exon(exstart+1,c->pos+l);
344351
exons.Add(exon);
352+
mapped_len+=exon.len();
345353
end=c->pos+l; //genomic start coordinate
346354
delete[] cigar; //UBsan protection
347355
}

gclib/GBam.h

Lines changed: 37 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@ class GBamRecord: public GSeg {
2424
uint8_t abuf[512];
2525
public:
2626
GVec<GSeg> exons; //coordinates will be 1-based
27+
int mapped_len; //sum of exon lengths
2728
//created from a reader:
2829
void bfree_on_delete(bool b_free=true) { novel=b_free; }
29-
GBamRecord(bam1_t* from_b=NULL, bam_header_t* b_header=NULL, bool b_free=true):exons(1) {
30+
GBamRecord(bam1_t* from_b=NULL, bam_header_t* b_header=NULL, bool b_free=true):exons(1),
31+
mapped_len(0) {
3032
bam_header=NULL;
3133
if (from_b==NULL) {
3234
b=bam_init1();
@@ -41,7 +43,8 @@ class GBamRecord: public GSeg {
4143
setupCoordinates();//set 1-based coordinates (start, end and exons array)
4244
}
4345

44-
GBamRecord(GBamRecord& r):GSeg(r.start, r.end), exons(r.exons) { //copy constructor
46+
GBamRecord(GBamRecord& r):GSeg(r.start, r.end), exons(r.exons),
47+
mapped_len(r.mapped_len) { //copy constructor
4548
//makes a new copy of the bam1_t record etc.
4649
clear();
4750
b=bam_dup1(r.b);
@@ -57,6 +60,7 @@ class GBamRecord: public GSeg {
5760
start=r.start;
5861
end=r.end;
5962
exons = r.exons;
63+
mapped_len=r.mapped_len;
6064
return *this;
6165
}
6266

@@ -68,6 +72,7 @@ class GBamRecord: public GSeg {
6872
}
6973
b=NULL;
7074
exons.Clear();
75+
mapped_len=0;
7176
bam_header=NULL;
7277
}
7378

@@ -194,46 +199,47 @@ class GBamReader {
194199
};
195200

196201
public:
197-
void bopen(const char* filename) {
202+
void bopen(const char* filename, bool forceBAM=false) {
198203
if (strcmp(filename, "-")==0) {
199-
//if stdin was given, we HAVE to assume it's text SAM format, sorry
200-
bam_file=samopen(filename, "r", 0);
204+
//if stdin was given, we assume it's text SAM, unless forceBAM was given
205+
if (forceBAM) bam_file=samopen(filename, "rb", 0);
206+
else bam_file=samopen(filename, "r", 0);
201207
}
202208
else {
203-
//BAM files have the zlib signature bytes at the beginning: 1F 8B 08
204-
//if that's not present, we assume sam file
205-
FILE* f=Gfopen(filename);
206-
if (f==NULL) {
207-
//DEBUG only: make the program suspend itself indefinitely
208-
/*int selfpid=getpid();
209-
GMessage("DEBUG STOP ( pid: %d ) entering wait loop, kill to terminate..\n",
210-
selfpid);
211-
while (true) {
212-
sleep(5);
213-
}
214-
*/
215-
exit(1); //GError("Error opening file %s!\n", filename);
216-
}
217-
byte fsig[3];
218-
size_t rd=fread(fsig, 1, 3, f);
219-
fclose(f);
220-
if (rd<3) GError("Error reading from file %s!\n",filename);
221-
if ((fsig[0]==0x1F && fsig[1]==0x8B && fsig[2]==0x08) ||
222-
(fsig[0]=='B' && fsig[1]=='A' && fsig[2]=='M')) {
223-
bam_file=samopen(filename, "rb", 0); //BAM or uncompressed BAM
209+
FILE* f=Gfopen(filename);
210+
if (f==NULL) {
211+
GError("Error opening SAM/BAM file %s!\n", filename);
212+
}
213+
if (forceBAM) {
214+
//directed to open this as a BAM file
215+
if (forceBAM) bam_file=samopen(filename, "rb", 0);
216+
}
217+
else {
218+
//try to guess if it's BAM or SAM
219+
//BAM files have the zlib signature bytes at the beginning: 1F 8B 08
220+
//if that's not present then we assume text SAM
221+
byte fsig[3];
222+
size_t rd=fread(fsig, 1, 3, f);
223+
fclose(f);
224+
if (rd<3) GError("Error reading from file %s!\n",filename);
225+
if ((fsig[0]==0x1F && fsig[1]==0x8B && fsig[2]==0x08) ||
226+
(fsig[0]=='B' && fsig[1]=='A' && fsig[2]=='M')) {
227+
bam_file=samopen(filename, "rb", 0); //BAM or uncompressed BAM
224228
}
225-
else { //assume text SAM file
226-
bam_file=samopen(filename, "r", 0);
229+
else { //assume text SAM file
230+
bam_file=samopen(filename, "r", 0);
227231
}
228232
}
233+
}
229234
if (bam_file==NULL)
230235
GError("Error: could not open SAM file %s!\n",filename);
231236
fname=Gstrdup(filename);
232-
}
233-
GBamReader(const char* fn) {
237+
}
238+
239+
GBamReader(const char* fn, bool forceBAM=false) {
234240
bam_file=NULL;
235241
fname=NULL;
236-
bopen(fn);
242+
bopen(fn, forceBAM);
237243
}
238244

239245
bam_header_t* header() {

rlink.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ const float mismatchfrac=0.02;
3333
const int max_trf_number=40000; // maximum number of transfrag accepted so that the memory doesn't blow up
3434

3535
extern bool mergeMode;
36+
extern bool forceBAM; //for stdin alignment data
3637

3738
extern bool verbose;
3839
extern bool debugMode;

samtools-0.1.18/bam.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#include "kstring.h"
88
#include "sam_header.h"
99

10-
int bam_is_be = 0, bam_verbose = 2;
10+
int bam_is_be = 0, bam_verbose = 1;
1111
char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
1212

1313
/**************************

stringtie.cpp

Lines changed: 19 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,8 @@ bool ballgown=false;
164164
//no more reads will be considered for a bundle if the local coverage exceeds this value
165165
//(each exon is checked for this)
166166

167+
bool forceBAM = false; //useful for stdin (piping alignments into StringTie)
168+
167169
bool mergeMode = false; //--merge option
168170
bool keepTempFiles = false; //--keeptmp
169171

@@ -232,8 +234,6 @@ void writeUnbundledGuides(GVec<GRefData>& refdata, FILE* fout, FILE* gout=NULL);
232234

233235
#ifndef NOTHREADS
234236

235-
bool waitForThreads(); //wait for at least 1 worker thread to enter "ready" state
236-
237237
bool noThreadsWaiting();
238238

239239
void workerThread(GThreadData& td); // Thread function
@@ -253,7 +253,7 @@ int main(int argc, char * const argv[]) {
253253
// == Process arguments.
254254
GArgs args(argc, argv,
255255
//"debug;help;fast;xhvntj:D:G:C:l:m:o:a:j:c:f:p:g:");
256-
"debug;help;version;keeptmp;merge;exclude=zZSEihvteux:n:j:s:D:G:C:l:m:o:a:j:c:f:p:g:P:M:Bb:A:F:T:");
256+
"debug;help;version;keeptmp;bam;merge;exclude=zZSEihvteux:n:j:s:D:G:C:l:m:o:a:j:c:f:p:g:P:M:Bb:A:F:T:");
257257
args.printError(USAGE, true);
258258

259259
processOptions(args);
@@ -361,12 +361,9 @@ const char* ERR_BAM_SORT="\nError: the input alignment file is not sorted!\n";
361361
gseqNames=GffObj::names; //might have been populated already by gff data
362362
gffnames_ref(gseqNames); //initialize the names collection if not guided
363363

364-
//GBamReader bamreader(bamfname.chars());
365-
366364

367365
GHash<int> hashread; //read_name:pos:hit_index => readlist index
368-
//GHash<int> bgeneids(false); //guide gene IDs/names in the bundle
369-
//set of annotation transcript for the current locus
366+
370367
GList<GffObj>* guides=NULL; //list of transcripts on a specific chromosome
371368

372369
int currentstart=0, currentend=0;
@@ -434,6 +431,11 @@ if (tstackSize<DEF_TSTACK_SIZE) defStackSize=DEF_TSTACK_SIZE;
434431
//delete brec;
435432
if ((brec=bamreader.next())!=NULL) {
436433
if (brec->isUnmapped()) continue;
434+
if (brec->start<1 || brec->mapped_len<10) {
435+
if (verbose) GMessage("Warning: invalid mapping found for read %s (position=%d, mapped length=%d)\n",
436+
brec->name(), brec->start, brec->mapped_len);
437+
continue;
438+
}
437439

438440
refseqName=brec->refName();
439441
xstrand=brec->spliceStrand();
@@ -563,8 +565,10 @@ if (tstackSize<DEF_TSTACK_SIZE) defStackSize=DEF_TSTACK_SIZE;
563565
#ifndef NOTHREADS
564566
GLockGuard<GFastMutex> lock(logMutex);
565567
#endif
566-
printTime(stderr);
567-
if(Num_Fragments) GMessage(" %g aligned fragments found.\n", Num_Fragments);
568+
if (Num_Fragments) {
569+
printTime(stderr);
570+
GMessage(" %g aligned fragments found.\n", Num_Fragments);
571+
}
568572
//GMessage(" Done reading alignments.\n");
569573
}
570574
noMoreBundles();
@@ -702,13 +706,13 @@ if (tstackSize<DEF_TSTACK_SIZE) defStackSize=DEF_TSTACK_SIZE;
702706
fclose(f_out);
703707
if (c_out && c_out!=stdout) fclose(c_out);
704708

705-
if(verbose)
706-
GMessage("Total number of records without the XS tag: %d\n",no_xs);
709+
if(verbose && no_xs>0)
710+
GMessage("Number spliced alignments missing the XS tag (skipped): %d\n",no_xs);
707711

708712
if(!mergeMode) {
709713
if(verbose) {
710-
GMessage("Total count of aligned fragments: %g\n",Num_Fragments);
711-
GMessage("Fragment coverage length:%g\n",Frag_Len/Num_Fragments);
714+
GMessage("Total count of aligned fragments: %g\n", Num_Fragments);
715+
GMessage("Fragment coverage length: %g\n", Frag_Len/Num_Fragments);
712716
}
713717

714718
f_out=stdout;
@@ -835,6 +839,7 @@ void processOptions(GArgs& args) {
835839
exit(0);
836840
}
837841
debugMode=(args.getOpt("debug")!=NULL || args.getOpt('D')!=NULL);
842+
forceBAM=(args.getOpt("bam")!=NULL); //assume the stdin stream is BAM instead of text SAM
838843
mergeMode=(args.getOpt("merge")!=NULL);
839844
keepTempFiles=(args.getOpt("keeptmp")!=NULL);
840845
fast=!(args.getOpt('Z')!=NULL);
@@ -1165,9 +1170,6 @@ void processBundle(BundleData* bundle) {
11651170
GLockGuard<GFastMutex> lock(logMutex);
11661171
#endif
11671172
printTime(stderr);
1168-
/*GMessage(">bundle %s:%d-%d(%d) (%djs, %d guides) loaded, begins processing...\n",
1169-
bundle->refseq.chars(), bundle->start, bundle->end, bundle->numreads,
1170-
bundle->junction.Count(), bundle->keepguides.Count());*/
11711173
GMessage(">bundle %s:%d-%d(%lu) (%d guides) loaded, begins processing...\n",
11721174
bundle->refseq.chars(), bundle->start, bundle->end, bundle->numreads,
11731175
bundle->keepguides.Count());
@@ -1208,7 +1210,6 @@ void processBundle(BundleData* bundle) {
12081210

12091211
}
12101212
#endif
1211-
//int ngenes=infer_transcripts(bundle, fast | bundle->covSaturated);
12121213
int ngenes=infer_transcripts(bundle);
12131214

12141215
if (ballgown && bundle->rc_data) {
@@ -1235,7 +1236,6 @@ void processBundle(BundleData* bundle) {
12351236
#ifndef NOTHREADS
12361237
GLockGuard<GFastMutex> lock(logMutex);
12371238
#endif
1238-
printTime(stderr);
12391239
/*
12401240
SumReads+=bundle->sumreads;
12411241
SumFrag+=bundle->sumfrag;
@@ -1247,6 +1247,7 @@ void processBundle(BundleData* bundle) {
12471247
fprintf(stderr,"Number of fragments in bundle: %g with length %g\n",bundle->num_fragments,bundle->frag_len);
12481248
fprintf(stderr,"Number of fragments in bundle: %g with sum %g\n",bundle->num_fragments,bundle->frag_len);
12491249
*/
1250+
printTime(stderr);
12501251
GMessage("^bundle %s:%d-%d(%d) done (%d processed potential transcripts).\n",bundle->refseq.chars(),
12511252
bundle->start, bundle->end, bundle->readlist.Count(), bundle->pred.Count());
12521253
#ifdef GMEMTRACE
@@ -1272,22 +1273,6 @@ bool noThreadsWaiting() {
12721273
return (v<1);
12731274
}
12741275

1275-
bool waitForThreads() {
1276-
bool noneWaiting=true;
1277-
DBGPRINT("##> waiting for a thread to become available..\n");
1278-
while (noneWaiting) {
1279-
waitMutex.lock();
1280-
noneWaiting=(threadsWaiting<1);
1281-
waitMutex.unlock();
1282-
if (noneWaiting) {
1283-
DBGPRINT("##>all threads busy, sleep_for 2ms\n");
1284-
this_thread::sleep_for(chrono::milliseconds(2));
1285-
}
1286-
}
1287-
DBGPRINT("##> there are workers ready now.\n");
1288-
return(!noneWaiting);
1289-
}
1290-
12911276
void workerThread(GThreadData& td) {
12921277
GPVec<BundleData>* bundleQueue = (GPVec<BundleData>*)td.udata;
12931278
//wait for a ready bundle in the queue, until there is no hope for incoming bundles

tmerge.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22

33

44
void TInputFiles::Add(const char* fn) {
5-
if (fileExists(fn)<2) {
5+
GStr sfn(fn);
6+
if (sfn!="-" && fileExists(fn)<2) {
67
GError("Error: input file %s cannot be found!\n",
78
fn);
89
}
9-
GStr sfn(fn);
10+
1011
files.Add(sfn);
1112
}
1213

@@ -133,7 +134,8 @@ int TInputFiles::start() {
133134
}
134135
//stringtie multi-BAM input
135136
for (int i=0;i<bamfiles.Count();++i) {
136-
GBamReader* bamreader=new GBamReader(bamfiles[i].chars());
137+
GBamReader* bamreader=new GBamReader(bamfiles[i].chars(),
138+
(bamfiles[i]=="-" && forceBAM));
137139
readers.Add(bamreader);
138140
GBamRecord* brec=bamreader->next();
139141
if (brec)

0 commit comments

Comments
 (0)