-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMergeSamFiles.java
More file actions
228 lines (202 loc) · 11 KB
/
MergeSamFiles.java
File metadata and controls
228 lines (202 loc) · 11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam;
import htsjdk.samtools.MergingSamRecordIterator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SamFileHeaderMerger;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.SamRecordIntervalIteratorFactory;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.SamOrBam;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Reads a SAM or BAM file and combines the output to one file
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
usage = MergeSamFiles.USAGE_SUMMARY + MergeSamFiles.USAGE_DETAILS,
usageShort = MergeSamFiles.USAGE_SUMMARY,
programGroup = SamOrBam.class
)
public class MergeSamFiles extends CommandLineProgram {
private static final Log log = Log.getInstance(MergeSamFiles.class);
static final String USAGE_SUMMARY = "Merges multiple SAM and/or BAM files into a single file. ";
static final String USAGE_DETAILS = "This tool is used for combining SAM and/or BAM files from different runs or read groups, similarly " +
"to the \"merge\" function of Samtools (http://www.htslib.org/doc/samtools.html). " +
"<br /><br />Note that to prevent errors in downstream processing, it is critical to identify/label read groups appropriately. " +
"If different samples contain identical read group IDs, this tool will avoid collisions by modifying the read group IDs to be " +
"unique. For more information about read groups, see the " +
"<a href='https://www.broadinstitute.org/gatk/guide/article?id=6472'>GATK Dictionary entry.</a> <br /><br />" +
"<br />" +
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar MergeSamFiles \\<br />" +
" I=input_1.bam \\<br />" +
" I=input_2.bam \\<br />" +
" O=merged_files.bam" +
"</pre>" +
"<hr />"
;
@Option(shortName = "I", doc = "SAM or BAM input file", minElements = 1)
public List<File> INPUT = new ArrayList<File>();
@Option(shortName = "O", doc = "SAM or BAM file to write merged result to")
public File OUTPUT;
@Option(shortName = StandardOptionDefinitions.SORT_ORDER_SHORT_NAME, doc = "Sort order of output file", optional = true)
public SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
@Option(doc = "If true, assume that the input files are in the same sort order as the requested output sort order, even if their headers say otherwise.",
shortName = StandardOptionDefinitions.ASSUME_SORTED_SHORT_NAME)
public boolean ASSUME_SORTED = false;
@Option(shortName = "MSD", doc = "Merge the sequence dictionaries", optional = true)
public boolean MERGE_SEQUENCE_DICTIONARIES = false;
@Option(doc = "Option to create a background thread to encode, " +
"compress and write to disk the output file. The threaded version uses about 20% more CPU and decreases " +
"runtime by ~20% when writing out a compressed BAM file.")
public boolean USE_THREADING = false;
@Option(doc = "Comment(s) to include in the merged output file's header.", optional = true, shortName = "CO")
public List<String> COMMENT = new ArrayList<String>();
@Option(shortName = "RGN", doc = "An interval list file that contains the locations of the positions to merge. "+
"Assume bam are sorted and indexed. "+
"The resulting file will contain alignments that may overlap with genomic regions outside the requested region. "+
"Unmapped reads are discarded.",
optional = true)
public File INTERVALS = null;
private static final int PROGRESS_INTERVAL = 1000000;
/** Required main method implementation. */
public static void main(final String[] argv) {
System.exit(new MergeSamFiles().instanceMain(argv));
}
/** Combines multiple SAM/BAM files into one. */
@Override
protected int doWork() {
boolean matchedSortOrders = true;
// read interval list if it is defined
final List<Interval> intervalList = (INTERVALS == null ? null : IntervalList.fromFile(INTERVALS).uniqued().getIntervals() );
// map reader->iterator used if INTERVALS is defined
final Map<SamReader, CloseableIterator<SAMRecord> > samReaderToIterator = new HashMap<SamReader, CloseableIterator<SAMRecord> >(INPUT.size());
// Open the files for reading and writing
final List<SamReader> readers = new ArrayList<SamReader>();
final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();
{
SAMSequenceDictionary dict = null; // Used to try and reduce redundant SDs in memory
for (final File inFile : INPUT) {
IOUtil.assertFileIsReadable(inFile);
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(inFile);
if ( INTERVALS != null ) {
if( ! in.hasIndex() ) throw new PicardException("Merging with interval but Bam file is not indexed "+ inFile);
final CloseableIterator<SAMRecord> samIterator = new SamRecordIntervalIteratorFactory().makeSamRecordIntervalIterator(in, intervalList, true);
samReaderToIterator.put(in, samIterator);
}
readers.add(in);
headers.add(in.getFileHeader());
// A slightly hackish attempt to keep memory consumption down when merging multiple files with
// large sequence dictionaries (10,000s of sequences). If the dictionaries are identical, then
// replace the duplicate copies with a single dictionary to reduce the memory footprint.
if (dict == null) {
dict = in.getFileHeader().getSequenceDictionary();
} else if (dict.equals(in.getFileHeader().getSequenceDictionary())) {
in.getFileHeader().setSequenceDictionary(dict);
}
matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER;
}
}
// If all the input sort orders match the output sort order then just merge them and
// write on the fly, otherwise setup to merge and sort before writing out the final file
IOUtil.assertFileIsWritable(OUTPUT);
final boolean presorted;
final SAMFileHeader.SortOrder headerMergerSortOrder;
final boolean mergingSamRecordIteratorAssumeSorted;
if (matchedSortOrders || SORT_ORDER == SAMFileHeader.SortOrder.unsorted || ASSUME_SORTED || INTERVALS != null ) {
log.info("Input files are in same order as output so sorting to temp directory is not needed.");
headerMergerSortOrder = SORT_ORDER;
mergingSamRecordIteratorAssumeSorted = ASSUME_SORTED;
presorted = true;
} else {
log.info("Sorting input files using temp directory " + TMP_DIR);
headerMergerSortOrder = SAMFileHeader.SortOrder.unsorted;
mergingSamRecordIteratorAssumeSorted = false;
presorted = false;
}
final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(headerMergerSortOrder, headers, MERGE_SEQUENCE_DICTIONARIES);
final MergingSamRecordIterator iterator;
// no interval defined, get an iterator for the whole bam
if( intervalList == null) {
iterator = new MergingSamRecordIterator(headerMerger, readers, mergingSamRecordIteratorAssumeSorted);
}
else {
// show warning related to https://github.com/broadinstitute/picard/pull/314/files
log.info("Warning: merged bams from different interval lists may contain the same read in both files");
iterator = new MergingSamRecordIterator(headerMerger, samReaderToIterator, true);
}
final SAMFileHeader header = headerMerger.getMergedHeader();
for (final String comment : COMMENT) {
header.addComment(comment);
}
header.setSortOrder(SORT_ORDER);
final SAMFileWriterFactory samFileWriterFactory = new SAMFileWriterFactory();
if (USE_THREADING) {
samFileWriterFactory.setUseAsyncIo(true);
}
final SAMFileWriter out = samFileWriterFactory.makeSAMOrBAMWriter(header, presorted, OUTPUT);
// Lastly loop through and write out the records
final ProgressLogger progress = new ProgressLogger(log, PROGRESS_INTERVAL);
while (iterator.hasNext()) {
final SAMRecord record = iterator.next();
out.addAlignment(record);
progress.record(record);
}
log.info("Finished reading inputs.");
for(final CloseableIterator<SAMRecord> iter : samReaderToIterator.values()) CloserUtil.close(iter);
CloserUtil.close(readers);
out.close();
return 0;
}
@Override
protected String[] customCommandLineValidation() {
if (CREATE_INDEX && SORT_ORDER != SAMFileHeader.SortOrder.coordinate) {
return new String[]{"Can't CREATE_INDEX unless SORT_ORDER is coordinate"};
}
return null;
}
}