-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDownsampleSam.java
More file actions
139 lines (123 loc) · 6.77 KB
/
DownsampleSam.java
File metadata and controls
139 lines (123 loc) · 6.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/*
* The MIT License
*
* Copyright (c) 2015 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam;
import htsjdk.samtools.DownsamplingIteratorFactory;
import htsjdk.samtools.DownsamplingIteratorFactory.Strategy;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.DownsamplingIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.SamOrBam;
import java.io.File;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Random;
/**
* Class to randomly downsample a BAM file while respecting that we should either retain or discard
* all of the reads for a template - i.e. all reads with the same name, whether first or second of
* pair, secondary or supplementary, all travel together.
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
usage = DownsampleSam.USAGE_SUMMARY + DownsampleSam.USAGE_DETAILS,
usageShort = DownsampleSam.USAGE_SUMMARY,
programGroup = SamOrBam.class
)
public class DownsampleSam extends CommandLineProgram {
static final String USAGE_SUMMARY = "Downsample a SAM or BAM file. ";
static final String USAGE_DETAILS = "This tool applies a random downsampling algorithm to a SAM or BAM file to retain " +
"only a random subset of the reads. Reads in a mate-pair are either both kept or both discarded. Reads marked as not primary " +
"alignments are all discarded. Each read is given a probability P of being retained so that runs performed with the exact " +
"same input in the same order and with the same value for RANDOM_SEED will produce the same results." +
"All reads for a template are kept or discarded as a unit, with the goal of retaining reads" +
"from PROBABILITY * input templates. While this will usually result in approximately " +
"PROBABILITY * input reads being retained also, for very small PROBABILITIES this may not " +
"be the case.\n" +
"A number of different downsampling strategies are supported using the STRATEGY option:\n\n" +
"ConstantMemory: " + DownsamplingIteratorFactory.CONSTANT_MEMORY_DESCRPTION + "\n\n" +
"HighAccuracy: " + DownsamplingIteratorFactory.HIGH_ACCURACY_DESCRIPTION + "\n\n" +
"Chained: " + DownsamplingIteratorFactory.CHAINED_DESCRIPTION + "\n\n" +
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar DownsampleSam \\<br />" +
" I=input.bam \\<br />" +
" O=downsampled.bam" +
"</pre>" +
"<hr />";
@Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM or BAM file to downsample.")
public File INPUT;
@Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output, downsampled, SAM or BAM file to write.")
public File OUTPUT;
@Option(shortName="S", doc="The downsampling strategy to use. See usage for discussion.")
public Strategy STRATEGY = Strategy.ConstantMemory;
@Option(shortName = "R", doc = "Random seed to use if deterministic behavior is desired. " +
"Setting to null will cause multiple invocations to produce different results.")
public Integer RANDOM_SEED = 1;
@Option(shortName = "P", doc = "The probability of keeping any individual read, between 0 and 1.")
public double PROBABILITY = 1;
@Option(shortName = "A", doc = "The accuracy that the downsampler should try to achieve if the selected strategy supports it. " +
"Note that accuracy is never guaranteed, but some strategies will attempt to provide accuracy within the requested bounds." +
"Higher accuracy will generally require more memory.")
public double ACCURACY = 0.0001;
private final Log log = Log.getInstance(DownsampleSam.class);
public static void main(final String[] args) {
new DownsampleSam().instanceMainWithExit(args);
}
@Override
protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
// Warn the user if they are running with P=1; 0 <= P <= 1 is checked by the DownsamplingIteratorFactory
if (PROBABILITY == 1) {
log.warn("Running DownsampleSam with PROBABILITY=1! This will likely just recreate the input file.");
}
final Random r = RANDOM_SEED == null ? new Random() : new Random(RANDOM_SEED);
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT);
final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Wrote");
final DownsamplingIterator iterator = DownsamplingIteratorFactory.make(INPUT, STRATEGY, PROBABILITY, ACCURACY, RANDOM_SEED);
while (iterator.hasNext()) {
final SAMRecord rec = iterator.next();
out.addAlignment(rec);
progress.record(rec);
}
out.close();
CloserUtil.close(in);
final NumberFormat fmt = new DecimalFormat("0.00%");
log.info("Finished downsampling.");
log.info("Kept ", iterator.getAcceptedCount(), " out of ", iterator.getSeenCount(), " reads (", fmt.format(iterator.getAcceptedFraction()), ").");
return 0;
}
}