Skip to content

Commit 2fb444b

Browse files
committed
changes for v1.0.3
1 parent b186c3c commit 2fb444b

80 files changed

Lines changed: 766 additions & 508 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Code/CMakeLists.txt

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
cmake_minimum_required(VERSION 3.9)
2-
project(STARE
3-
LANGUAGES C CXX
4-
)
2+
project(STARE)
53

64
if(NOT CMAKE_BUILD_TYPE)
75
set(CMAKE_BUILD_TYPE Release)

Code/PSCM_to_PSEM.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@
1111
// Modified by Dennis Hecker
1212
// Institute of Cardiovascular Regeneration
1313

14+
/*
15+
* Compilation:
16+
* g++ PSCM_to_PSEM.cpp -std=c++11 -o PSCM_to_PSEM
17+
*/
18+
1419
#include<stdio.h>
1520
#include<cmath>
1621
#include<stdlib.h>

Code/ReplaceInvalidChars.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
* a txt-file with the length of the longest sequence in the file and the average GC content.
1414
* Required for TRAP to read the sequence file as stream.
1515
*
16+
* Compilation:
1617
* g++ ReplaceInvalidChars.cpp -std=c++11 -O3 -o ReplaceInvalidChars
17-
* ./ReplaceInvalidChars -i file -o output -d path_for_SeqMeta_file
1818
*
1919
* Part of STARE: https://github.com/SchulzLab/STARE
2020
*/
@@ -25,7 +25,9 @@ int main(int argc, char **argv) {
2525
// ____________________________________________________________
2626
// FETCH AND CHECK INPUT ARGS
2727
// ____________________________________________________________
28-
string parameter_help = "-i input sequence file\n-o output file\n-d file to write the sequence metadata to";
28+
string parameter_help = "-i input sequence file\n"
29+
"-o output file\n"
30+
"-d file to write the sequence metadata to";
2931

3032
vector<string> h_flags = {"h", "-h", "--h", "help", "-help", "--help"};
3133
for (int i = 1; i < argc; ++i) {

Code/STARE.sh

Lines changed: 55 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -3,28 +3,30 @@ set -e # To abort the whole script if one function returns an error.
33

44
# See https://github.com/SchulzLab/STARE for more information and usage.
55
# Adapted from TEPIC: https://github.com/SchulzLab/TEPIC
6-
help="STARE version 1.0.2
6+
version_num="1.0.3"
7+
help="STARE version ""$version_num""
78
Usage: ./STARE.sh
89
[-b/--bed_file bed file containing open chromatin regions]
10+
[-a/--annotation gene annotation file in gtf-format]
911
[-g/--genome input fasta file in RefSeq format]
1012
[-s/--pscm file with PSCMs in transfac format] OR [-p/--psem file with PSEMs of TFs]
11-
[-a/--annotation gene annotation file in gtf-format, required to generate the gene view]
1213
[-o/--output prefix_path of output files]\n
1314
Optional parameters:
14-
[-u/--genes file with rows of gene IDs/symbols to limit the output (else all in gtf)]
15+
[-w/--window window size around TSS for mapping regions to genes (default 50KB; 5MB for ABC-mode)]
1516
[-n/--column column in the -b file containing the average per base signal within a peak, start counting at 1]
16-
[-y/--gc_content Mean GC-content to calculate PSEMs, by default this is automatically derived from your bed_file]
1717
[-c/--cores number of cores to use (default 1)]
1818
[-x/--exclude_bed bed-file with regions to exclude (e.g. blacklisted regions)]
19-
[-w/--window window size around TSS for mapping regions to genes (default 50KB; 5MB for ABC-mode)]
20-
[-e/--decay indicating whether exponential distance decay should be used (default TRUE, but not used in ABC-mode)]
21-
[-f/--contact_folder folder with normalized Hi-C contact files for each chromosome in coordinate format. Expects gzipped files.]
19+
[-u/--genes file with rows of gene IDs/symbols to limit the output (else all in gtf)]
20+
[-i/--tss_mode 'all_tss' to average across all annotated TSS for ABC-scoring or '5_tss' to use only the 5' TSS (default all_tss)]
21+
[-q/--adapted_abc whether to use the adapted ABC-score (default True)]
22+
[-f/--contact_folder folder with normalized Hi-C contact files for each chromosome in coordinate format, expects gzipped files. Set to False to use a contact estimate based on distance.]
2223
[-k/--bin_size bin-size of the Hi-C files]
2324
[-t/--cutoff cut-off for the ABC-score (default 0.02), set to 0 to get all scored interactions]
24-
[-q/--adapted_abc whether to use the adapted ABC-score (default True)]
25-
[-m/--enhancer_window window size around enhancers for the -q adjustment (default 5MB, minimally set to -w)]
2625
[-d/--pseudocount whether to use pseudocount for the contact frequency in the ABC-model (default True)]
26+
[-m/--enhancer_window window size around enhancers for the -q adjustment (default 5MB, minimally set to -w)]
2727
[-r/--existing_abc ABC-scoring file, if already calculated once for this input to avoid redundant calculation]
28+
[-y/--gc_content Mean GC-content to calculate PSEMs, by default this is automatically derived from your bed_file]
29+
[-e/--decay indicating whether exponential distance decay should be used (default TRUE, but not used in ABC-mode)]
2830
[-z/--reshape write a binary output (default False), optional input for GAZE]"
2931

3032
# ------------------------------------------------------------------------------------------------------
@@ -42,6 +44,7 @@ pscms=""
4244
pscm_cg=""
4345
genes="0"
4446
column="0"
47+
tss_mode="all_tss"
4548
window=""
4649
decay="TRUE"
4750
hic_contactfolder=""
@@ -59,7 +62,7 @@ die() { echo "$*" >&2; exit 2; } # complain to STDERR and exit with error
5962
needs_arg() { if [ -z "$OPTARG" ]; then die "No arg for --$OPT option"; fi; } # Required to enable long options.
6063

6164
# Parsing command line.
62-
while getopts hvg:b:o:c:p:s:y:u:n:a:w:e:f:k:t:r:x:z:d:q:m:-: OPT; do
65+
while getopts hvg:b:o:c:p:s:y:u:n:i:a:w:e:f:k:t:r:x:z:d:q:m:-: OPT; do
6366
if [ "$OPT" = "-" ]; then # long option: reformulate OPT and OPTARG
6467
OPT="${OPTARG%%=*}" # extract long option name
6568
OPTARG="${OPTARG#$OPT}" # extract long option argument (may be empty)
@@ -77,6 +80,7 @@ while getopts hvg:b:o:c:p:s:y:u:n:a:w:e:f:k:t:r:x:z:d:q:m:-: OPT; do
7780
y | gc_content) needs_arg; pscm_cg=$OPTARG;;
7881
u | genes) needs_arg; genes=$OPTARG;;
7982
n | column) needs_arg; column=$OPTARG;;
83+
i | tss_mode) needs_arg; tss_mode=$OPTARG;;
8084
a | annotation) needs_arg; annotation=$OPTARG;;
8185
w | window) needs_arg; window=$OPTARG;;
8286
e | decay) needs_arg; decay=$OPTARG;;
@@ -96,7 +100,7 @@ done
96100

97101
if [ "$print_version" -eq 1 ];
98102
then
99-
echo "STARE version 1.0.2"
103+
echo "STARE version ""$version_num"
100104
exit 1;
101105
fi
102106

@@ -137,11 +141,20 @@ then
137141
fi
138142

139143
if [ -n "$hic_contactfolder" ] || [ -n "$hic_binsize" ] && [[ "$existing_abc" == "0" ]]; # With an existing ABC-file, the other flags are ignored.
140-
then
141-
if [ -z "$hic_contactfolder" ] || [ -z "$hic_binsize" ] || [ -z "$column" ];
144+
then # If the hic-contactfolder is set to false we can run the ABC-scoring with contact estimate.
145+
if [ "${hic_contactfolder}" != "FALSE" ] && [ "${hic_contactfolder}" != "False" ] && [ "${hic_contactfolder}" != "false" ] && [ "${hic_contactfolder}" != "F" ] && [ "${hic_contactfolder}" != "0" ]; # True only when -f is NOT one of the accepted "false" spellings. The comparisons must be joined with &&: joined with || the test is always true, which made the contact-estimate branch below unreachable.
142146
then
143-
echo "For the ABC-score calculation the column with the peak signal (-n/--column), the path to the normalized contact files (-f/--contact_folder) as well as the the bin size (-k/--bin_size) are required."
144-
exit 1;
147+
if [ -z "$hic_contactfolder" ] || [ -z "$hic_binsize" ] || [ -z "$column" ];
148+
then
149+
echo "For the ABC-score calculation the column with the peak signal (-n/--column), the path to the normalized contact files (-f/--contact_folder) as well as the the bin size (-k/--bin_size) are required."
150+
exit 1;
151+
fi
152+
else
153+
if [ -z "$column" ];
154+
then
155+
echo "For the ABC-score calculation the column with the peak signal (-n/--column) is required."
156+
exit 1;
157+
fi
145158
fi
146159
fi
147160

@@ -174,6 +187,7 @@ then
174187
adjustedABC="FALSE";
175188
fi
176189

190+
177191
# ------------------------------------------------------------------------------------------------------
178192
# WRITE METADATA FILE
179193
# ------------------------------------------------------------------------------------------------------
@@ -195,7 +209,7 @@ metadatafile=${prefix_path}_metadata.amd.tsv
195209
# Create metadata file.
196210
touch "$metadatafile"
197211
echo "[Description]" >> "$metadatafile"
198-
echo "process STARE 1.0.2" >> "$metadatafile"
212+
echo "process STARE ""$version_num" >> "$metadatafile"
199213
echo -e "run_by_user\t""$USER" >> "$metadatafile"
200214
echo -e "date\t""$d" >> "$metadatafile"
201215
echo -e "time\t""$t" >> "$metadatafile"
@@ -205,64 +219,65 @@ echo "[Command]" >> "$metadatafile"
205219
echo "STARE.sh ""$*" >> "$metadatafile"
206220
echo "" >> "$metadatafile"
207221
echo "[Inputs]" >> "$metadatafile"
208-
echo -e "region_file\t""$regions" >> "$metadatafile"
222+
echo -e "-b region_file\t""$regions" >> "$metadatafile"
209223
if [ -n "$column" ] ;
210224
then
211-
echo -e "signal_column\t""$column" >> "$metadatafile"
225+
echo -e "-n signal_column\t""$column" >> "$metadatafile"
212226
fi
213227
if [[ "$genes" != "0" ]];
214228
then
215-
echo -e "gene set\t""$genes" >> "$metadatafile"
229+
echo -e "-u gene set\t""$genes" >> "$metadatafile"
216230
fi
217231
if [ -n "$exclude_regions" ] ;
218232
then
219-
echo -e "excluded regions\t""$exclude_regions" >> "$metadatafile"
233+
echo -e "-x excluded regions\t""$exclude_regions" >> "$metadatafile"
220234
fi
221235
echo "" >> "$metadatafile"
222236
echo "[References]" >> "$metadatafile"
223-
echo -e "genome_reference\t""$genome" >> "$metadatafile"
237+
echo -e "-g genome_reference\t""$genome" >> "$metadatafile"
224238
if [ -n "$pscms" ];
225239
then
226-
echo -e "pscms\t""$pscms" >> "$metadatafile"
240+
echo -e "-s pscms\t""$pscms" >> "$metadatafile"
227241
if [ -n "$pscm_cg" ];
228242
then
229-
echo -e "CG-content\t""$pscm_cg" >> "$metadatafile"
243+
echo -e "-y GC-content\t""$pscm_cg" >> "$metadatafile"
230244
else
231-
echo -e "CG-content\tautomatic from bed_file" >> "$metadatafile"
245+
echo -e "GC-content\tautomatic from bed_file" >> "$metadatafile"
232246
fi
233247
else
234-
echo -e "psems\t""$psems" >> "$metadatafile"
248+
echo -e "-p psems\t""$psems" >> "$metadatafile"
235249
fi
236-
echo -e "genome_annotation\t""$annotation">> "$metadatafile"
250+
echo -e "-a genome_annotation\t""$annotation">> "$metadatafile"
237251

238252
echo "" >> "$metadatafile"
239253
echo "[Output path]" >> "$metadatafile"
240254
echo "$prefixP" >> "$metadatafile"
241255

242256
echo "" >> "$metadatafile"
243257
echo "[Parameters]" >> "$metadatafile"
244-
echo -e "cores\t""$cores" >> "$metadatafile"
245-
echo -e "window\t"$window >> "$metadatafile"
258+
echo -e "-c cores\t""$cores" >> "$metadatafile"
259+
echo -e "-w window\t""$window" >> "$metadatafile" # Record the -w value; quoted like the neighbouring entries so the value is not subject to word splitting or globbing.
260+
echo -e "-i tss_mode\t""$tss_mode" >> "$metadatafile"
246261
if [ -z "$hic_contactfolder" ] && [[ "$existing_abc" == "0" ]];
247262
then
248-
echo -e "decay\t""$decay" >> "$metadatafile"
263+
echo -e "-e decay\t""$decay" >> "$metadatafile"
249264
fi
250265
if [ -n "$hic_contactfolder" ] && [[ "$existing_abc" == "0" ]];
251266
then
252-
echo -e "path with hi-c contact files\t""$hic_contactfolder" >> "$metadatafile"
253-
echo -e "bin size of hi-c contacts\t""$hic_binsize" >> "$metadatafile"
254-
echo -e "ABC-score cut-off\t""$abc_cutoff" >> "$metadatafile"
255-
echo -e "Use pseudocount for contact frequency\t""$pseudocount" >> "$metadatafile"
256-
echo -e "Use adaptedABC version\t""$adjustedABC" >> "$metadatafile"
257-
echo -e "Window size for the adaptedABC\t""$enhancer_window" >> "$metadatafile"
267+
echo -e "-f path with hi-c contact files\t""$hic_contactfolder" >> "$metadatafile"
268+
echo -e "-k bin size of hi-c contacts\t""$hic_binsize" >> "$metadatafile"
269+
echo -e "-t ABC-score cut-off\t""$abc_cutoff" >> "$metadatafile"
270+
echo -e "-d Use pseudocount for contact frequency\t""$pseudocount" >> "$metadatafile"
271+
echo -e "-q Use adaptedABC version\t""$adjustedABC" >> "$metadatafile"
272+
echo -e "-m Window size for the adaptedABC\t""$enhancer_window" >> "$metadatafile"
258273
fi
259274
if [[ "$existing_abc" != "0" ]];
260275
then
261-
echo -e "existing ABC-score file that was used\t""$existing_abc" >> "$metadatafile"
276+
echo -e "-r existing ABC-score file that was used\t""$existing_abc" >> "$metadatafile"
262277
fi
263278
if [[ "$reshaping" == "TRUE" ]];
264279
then
265-
echo -e "Reshaping to binary output\t""$reshaping" >> "$metadatafile"
280+
echo -e "-z Reshaping to binary output\t""$reshaping" >> "$metadatafile"
266281
fi
267282

268283
echo "" >> "$metadatafile"
@@ -345,8 +360,8 @@ fi
345360
# ------------------------------------------------------------------------------------------------------
346361
startt=`date +%s`
347362
# Use TRAP to compute transcription factor affinities to the above extracted sequences.
348-
affinity=${prefix_path}_Affinity.txt
349363
#affinity="/projects/triangulate/work/STARE/Hocker_scHeart/Hocker_Affinities.txt"
364+
affinity=${prefix_path}_Affinity.txt
350365
echo "Starting TRAP"
351366
"${working_dir}"/TRAPmulti "$psems" "${prefix_path}"_FilteredSequences.fa "${prefix_path}"_SeqMeta.txt "$cores" > "${affinity}"
352367
rm "${prefix_path}"_FilteredSequences.fa
@@ -364,7 +379,7 @@ then
364379
echo "ABC-scoring region-gene interactions"
365380
mkdir "${prefixP}""/ABC_output"
366381
abc_prefix_path=${prefixP}"/ABC_output/"${base_prefix}
367-
"${working_dir}"/STARE_ABCpp -b "${filteredRegions}"_sorted.bed -n "${column}" -a "${annotation}" -w "${window}" -f "${hic_contactfolder}" -k "${hic_binsize}" -t "${abc_cutoff}" -o "${abc_prefix_path}" -d "${pseudocount}" -q "${adjustedABC}" -m "${enhancer_window}" -c "${cores}" -u "${genes}"
382+
"${working_dir}"/STARE_ABCpp -b "${filteredRegions}"_sorted.bed -n "${column}" -a "${annotation}" -w "${window}" -f "${hic_contactfolder}" -k "${hic_binsize}" -t "${abc_cutoff}" -o "${abc_prefix_path}" -d "${pseudocount}" -q "${adjustedABC}" -m "${enhancer_window}" -c "${cores}" -u "${genes}" -i "${tss_mode}"
368383
existing_abc=${abc_prefix_path}"_ABCpp_scoredInteractions.txt.gz"
369384
fi
370385

@@ -386,3 +401,4 @@ rm "${affinity}"
386401
rm "${filteredRegions}"_sorted.bed
387402

388403
echo "Congratulations it worked!"
404+

0 commit comments

Comments
 (0)