diff --git a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/features/PublicationReference.java b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/features/PublicationReference.java new file mode 100644 index 0000000000..ff8c582b58 --- /dev/null +++ b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/features/PublicationReference.java @@ -0,0 +1,91 @@ +package org.biojava.nbio.core.sequence.features; + +import java.util.ArrayList; +import java.util.List; + +public class PublicationReference { + + public enum ReferenceType { + UNKNOWN, PUBMED, PATENT, DIRECT_SUBMISSION; + } + + private String id, title, journal; + private ReferenceType referenceType; + private List authors = new ArrayList<>(); + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getJournal() { + return journal; + } + + public void setJournal(String journal) { + this.journal = journal; + } + + public ReferenceType getReferenceType() { + return referenceType; + } + + public void setReferenceType(ReferenceType referenceType) { + this.referenceType = referenceType; + } + + public List getAuthors() { + return authors; + } + + public void setAuthors(List authors) { + this.authors = authors; + } + + @Override + public String toString() { + return "PublicationReference{" + + "id='" + id + '\'' + + ", title='" + title + '\'' + + ", journal='" + journal + '\'' + + ", referenceType=" + referenceType + + ", authors=" + authors + + '}'; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + PublicationReference that = (PublicationReference) o; + + if (id != null ? !id.equals(that.id) : that.id != null) return false; + if (title != null ? !title.equals(that.title) : that.title != null) return false; + if (journal != null ? !journal.equals(that.journal) : that.journal != null) return false; + if (referenceType != that.referenceType) return false; + return authors != null ? authors.equals(that.authors) : that.authors == null; + + } + + @Override + public int hashCode() { + int result = id != null ? id.hashCode() : 0; + result = 31 * result + (title != null ? title.hashCode() : 0); + result = 31 * result + (journal != null ? journal.hashCode() : 0); + result = 31 * result + (referenceType != null ? referenceType.hashCode() : 0); + result = 31 * result + (authors != null ? authors.hashCode() : 0); + return result; + } +} \ No newline at end of file diff --git a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/features/PublicationReferenceAuthor.java b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/features/PublicationReferenceAuthor.java new file mode 100644 index 0000000000..18df630825 --- /dev/null +++ b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/features/PublicationReferenceAuthor.java @@ -0,0 +1,60 @@ +package org.biojava.nbio.core.sequence.features; + +public class PublicationReferenceAuthor { + + private String firstName, lastName, fullName; + + public String getFirstName() { + return firstName; + } + + public void setFirstName(String firstName) { + this.firstName = firstName; + } + + public String getLastName() { + return lastName; + } + + public void setLastName(String lastName) { + this.lastName = lastName; + } + + public String getFullName() { + return fullName; + } + + public void setFullName(String fullName) { + this.fullName = fullName; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + PublicationReferenceAuthor that = (PublicationReferenceAuthor) o; + + if (firstName != null ? !firstName.equals(that.firstName) : that.firstName != null) return false; + if (lastName != null ? !lastName.equals(that.lastName) : that.lastName != null) return false; + return fullName != null ? fullName.equals(that.fullName) : that.fullName == null; + + } + + @Override + public int hashCode() { + int result = firstName != null ? firstName.hashCode() : 0; + result = 31 * result + (lastName != null ? lastName.hashCode() : 0); + result = 31 * result + (fullName != null ? fullName.hashCode() : 0); + return result; + } + + @Override + public String toString() { + return "PublicationAuthor{" + + "firstName='" + firstName + '\'' + + ", lastName='" + lastName + '\'' + + ", fullName='" + fullName + '\'' + + '}'; + } +} \ No newline at end of file diff --git a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReader.java b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReader.java index a506743643..3ac38b4894 100644 --- a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReader.java +++ b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReader.java @@ -36,6 +36,7 @@ import org.biojava.nbio.core.sequence.compound.NucleotideCompound; import org.biojava.nbio.core.sequence.features.AbstractFeature; import org.biojava.nbio.core.sequence.features.DBReferenceInfo; +import org.biojava.nbio.core.sequence.features.PublicationReference; import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface; import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface; import org.biojava.nbio.core.sequence.template.AbstractSequence; @@ -45,6 +46,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.List; /** * Use GenbankReaderHelper as an example of how to use this class where GenbankReaderHelper should be the @@ -148,6 +150,12 @@ public LinkedHashMap process(int max) throws IOException, CompoundNotF S sequence = (S) sequenceCreator.getSequence(seqString, 0); genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence); + sequence.setPublicationReference(genbankParser.getPublicationReferences()); + sequence.setKeywords(genbankParser.getKeyWords()); + sequence.setSource(genbankParser.getSource()); + sequence.setOrganism(genbankParser.getOrganism()); + sequence.setComment(genbankParser.getComment()); + // add features to new sequence for (String k: genbankParser.getFeatures().keySet()){ for (AbstractFeature f: genbankParser.getFeatures(k)){ @@ -163,6 +171,8 @@ public LinkedHashMap process(int max) throws IOException, CompoundNotF sequence.setTaxonomy(new TaxonomyID(q.getDatabase()+":"+q.getId(), DataSource.GENBANK)); } + + sequences.put(sequence.getAccession().getID(), sequence); } br.close(); diff --git a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankSequenceParser.java b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankSequenceParser.java index eab171167e..3e3bf4bc3d 100644 --- a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankSequenceParser.java +++ b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankSequenceParser.java @@ -37,10 +37,7 @@ import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; import org.biojava.nbio.core.sequence.compound.DNACompoundSet; import org.biojava.nbio.core.sequence.compound.RNACompoundSet; -import org.biojava.nbio.core.sequence.features.AbstractFeature; -import org.biojava.nbio.core.sequence.features.DBReferenceInfo; -import org.biojava.nbio.core.sequence.features.Qualifier; -import org.biojava.nbio.core.sequence.features.TextFeature; +import org.biojava.nbio.core.sequence.features.*; import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface; import org.biojava.nbio.core.sequence.location.InsdcParser; import org.biojava.nbio.core.sequence.location.template.AbstractLocation; @@ -51,379 +48,460 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.xml.transform.Source; import java.io.BufferedReader; import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; -public class GenbankSequenceParser, C extends Compound> implements SequenceParserInterface{ - - private String seqData = null; - private GenericGenbankHeaderParser headerParser; - private String header; - private String accession; - public LinkedHashMap> mapDB; - /** - * this data structure collects list of features extracted from the - * FEATURE_TAG section They are organized by list of the same type (i.e. - * same genbank Feature) and are provided with location - */ - private HashMap> featureCollection; - - private Logger log = LoggerFactory.getLogger(getClass()); - - // this is a compoundset parsed from header. - private CompoundSet compoundType; - - /** - * The name of this format - */ - public static final String GENBANK_FORMAT = "GENBANK"; - - protected static final String LOCUS_TAG = "LOCUS"; - protected static final String DEFINITION_TAG = "DEFINITION"; - protected static final String ACCESSION_TAG = "ACCESSION"; - protected static final String VERSION_TAG = "VERSION"; - protected static final String KEYWORDS_TAG = "KEYWORDS"; - // "SEGMENT" - protected static final String SOURCE_TAG = "SOURCE"; - protected static final String ORGANISM_TAG = "ORGANISM"; - protected static final String REFERENCE_TAG = "REFERENCE"; - protected static final String AUTHORS_TAG = "AUTHORS"; - protected static final String CONSORTIUM_TAG = "CONSRTM"; - protected static final String TITLE_TAG = "TITLE"; - protected static final String JOURNAL_TAG = "JOURNAL"; - protected static final String PUBMED_TAG = "PUBMED"; - protected static final String MEDLINE_TAG = "MEDLINE"; //deprecated - protected static final String REMARK_TAG = "REMARK"; - protected static final String COMMENT_TAG = "COMMENT"; - protected static final String FEATURE_TAG = "FEATURES"; - protected static final String BASE_COUNT_TAG_FULL = "BASE COUNT"; //deprecated - protected static final String BASE_COUNT_TAG = "BASE"; - // "CONTIG" - protected static final String START_SEQUENCE_TAG = "ORIGIN"; - protected static final String END_SEQUENCE_TAG = "//"; - // locus line - protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+\\d+\\s+(bp|aa)\\s{1,4}(([dms]s-)?(\\S+))?\\s+(circular|linear)?\\s*(\\S+)?\\s*(\\S+)?$"); - // version line - protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$"); - // reference line - protected static final Pattern refRange = Pattern.compile("^\\s*(\\d+)\\s+to\\s+(\\d+)$"); - protected static final Pattern refp = Pattern.compile("^(\\d+)\\s*(?:(\\((?:bases|residues)\\s+(\\d+\\s+to\\s+\\d+(\\s*;\\s*\\d+\\s+to\\s+\\d+)*)\\))|\\(sites\\))?"); - // dbxref line - protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$"); - - protected static final InsdcParser locationParser = new InsdcParser(DataSource.GENBANK); - //sections start at a line and continue till the first line afterwards with a - //non-whitespace first character - //we want to match any of the following as a new section within a section - // \s{0,8} word \s{0,7} value - // \s{21} /word = value - // \s{21} /word - protected static final Pattern sectp = Pattern.compile("^(\\s{0,8}(\\S+)\\s{0,7}(.*)|\\s{21}(/\\S+?)=(.*)|\\s{21}(/\\S+))$"); - - protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)"); - protected static final Pattern headerLine = Pattern.compile("^LOCUS.*"); - private static final String DBSOURCE = "DBSOURCE"; - private static final String PRIMARY = "PRIMARY"; - private static final String DBLINK = "DBLINK"; +public class GenbankSequenceParser, C extends Compound> implements SequenceParserInterface { + + private String seqData = null; + private GenericGenbankHeaderParser headerParser; + private String header; + private String accession; + private String source; + private String organism; + private String comment; + public LinkedHashMap> mapDB; + /** + * this data structure collects list of features extracted from the + * FEATURE_TAG section They are organized by list of the same type (i.e. + * same genbank Feature) and are provided with location + */ + private HashMap> featureCollection; + private List references = new ArrayList(); + + private Logger log = LoggerFactory.getLogger(getClass()); + + // this is a compoundset parsed from header. + private CompoundSet compoundType; + + /** + * The name of this format + */ + public static final String GENBANK_FORMAT = "GENBANK"; + + protected static final String LOCUS_TAG = "LOCUS"; + protected static final String DEFINITION_TAG = "DEFINITION"; + protected static final String ACCESSION_TAG = "ACCESSION"; + protected static final String VERSION_TAG = "VERSION"; + protected static final String KEYWORDS_TAG = "KEYWORDS"; + // "SEGMENT" + protected static final String SOURCE_TAG = "SOURCE"; + protected static final String ORGANISM_TAG = "ORGANISM"; + protected static final String REFERENCE_TAG = "REFERENCE"; + protected static final String AUTHORS_TAG = "AUTHORS"; + protected static final String CONSORTIUM_TAG = "CONSRTM"; + protected static final String TITLE_TAG = "TITLE"; + protected static final String JOURNAL_TAG = "JOURNAL"; + protected static final String PUBMED_TAG = "PUBMED"; + protected static final String MEDLINE_TAG = "MEDLINE"; //deprecated + protected static final String REMARK_TAG = "REMARK"; + protected static final String COMMENT_TAG = "COMMENT"; + protected static final String FEATURE_TAG = "FEATURES"; + protected static final String BASE_COUNT_TAG_FULL = "BASE COUNT"; //deprecated + protected static final String BASE_COUNT_TAG = "BASE"; + // "CONTIG" + protected static final String START_SEQUENCE_TAG = "ORIGIN"; + protected static final String END_SEQUENCE_TAG = "//"; + // locus line + protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+\\d+\\s+(bp|aa)\\s{1,4}(([dms]s-)?(\\S+))?\\s+(circular|linear)?\\s*(\\S+)?\\s*(\\S+)?$"); + // version line + protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$"); + // reference line + protected static final Pattern refRange = Pattern.compile("^\\s*(\\d+)\\s+to\\s+(\\d+)$"); + protected static final Pattern refp = Pattern.compile("^(\\d+)\\s*(?:(\\((?:bases|residues)\\s+(\\d+\\s+to\\s+\\d+(\\s*;\\s*\\d+\\s+to\\s+\\d+)*)\\))|\\(sites\\))?"); + // dbxref line + protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$"); + + protected static final Pattern authorsPattern = Pattern.compile("(\\w+?,(\\w{1}.)*)"); + + protected static final InsdcParser locationParser = new InsdcParser(DataSource.GENBANK); + //sections start at a line and continue till the first line afterwards with a + //non-whitespace first character + //we want to match any of the following as a new section within a section + // \s{0,8} word \s{0,7} value + // \s{21} /word = value + // \s{21} /word + protected static final Pattern sectp = Pattern.compile("^(\\s{0,8}(\\S+)\\s{0,7}(.*)|\\s{21}(/\\S+?)=(.*)|\\s{21}(/\\S+))$"); + + protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)"); + protected static final Pattern headerLine = Pattern.compile("^LOCUS.*"); + private static final String DBSOURCE = "DBSOURCE"; + private static final String PRIMARY = "PRIMARY"; + private static final String DBLINK = "DBLINK"; // private NCBITaxon tax = null; - - private String parse(BufferedReader bufferedReader) { - String sectionKey = null; - List section; - // Get an ordered list of key->value pairs in array-tuples - do { - section = this.readSection(bufferedReader); - sectionKey = section.get(0)[0]; - if (sectionKey == null) { - //if we reach the end of the file, section contains empty strings - if(section.get(0)[1]==null || section.get(0)[1]=="" || - section.get(0)[1].length()==0) { - throw new ParserException(Messages.ENDOFFILE); - } - throw new ParserException(Messages.SECTIONKEYNULL); - } - // process section-by-section - if (sectionKey.equals(LOCUS_TAG)) { - String loc = section.get(0)[1]; - header = loc; - Matcher m = lp.matcher(loc); - if (m.matches()) { - headerParser.setName(m.group(1)); - headerParser.setAccession(m.group(1)); // default if no accession found - - String lengthUnits = m.group(2); - String type = m.group(5); - - if (lengthUnits.equals("aa")) { - compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet(); - } else if (lengthUnits.equals("bp")) { - if (type != null) { - if (type.contains("RNA")) { - compoundType = RNACompoundSet.getRNACompoundSet(); - } else { - compoundType = DNACompoundSet.getDNACompoundSet(); - } - } else { - compoundType = DNACompoundSet.getDNACompoundSet(); - } - } - - log.debug("compound type: {}", compoundType.getClass().getSimpleName()); - - } else { - throw new ParserException("Bad locus line"); - } - } else if (sectionKey.equals(DEFINITION_TAG)) { - headerParser.setDescription(section.get(0)[1]); - } else if (sectionKey.equals(ACCESSION_TAG)) { - // if multiple accessions, store only first as accession, - // and store rest in annotation - String[] accs = section.get(0)[1].split("\\s+"); - accession = accs[0].trim(); - headerParser.setAccession(accession); - } else if (sectionKey.equals(VERSION_TAG)) { - String ver = section.get(0)[1]; - Matcher m = vp.matcher(ver); - if (m.matches()) { - String verAcc = m.group(1); - if (!accession.equals(verAcc)) { - // the version refers to a different accession! - // believe the version line, and store the original - // accession away in the additional accession set - accession = verAcc; - } - if (m.group(3) != null) { - headerParser.setVersion(Integer.parseInt(m.group(3))); - } - if (m.group(5) != null) { - headerParser.setIdentifier(m.group(5)); - } - } else { - throw new ParserException("Bad version line"); - } - } else if (sectionKey.equals(KEYWORDS_TAG)) { - } else if (sectionKey.equals(SOURCE_TAG)) { - // ignore - can get all this from the first feature - } else if (sectionKey.equals(REFERENCE_TAG)) { - } else if (sectionKey.equals(COMMENT_TAG)) { - // Set up some comments - headerParser.setComment(section.get(0)[1]); - } else if (sectionKey.equals(FEATURE_TAG)) { - // starting from second line of input, start a new feature whenever we come across - // a key that does not start with / - AbstractFeature gbFeature = null; - for (int i = 1; i < section.size(); i++) { - String key = section.get(i)[0]; - String val = section.get(i)[1]; - if (key.startsWith("/")) { - if (gbFeature == null) { - throw new ParserException("Malformed GenBank file: found a qualifier without feature."); - } - key = key.substring(1); // strip leading slash - val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim(); - if (val.endsWith("\"")) { - val = val.substring(1, val.length() - 1); // strip quotes - } - // parameter on old feature - if (key.equals("db_xref")) { - Matcher m = dbxp.matcher(val); - if (m.matches()) { - String dbname = m.group(1); - String raccession = m.group(2); - Qualifier xref = new DBReferenceInfo(dbname, raccession); - gbFeature.addQualifier(key, xref); - - ArrayList listDBEntry = new ArrayList(); - listDBEntry.add((DBReferenceInfo) xref); - mapDB.put(key, listDBEntry); - } else { - throw new ParserException("Bad dbxref"); - } - } else if (key.equalsIgnoreCase("organism")) { - Qualifier q = new Qualifier(key, val.replace('\n', ' ')); - gbFeature.addQualifier(key, q); - } else { - if (key.equalsIgnoreCase("translation")) { - // strip spaces from sequence - val = val.replaceAll("\\s+", ""); - Qualifier q = new Qualifier(key, val); - gbFeature.addQualifier(key, q); - } else { - Qualifier q = new Qualifier(key, val); - gbFeature.addQualifier(key, q); - } - } - } else { - // new feature! - gbFeature = new TextFeature(key, val, key, key); - Location l = - locationParser.parse(val); - gbFeature.setLocation((AbstractLocation)l); - - if (!featureCollection.containsKey(key)) { - featureCollection.put(key, new ArrayList()); - } - featureCollection.get(key).add(gbFeature); - } - } - } else if (sectionKey.equals(BASE_COUNT_TAG)) { - // ignore - can calculate from sequence content later if needed - } else if (sectionKey.equals(START_SEQUENCE_TAG)) { - // our first line is ignorable as it is the ORIGIN tag - // the second line onwards conveniently have the number as - // the [0] tuple, and sequence string as [1] so all we have - // to do is concat the [1] parts and then strip out spaces, - // and replace '.' and '~' with '-' for our parser. - StringBuffer seq = new StringBuffer(); - for (int i = 1; i < section.size(); i++) { - seq.append(section.get(i)[1]); - } - seqData = seq.toString().replaceAll("\\s+", "").replaceAll("[\\.|~]", "-").toUpperCase(); - } else if(sectionKey.equals(DBSOURCE)) { - //TODO - } else if(sectionKey.equals(PRIMARY)) { - //TODO - } else if(sectionKey.equals(DBLINK)) { - //TODO - } else { - if(!sectionKey.equals(END_SEQUENCE_TAG)) { - log.info("found unknown section key: "+sectionKey); - } - } - } while (!sectionKey.equals(END_SEQUENCE_TAG)); - return seqData; - } - - - - // reads an indented section, combining split lines and creating a list of - // key->value tuples - // reads an indented section, combining split lines and creating a list of - // key->value tuples - // reads an indented section, combining split lines and creating a list of - // key->value tuples - private List readSection(BufferedReader bufferedReader) { - List section = new ArrayList(); - String line = ""; - - String currKey = null; - StringBuffer currVal = new StringBuffer(); - boolean done = false; - int linecount = 0; - - try { - while (!done) { - bufferedReader.mark(320); - line = bufferedReader.readLine(); - String firstSecKey = section.isEmpty() ? "" - : section.get(0)[0]; - if (line != null && line.matches("\\p{Space}*")) { - // regular expression \p{Space}* will match line - // having only white space characters - continue; - } - if (line == null - || (!line.startsWith(" ") && linecount++ > 0 && (!firstSecKey - .equals(START_SEQUENCE_TAG) || line - .startsWith(END_SEQUENCE_TAG)))) { - // dump out last part of section - section.add(new String[]{currKey, currVal.toString()}); - bufferedReader.reset(); - done = true; - } else { - Matcher m = sectp.matcher(line); - if (m.matches()) { - // new key - if (currKey != null) { - section.add(new String[]{currKey, - currVal.toString()}); - } - // key = group(2) or group(4) or group(6) - whichever is - // not null - currKey = m.group(2) == null ? (m.group(4) == null ? m - .group(6) : m.group(4)) : m.group(2); - currVal = new StringBuffer(); - // val = group(3) if group(2) not null, group(5) if - // group(4) not null, "" otherwise, trimmed - currVal.append((m.group(2) == null ? (m.group(4) == null ? "" - : m.group(5)) - : m.group(3)).trim()); - } else { - // concatted line or SEQ START/END line? - if (line.startsWith(START_SEQUENCE_TAG) - || line.startsWith(END_SEQUENCE_TAG)) { - currKey = line; - } else { - currVal.append("\n"); // newline in between lines - - // can be removed later - currVal.append(currKey.charAt(0) == '/' ? line - .substring(21) : line.substring(12)); - } - } - } - } - } catch (IOException e) { - throw new ParserException(e.getMessage()); - } catch (RuntimeException e) { - throw new ParserException(e.getMessage()); - } - return section; - } - - @Override - public String getSequence(BufferedReader bufferedReader, int sequenceLength) throws IOException { - featureCollection = new HashMap>(); - mapDB = new LinkedHashMap>(); - headerParser = new GenericGenbankHeaderParser(); - try { - parse(bufferedReader); - } catch (ParserException e) { - if(e.getMessage().equalsIgnoreCase(Messages.ENDOFFILE)) return null; - else throw new ParserException(e.getMessage()); - } - - return seqData; - } - - public String getHeader() { - return header; - } - - public GenericGenbankHeaderParser getSequenceHeaderParser() { - return headerParser; - } - - public LinkedHashMap> getDatabaseReferences() { - return mapDB; - } - - public ArrayList getKeyWords() { - return new ArrayList(featureCollection.keySet()); - } - - public ArrayList getFeatures(String keyword) { - return featureCollection.get(keyword); - } - public HashMap> getFeatures() { - return featureCollection; - } - - public void parseFeatures(AbstractSequence sequence) { - for (String k: featureCollection.keySet()) - for (AbstractFeature f: featureCollection.get(k)) - sequence.addFeature(f); - } - - public CompoundSet getCompoundType() { - return compoundType; - } + private String parse(BufferedReader bufferedReader) { + String sectionKey = null; + List section; + // Get an ordered list of key->value pairs in array-tuples + do { + section = this.readSection(bufferedReader); + sectionKey = section.get(0)[0]; + if (sectionKey == null) { + //if we reach the end of the file, section contains empty strings + if (section.get(0)[1] == null || section.get(0)[1] == "" || + section.get(0)[1].length() == 0) { + throw new ParserException(Messages.ENDOFFILE); + } + throw new ParserException(Messages.SECTIONKEYNULL); + } + // process section-by-section + if (sectionKey.equals(LOCUS_TAG)) { + String loc = section.get(0)[1]; + header = loc; + Matcher m = lp.matcher(loc); + if (m.matches()) { + headerParser.setName(m.group(1)); + headerParser.setAccession(m.group(1)); // default if no accession found + + String lengthUnits = m.group(2); + String type = m.group(5); + + if (lengthUnits.equals("aa")) { + compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet(); + } else if (lengthUnits.equals("bp")) { + if (type != null) { + if (type.contains("RNA")) { + compoundType = RNACompoundSet.getRNACompoundSet(); + } else { + compoundType = DNACompoundSet.getDNACompoundSet(); + } + } else { + compoundType = DNACompoundSet.getDNACompoundSet(); + } + } + + log.debug("compound type: {}", compoundType.getClass().getSimpleName()); + + } else { + throw new ParserException("Bad locus line"); + } + } else if (sectionKey.equals(DEFINITION_TAG)) { + headerParser.setDescription(section.get(0)[1]); + } else if (sectionKey.equals(ACCESSION_TAG)) { + // if multiple accessions, store only first as accession, + // and store rest in annotation + String[] accs = section.get(0)[1].split("\\s+"); + accession = accs[0].trim(); + headerParser.setAccession(accession); + } else if (sectionKey.equals(VERSION_TAG)) { + String ver = section.get(0)[1]; + Matcher m = vp.matcher(ver); + if (m.matches()) { + String verAcc = m.group(1); + if (!accession.equals(verAcc)) { + // the version refers to a different accession! + // believe the version line, and store the original + // accession away in the additional accession set + accession = verAcc; + } + if (m.group(3) != null) { + headerParser.setVersion(Integer.parseInt(m.group(3))); + } + if (m.group(5) != null) { + headerParser.setIdentifier(m.group(5)); + } + } else { + throw new ParserException("Bad version line"); + } + } else if (sectionKey.equals(KEYWORDS_TAG)) { + } else if (sectionKey.equals(SOURCE_TAG)) { + if (section.size() == 2) { + String[] source = section.get(0); + if (source.length == 2) { + this.source = source[1]; + } + String[] organism = section.get(1); + if (organism.length == 2) { + this.organism = organism[1]; + } + } + } else if (sectionKey.equals(REFERENCE_TAG)) { + + PublicationReference reference = new PublicationReference(); + String[] authorSection = section.get(1); + if (authorSection.length > 1) { + if (AUTHORS_TAG.equals(authorSection[0])) { + Matcher matcher = authorsPattern.matcher(authorSection[1]); + while (matcher.find()) { + String fullName = matcher.group(); + String[] names = fullName.split(","); + String lastName = names[0]; + String firstName = names[1]; + PublicationReferenceAuthor author = new PublicationReferenceAuthor(); + author.setFirstName(firstName); + author.setLastName(lastName); + author.setFullName(firstName + " " + lastName); + reference.getAuthors().add(author); + } + } else if (CONSORTIUM_TAG.equals(authorSection[0])) { + String fullName = authorSection[1]; + PublicationReferenceAuthor author = new PublicationReferenceAuthor(); + author.setFullName(fullName); + reference.getAuthors().add(author); + } + } + + String title = section.get(2)[1]; + title = title.replace("\n", ""); + reference.setTitle(title); + + String journal = section.get(3)[1]; + journal = journal.replace("\n", " "); + + if (section.size() == 5) { + reference.setReferenceType(PublicationReference.ReferenceType.PUBMED); + String pubmedId = section.get(4)[1]; + reference.setId(pubmedId); + } else if (journal.startsWith("Patent")) { + reference.setReferenceType(PublicationReference.ReferenceType.PATENT); + } else if (title.startsWith("Direct Submission")) { + reference.setReferenceType(PublicationReference.ReferenceType.DIRECT_SUBMISSION); + } else { + reference.setReferenceType(PublicationReference.ReferenceType.UNKNOWN); + } + reference.setJournal(journal); + references.add(reference); + + + } else if (sectionKey.equals(COMMENT_TAG)) { + // Set up some comments + String comment = section.get(0)[1]; + if (comment != null) { + String cleaned = comment.replace("\n", " "); + headerParser.setComment(cleaned); + this.comment = cleaned; + } + } else if (sectionKey.equals(FEATURE_TAG)) { + // starting from second line of input, start a new feature whenever we come across + // a key that does not start with / + AbstractFeature gbFeature = null; + for (int i = 1; i < section.size(); i++) { + String key = section.get(i)[0]; + String val = section.get(i)[1]; + if (key.startsWith("/")) { + if (gbFeature == null) { + throw new ParserException("Malformed GenBank file: found a qualifier without feature."); + } + key = key.substring(1); // strip leading slash + val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim(); + if (val.endsWith("\"")) { + val = val.substring(1, val.length() - 1); // strip quotes + } + // parameter on old feature + if (key.equals("db_xref")) { + Matcher m = dbxp.matcher(val); + if (m.matches()) { + String dbname = m.group(1); + String raccession = m.group(2); + Qualifier xref = new DBReferenceInfo(dbname, raccession); + gbFeature.addQualifier(key, xref); + + ArrayList listDBEntry = new ArrayList(); + listDBEntry.add((DBReferenceInfo) xref); + mapDB.put(key, listDBEntry); + } else { + throw new ParserException("Bad dbxref"); + } + } else if (key.equalsIgnoreCase("organism")) { + Qualifier q = new Qualifier(key, val.replace('\n', ' ')); + gbFeature.addQualifier(key, q); + } else { + if (key.equalsIgnoreCase("translation")) { + // strip spaces from sequence + val = val.replaceAll("\\s+", ""); + Qualifier q = new Qualifier(key, val); + gbFeature.addQualifier(key, q); + } else { + Qualifier q = new Qualifier(key, val); + gbFeature.addQualifier(key, q); + } + } + } else { + // new feature! + gbFeature = new TextFeature(key, val, key, key); + Location l = + locationParser.parse(val); + gbFeature.setLocation((AbstractLocation) l); + + if (!featureCollection.containsKey(key)) { + featureCollection.put(key, new ArrayList()); + } + featureCollection.get(key).add(gbFeature); + } + } + } else if (sectionKey.equals(BASE_COUNT_TAG)) { + // ignore - can calculate from sequence content later if needed + } else if (sectionKey.equals(START_SEQUENCE_TAG)) { + // our first line is ignorable as it is the ORIGIN tag + // the second line onwards conveniently have the number as + // the [0] tuple, and sequence string as [1] so all we have + // to do is concat the [1] parts and then strip out spaces, + // and replace '.' and '~' with '-' for our parser. + StringBuffer seq = new StringBuffer(); + for (int i = 1; i < section.size(); i++) { + seq.append(section.get(i)[1]); + } + seqData = seq.toString().replaceAll("\\s+", "").replaceAll("[\\.|~]", "-").toUpperCase(); + } else if (sectionKey.equals(DBSOURCE)) { + //TODO + } else if (sectionKey.equals(PRIMARY)) { + //TODO + } else if (sectionKey.equals(DBLINK)) { + //TODO + } else { + if (!sectionKey.equals(END_SEQUENCE_TAG)) { + log.info("found unknown section key: " + sectionKey); + } + } + } + while (!sectionKey.equals(END_SEQUENCE_TAG)); + return seqData; + } + + + // reads an indented section, combining split lines and creating a list of + // key->value tuples + // reads an indented section, combining split lines and creating a list of + // key->value tuples + // reads an indented section, combining split lines and creating a list of + // key->value tuples + private List readSection(BufferedReader bufferedReader) { + List section = new ArrayList(); + String line = ""; + + String currKey = null; + StringBuffer currVal = new StringBuffer(); + boolean done = false; + int linecount = 0; + + try { + while (!done) { + bufferedReader.mark(320); + line = bufferedReader.readLine(); + String firstSecKey = section.isEmpty() ? "" + : section.get(0)[0]; + if (line != null && line.matches("\\p{Space}*")) { + // regular expression \p{Space}* will match line + // having only white space characters + continue; + } + if (line == null + || (!line.startsWith(" ") && linecount++ > 0 && (!firstSecKey + .equals(START_SEQUENCE_TAG) || line + .startsWith(END_SEQUENCE_TAG)))) { + // dump out last part of section + section.add(new String[]{currKey, currVal.toString()}); + bufferedReader.reset(); + done = true; + } else { + Matcher m = sectp.matcher(line); + if (m.matches()) { + // new key + if (currKey != null) { + section.add(new String[]{currKey, + currVal.toString()}); + } + // key = group(2) or group(4) or group(6) - whichever is + // not null + currKey = m.group(2) == null ? (m.group(4) == null ? m + .group(6) : m.group(4)) : m.group(2); + currVal = new StringBuffer(); + // val = group(3) if group(2) not null, group(5) if + // group(4) not null, "" otherwise, trimmed + currVal.append((m.group(2) == null ? (m.group(4) == null ? "" + : m.group(5)) + : m.group(3)).trim()); + } else { + // concatted line or SEQ START/END line? + if (line.startsWith(START_SEQUENCE_TAG) + || line.startsWith(END_SEQUENCE_TAG)) { + currKey = line; + } else { + currVal.append("\n"); // newline in between lines - + // can be removed later + currVal.append(currKey.charAt(0) == '/' ? line + .substring(21) : line.substring(12)); + } + } + } + } + } catch (IOException e) { + throw new ParserException(e.getMessage()); + } catch (RuntimeException e) { + throw new ParserException(e.getMessage()); + } + return section; + } + + @Override + public String getSequence(BufferedReader bufferedReader, int sequenceLength) throws IOException { + featureCollection = new HashMap>(); + mapDB = new LinkedHashMap>(); + headerParser = new GenericGenbankHeaderParser(); + try { + parse(bufferedReader); + } catch (ParserException e) { + if (e.getMessage().equalsIgnoreCase(Messages.ENDOFFILE)) return null; + else throw new ParserException(e.getMessage()); + } + + return seqData; + } + + public String getHeader() { + return header; + } + + public GenericGenbankHeaderParser getSequenceHeaderParser() { + return headerParser; + } + + public LinkedHashMap> getDatabaseReferences() { + return mapDB; + } + + public ArrayList getKeyWords() { + return new ArrayList(featureCollection.keySet()); + } + + public ArrayList getFeatures(String keyword) { + return featureCollection.get(keyword); + } + + public HashMap> getFeatures() { + return featureCollection; + } + + public void parseFeatures(AbstractSequence sequence) { + for (String k : featureCollection.keySet()) + for (AbstractFeature f : featureCollection.get(k)) + sequence.addFeature(f); + } + + public List getPublicationReferences() { + return Collections.unmodifiableList(references); + } + + public CompoundSet getCompoundType() { + return compoundType; + } + + public String getSource() { + return source; + } + + public String getOrganism() { + return organism; + } + + public String getComment() { + return comment; + } } diff --git a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/template/AbstractSequence.java b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/template/AbstractSequence.java index f34cf1642d..17cb856371 100644 --- a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/template/AbstractSequence.java +++ b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/template/AbstractSequence.java @@ -43,623 +43,627 @@ import java.util.*; /** - * * The base class for DNA, RNA and Protein sequences. + * * @param */ public abstract class AbstractSequence implements Sequence { - private final static Logger logger = LoggerFactory.getLogger(AbstractSequence.class); - - private TaxonomyID taxonomy; - private AccessionID accession; - private SequenceReader sequenceStorage = null; - private CompoundSet compoundSet; - private AnnotationType annotationType = AnnotationType.UNKNOWN; - private String description; - private String originalHeader; - private Collection userCollection; - private Integer bioBegin = null; - private Integer bioEnd = null; - private AbstractSequence parentSequence = null; - private String source = null; - private ArrayList notesList = new ArrayList(); - private Double sequenceScore = null; - private FeaturesKeyWordInterface featuresKeyWord = null; - private DatabaseReferenceInterface databaseReferences = null; - private FeatureRetriever featureRetriever = null; - private ArrayList, C>> features = - new ArrayList, C>>(); - private LinkedHashMap, C>>> groupedFeatures = - new LinkedHashMap, C>>>(); - - public AbstractSequence() { - } - - /** - * Create a Sequence from a simple string where the values should be found in compoundSet - * @param seqString - * @param compoundSet - * @throws CompoundNotFoundException - */ - public AbstractSequence(String seqString, CompoundSet compoundSet) throws CompoundNotFoundException { - setCompoundSet(compoundSet); - sequenceStorage = new ArrayListSequenceReader(); - sequenceStorage.setCompoundSet(this.getCompoundSet()); - sequenceStorage.setContents(seqString); - } - - /** - * A ProxySequenceReader allows abstraction of both the storage of the sequence data and the location - * of the sequence data. A variety of use cases are possible. A ProxySequenceReader that knows the offset and of the sequence in - * a large fasta file. A ProxySequenceReader that can pull Sequence data from UniProt, NCBI or a custom database. - * If the ProxySequenceReader implements various interfaces then the sequence will set those interfaces so that calls to - * various methods will be valid. - * - * @param proxyLoader - * @param compoundSet - */ - public AbstractSequence(SequenceReader proxyLoader, CompoundSet compoundSet) { - setCompoundSet(compoundSet); - setProxySequenceReader(proxyLoader); - } - - /** - * Very important method that allows external mappings of sequence data and features. This method - * will gain additional interface inspection that allows external data sources with knowledge - * of features for a sequence to be supported. - * - * @param proxyLoader - */ - public void setProxySequenceReader(SequenceReader proxyLoader) { - this.sequenceStorage = proxyLoader; - if (proxyLoader instanceof FeaturesKeyWordInterface) { - this.setFeaturesKeyWord((FeaturesKeyWordInterface) sequenceStorage); - } - if (proxyLoader instanceof DatabaseReferenceInterface) { - this.setDatabaseReferences((DatabaseReferenceInterface) sequenceStorage); - } - - if (proxyLoader instanceof FeatureRetriever) { - this.setFeatureRetriever((FeatureRetriever) sequenceStorage); - HashMap> ff = getFeatureRetriever().getFeatures(); - for (String k: ff.keySet()){ - for (AbstractFeature f: ff.get(k)){ - this.addFeature(f); - } - } - // success of next statement guaranteed because source is a compulsory field - //DBReferenceInfo dbQualifier = (DBReferenceInfo)ff.get("source").get(0).getQualifiers().get("db_xref"); - ArrayList dbQualifiers = (ArrayList)ff.get("source").get(0).getQualifiers().get("db_xref"); - DBReferenceInfo dbQualifier = dbQualifiers.get(0); - - if (dbQualifier != null) this.setTaxonomy(new TaxonomyID(dbQualifier.getDatabase()+":"+dbQualifier.getId(), DataSource.UNKNOWN)); - } - - if(getAccession() == null && proxyLoader instanceof UniprotProxySequenceReader){ // we have lots of unsupported operations for this call so quick fix to allow this tow rork - this.setAccession(proxyLoader.getAccession()); - } - } - - public SequenceReader getProxySequenceReader() { - return sequenceStorage; - } - - /** - * @return the bioBegin - */ - public Integer getBioBegin() { - if (bioBegin == null) { - return 1; - } else { - return bioBegin; - } - } - - /** - * @param bioBegin the bioBegin to set - */ - public void setBioBegin(Integer begin) { - this.bioBegin = begin; - } - - /** - * @return the bioEnd - */ - public Integer getBioEnd() { - if (bioEnd == null) { - return this.getLength(); - } else { - return bioEnd; - } - } - - /** - * @param bioEnd the bioEnd to set - */ - public void setBioEnd(Integer end) { - this.bioEnd = end; - } - - /** - * Provided for convince if the developer needs to associate data with a sequence - * - * @return - */ - public Collection getUserCollection() { - - return userCollection; - } - - /** - * - * @param userCollection - */ - public void setUserCollection(Collection userCollection) { - this.userCollection = userCollection; - } - - /** - * @return the annotation - */ - public AnnotationType getAnnotationType() { - return annotationType; - } - - /** - * @param annotation the annotation to set - */ - public void setAnnotationType(AnnotationType annotationType) { - this.annotationType = annotationType; - } - - /** - * @return the description - */ - public String getDescription() { - return description; - } - - /** - * @param description the description to set - */ - public void setDescription(String description) { - this.description = description; - } - - /** - * @return the originalHeader - */ - public String getOriginalHeader() { - return originalHeader; - } - - /** - * @param originalHeader the originalHeader to set - */ - public void setOriginalHeader(String originalHeader) { - this.originalHeader = originalHeader; - } - - /** - * @return the parentSequence - */ - public AbstractSequence getParentSequence() { - return parentSequence; - } - - /** - * @param parentSequence the parentSequence to set - */ - public void setParentSequence(AbstractSequence parentSequence) { - this.parentSequence = parentSequence; - } - - /** - * Added support for the source of this sequence for GFF3 export - * If a sub sequence doesn't have source then check for parent source - * @return the source - */ - public String getSource() { - if (source != null) { - return source; - } - if (parentSequence != null) { - return parentSequence.getSource(); - } - return null; - } - - /** - * Added support for the source of this sequence for GFF3 export - * @param source the source to set - */ - public void setSource(String source) { - - this.source = source; - } - - /** - * Add notes about this sequence that will get exported for GFF3 - * @param note - */ - public void addNote(String note) { - notesList.add(note); - } - - public void removeNote(String note) { - notesList.remove(note); - } - - /** - * @return the notesList - */ - public ArrayList getNotesList() { - return notesList; - } - - /** - * @param notesList the notesList to set - */ - public void setNotesList(ArrayList notesList) { - this.notesList = notesList; - } - - /** - * Provide place holder for a metric that indicate a score associated with the sequence - * @return the sequenceScore - */ - public Double getSequenceScore() { - return sequenceScore; - } - - /** - * @param sequenceScore the sequenceScore to set - */ - public void setSequenceScore(Double sequenceScore) { - this.sequenceScore = sequenceScore; - } - - /** - * Return features at a sequence position by type - * @param featureType - * @param bioSequencePosition - * @return - */ - public List, C>> getFeatures(String featureType, int bioSequencePosition) { - ArrayList, C>> featureHits = - new ArrayList, C>>(); - List, C>> features = getFeaturesByType(featureType); - if (features != null) { - for (FeatureInterface, C> feature : features) { - if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) { - featureHits.add(feature); - } - } - } - return featureHits; - } - - /** - * Return features at a sequence position - * @param featureType - * @param bioSequencePosition - * @return - */ - public List, C>> getFeatures(int bioSequencePosition) { - ArrayList, C>> featureHits = - new ArrayList, C>>(); - if (features != null) { - for (FeatureInterface, C> feature : features) { - if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) { - featureHits.add(feature); - } - } - } - return featureHits; - } - - /** - * - * @return - */ - public List, C>> getFeatures() { - return features; - } - - /** - * Method to help set the proper details for a feature as it relates to a sequence - * where the feature needs to have a location on the sequence - * @param bioStart - * @param bioEnd - * @param feature - */ - public void addFeature(int bioStart, int bioEnd, FeatureInterface, C> feature) { - SequenceLocation, C> sequenceLocation = - new SequenceLocation, C>(bioStart, bioEnd, this); - feature.setLocation(sequenceLocation); - addFeature(feature); - } - - /** - * Add a feature to this sequence. The feature will be added to the collection where the order is start position and if more than - * one feature at the same start position then longest is added first. This helps on doing feature layout for displaying features - * in SequenceFeaturePanel - * @param feature - */ - public void addFeature(FeatureInterface, C> feature) { - features.add(feature); - ArrayList, C>> featureList = groupedFeatures.get(feature.getType()); - if (featureList == null) { - featureList = new ArrayList, C>>(); - groupedFeatures.put(feature.getType(), featureList); - } - featureList.add(feature); - Collections.sort(features, AbstractFeature.LOCATION_LENGTH); - Collections.sort(featureList, AbstractFeature.LOCATION_LENGTH); - } - - /** - * Remove a feature from the sequence - * @param feature - */ - public void removeFeature(FeatureInterface, C> feature) { - features.remove(feature); - ArrayList, C>> featureList = groupedFeatures.get(feature.getType()); - if (featureList != null) { - featureList.remove(feature); - if (featureList.isEmpty()) { - groupedFeatures.remove(feature.getType()); - } - } - } - - /** - * - * @param type - * @return - */ - public List, C>> getFeaturesByType(String type) { - List, C>> features = groupedFeatures.get(type); - if (features == null) { - features = new ArrayList, C>>(); - } - return features; - } - - /** - * @return the featuresKeyWord - */ - public FeaturesKeyWordInterface getFeaturesKeyWord() { - return featuresKeyWord; - } - - /** - * @param featuresKeyWord the featuresKeyWord to set - */ - public void setFeaturesKeyWord(FeaturesKeyWordInterface featuresKeyWord) { - this.featuresKeyWord = featuresKeyWord; - } - - /** - * @return the databaseReferences - */ - public DatabaseReferenceInterface getDatabaseReferences() { - return databaseReferences; - } - - /** - * @param databaseReferences the databaseReferences to set - */ - public void setDatabaseReferences(DatabaseReferenceInterface databaseReferences) { - this.databaseReferences = databaseReferences; - } - - public FeatureRetriever getFeatureRetriever() { - return featureRetriever; - } - - public void setFeatureRetriever(FeatureRetriever featureRetriever) { - this.featureRetriever = featureRetriever; - } - - - - public enum AnnotationType { - - CURATED, PREDICTED, UNKNOWN; - } - - /** - * @return the accession - */ - @Override - public AccessionID getAccession() { - return accession; - } - - /** - * @param accession the accession to set - */ - public void setAccession(AccessionID accession) { - this.accession = accession; - } - - /** - * @return the species - */ - public TaxonomyID getTaxonomy() { - return taxonomy; - } - - /** - * @param species the species to set - */ - public void setTaxonomy(TaxonomyID taxonomy) { - this.taxonomy = taxonomy; - } - - @Override - public CompoundSet getCompoundSet() { - if (compoundSet != null) { - return compoundSet; - } - // This is invalid since the parentSequence isn't guaranteed to have the same compound set as this sequence, - // e.g., the case where the parent sequence for a protein is a CDS. - /* + private final static Logger logger = LoggerFactory.getLogger(AbstractSequence.class); + + private TaxonomyID taxonomy; + private AccessionID accession; + private SequenceReader sequenceStorage = null; + private CompoundSet compoundSet; + private AnnotationType annotationType = AnnotationType.UNKNOWN; + private String description; + private String organism; + private String originalHeader; + private String comment; + private Collection userCollection; + private Integer bioBegin = null; + private Integer bioEnd = null; + private AbstractSequence parentSequence = null; + private String source = null; + private ArrayList notesList = new ArrayList(); + private Double sequenceScore = null; + private FeaturesKeyWordInterface featuresKeyWord = null; + private DatabaseReferenceInterface databaseReferences = null; + private FeatureRetriever featureRetriever = null; + private ArrayList, C>> features = + new ArrayList, C>>(); + private LinkedHashMap, C>>> groupedFeatures = + new LinkedHashMap, C>>>(); + private List publicationReference = new ArrayList(); + private List keywords = new ArrayList(); + + public AbstractSequence() { + } + + /** + * Create a Sequence from a simple string where the values should be found in compoundSet + * + * @param seqString + * @param compoundSet + * @throws CompoundNotFoundException + */ + public AbstractSequence(String seqString, CompoundSet compoundSet) throws CompoundNotFoundException { + setCompoundSet(compoundSet); + sequenceStorage = new ArrayListSequenceReader(); + sequenceStorage.setCompoundSet(this.getCompoundSet()); + sequenceStorage.setContents(seqString); + } + + /** + * A ProxySequenceReader allows abstraction of both the storage of the sequence data and the location + * of the sequence data. A variety of use cases are possible. A ProxySequenceReader that knows the offset and of the sequence in + * a large fasta file. A ProxySequenceReader that can pull Sequence data from UniProt, NCBI or a custom database. + * If the ProxySequenceReader implements various interfaces then the sequence will set those interfaces so that calls to + * various methods will be valid. + * + * @param proxyLoader + * @param compoundSet + */ + public AbstractSequence(SequenceReader proxyLoader, CompoundSet compoundSet) { + setCompoundSet(compoundSet); + setProxySequenceReader(proxyLoader); + } + + /** + * Very important method that allows external mappings of sequence data and features. This method + * will gain additional interface inspection that allows external data sources with knowledge + * of features for a sequence to be supported. + * + * @param proxyLoader + */ + public void setProxySequenceReader(SequenceReader proxyLoader) { + this.sequenceStorage = proxyLoader; + if (proxyLoader instanceof FeaturesKeyWordInterface) { + this.setFeaturesKeyWord((FeaturesKeyWordInterface) sequenceStorage); + } + if (proxyLoader instanceof DatabaseReferenceInterface) { + this.setDatabaseReferences((DatabaseReferenceInterface) sequenceStorage); + } + + if (proxyLoader instanceof FeatureRetriever) { + this.setFeatureRetriever((FeatureRetriever) sequenceStorage); + HashMap> ff = getFeatureRetriever().getFeatures(); + for (String k : ff.keySet()) { + for (AbstractFeature f : ff.get(k)) { + this.addFeature(f); + } + } + // success of next statement guaranteed because source is a compulsory field + //DBReferenceInfo dbQualifier = (DBReferenceInfo)ff.get("source").get(0).getQualifiers().get("db_xref"); + ArrayList dbQualifiers = (ArrayList) ff.get("source").get(0).getQualifiers().get("db_xref"); + DBReferenceInfo dbQualifier = dbQualifiers.get(0); + + if (dbQualifier != null) + this.setTaxonomy(new TaxonomyID(dbQualifier.getDatabase() + ":" + dbQualifier.getId(), DataSource.UNKNOWN)); + } + + if (getAccession() == null && proxyLoader instanceof UniprotProxySequenceReader) { // we have lots of unsupported operations for this call so quick fix to allow this tow rork + this.setAccession(proxyLoader.getAccession()); + } + } + + public SequenceReader getProxySequenceReader() { + return sequenceStorage; + } + + /** + * @return the bioBegin + */ + public Integer getBioBegin() { + if (bioBegin == null) { + return 1; + } else { + return bioBegin; + } + } + + /** + * @param bioBegin the bioBegin to set + */ + public void setBioBegin(Integer begin) { + this.bioBegin = begin; + } + + /** + * @return the bioEnd + */ + public Integer getBioEnd() { + if (bioEnd == null) { + return this.getLength(); + } else { + return bioEnd; + } + } + + /** + * @param bioEnd the bioEnd to set + */ + public void setBioEnd(Integer end) { + this.bioEnd = end; + } + + /** + * Provided for convince if the developer needs to associate data with a sequence + * + * @return + */ + public Collection getUserCollection() { + + return userCollection; + } + + /** + * @param userCollection + */ + public void setUserCollection(Collection userCollection) { + this.userCollection = userCollection; + } + + /** + * @return the annotation + */ + public AnnotationType getAnnotationType() { + return annotationType; + } + + /** + * @param annotation the annotation to set + */ + public void setAnnotationType(AnnotationType annotationType) { + this.annotationType = annotationType; + } + + /** + * @return the description + */ + public String getDescription() { + return description; + } + + /** + * @param description the description to set + */ + public void setDescription(String description) { + this.description = description; + } + + /** + * @return the originalHeader + */ + public String getOriginalHeader() { + return originalHeader; + } + + /** + * @param originalHeader the originalHeader to set + */ + public void setOriginalHeader(String originalHeader) { + this.originalHeader = originalHeader; + } + + /** + * @return the parentSequence + */ + public AbstractSequence getParentSequence() { + return parentSequence; + } + + /** + * @param parentSequence the parentSequence to set + */ + public void setParentSequence(AbstractSequence parentSequence) { + this.parentSequence = parentSequence; + } + + /** + * Added support for the source of this sequence for GFF3 export + * If a sub sequence doesn't have source then check for parent source + * + * @return the source + */ + public String getSource() { + if (source != null) { + return source; + } + if (parentSequence != null) { + return parentSequence.getSource(); + } + return null; + } + + /** + * Added support for the source of this sequence for GFF3 export + * + * @param source the source to set + */ + public void setSource(String source) { + + this.source = source; + } + + /** + * Add notes about this sequence that will get exported for GFF3 + * + * @param note + */ + public void addNote(String note) { + notesList.add(note); + } + + public void removeNote(String note) { + notesList.remove(note); + } + + /** + * @return the notesList + */ + public ArrayList getNotesList() { + return notesList; + } + + /** + * @param notesList the notesList to set + */ + public void setNotesList(ArrayList notesList) { + this.notesList = notesList; + } + + /** + * Provide place holder for a metric that indicate a score associated with the sequence + * + * @return the sequenceScore + */ + public Double getSequenceScore() { + return sequenceScore; + } + + /** + * @param sequenceScore the sequenceScore to set + */ + public void setSequenceScore(Double sequenceScore) { + this.sequenceScore = sequenceScore; + } + + /** + * Return features at a sequence position by type + * + * @param featureType + * @param bioSequencePosition + * @return + */ + public List, C>> getFeatures(String featureType, int bioSequencePosition) { + ArrayList, C>> featureHits = + new ArrayList, C>>(); + List, C>> features = getFeaturesByType(featureType); + if (features != null) { + for (FeatureInterface, C> feature : features) { + if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) { + featureHits.add(feature); + } + } + } + return featureHits; + } + + /** + * Return features at a sequence position + * + * @param featureType + * @param bioSequencePosition + * @return + */ + public List, C>> getFeatures(int bioSequencePosition) { + ArrayList, C>> featureHits = + new ArrayList, C>>(); + if (features != null) { + for (FeatureInterface, C> feature : features) { + if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) { + featureHits.add(feature); + } + } + } + return featureHits; + } + + /** + * @return + */ + public List, C>> getFeatures() { + return features; + } + + /** + * Method to help set the proper details for a feature as it relates to a sequence + * where the feature needs to have a location on the sequence + * + * @param bioStart + * @param bioEnd + * @param feature + */ + public void addFeature(int bioStart, int bioEnd, FeatureInterface, C> feature) { + SequenceLocation, C> sequenceLocation = + new SequenceLocation, C>(bioStart, bioEnd, this); + feature.setLocation(sequenceLocation); + addFeature(feature); + } + + /** + * Add a feature to this sequence. The feature will be added to the collection where the order is start position and if more than + * one feature at the same start position then longest is added first. This helps on doing feature layout for displaying features + * in SequenceFeaturePanel + * + * @param feature + */ + public void addFeature(FeatureInterface, C> feature) { + features.add(feature); + ArrayList, C>> featureList = groupedFeatures.get(feature.getType()); + if (featureList == null) { + featureList = new ArrayList, C>>(); + groupedFeatures.put(feature.getType(), featureList); + } + featureList.add(feature); + Collections.sort(features, AbstractFeature.LOCATION_LENGTH); + Collections.sort(featureList, AbstractFeature.LOCATION_LENGTH); + } + + /** + * Remove a feature from the sequence + * + * @param feature + */ + public void removeFeature(FeatureInterface, C> feature) { + features.remove(feature); + ArrayList, C>> featureList = groupedFeatures.get(feature.getType()); + if (featureList != null) { + featureList.remove(feature); + if (featureList.isEmpty()) { + groupedFeatures.remove(feature.getType()); + } + } + } + + /** + * @param type + * @return + */ + public List, C>> getFeaturesByType(String type) { + List, C>> features = groupedFeatures.get(type); + if (features == null) { + features = new ArrayList, C>>(); + } + return features; + } + + /** + * @return the featuresKeyWord + */ + public FeaturesKeyWordInterface getFeaturesKeyWord() { + return featuresKeyWord; + } + + /** + * @param featuresKeyWord the featuresKeyWord to set + */ + public void setFeaturesKeyWord(FeaturesKeyWordInterface featuresKeyWord) { + this.featuresKeyWord = featuresKeyWord; + } + + /** + * @return the databaseReferences + */ + public DatabaseReferenceInterface getDatabaseReferences() { + return databaseReferences; + } + + /** + * @param databaseReferences the databaseReferences to set + */ + public void setDatabaseReferences(DatabaseReferenceInterface databaseReferences) { + this.databaseReferences = databaseReferences; + } + + public FeatureRetriever getFeatureRetriever() { + return featureRetriever; + } + + public void setFeatureRetriever(FeatureRetriever featureRetriever) { + this.featureRetriever = featureRetriever; + } + + + public enum AnnotationType { + + CURATED, PREDICTED, UNKNOWN; + } + + /** + * @return the accession + */ + @Override + public AccessionID getAccession() { + return accession; + } + + /** + * @param accession the accession to set + */ + public void setAccession(AccessionID accession) { + this.accession = accession; + } + + /** + * @return the species + */ + public TaxonomyID getTaxonomy() { + return taxonomy; + } + + /** + * @param species the species to set + */ + public void setTaxonomy(TaxonomyID taxonomy) { + this.taxonomy = taxonomy; + } + + @Override + public CompoundSet getCompoundSet() { + if (compoundSet != null) { + return compoundSet; + } + // This is invalid since the parentSequence isn't guaranteed to have the same compound set as this sequence, + // e.g., the case where the parent sequence for a protein is a CDS. + /* if (parentSequence != null) { return parentSequence.getCompoundSet(); } */ - return null; - - - } - - public void setCompoundSet(CompoundSet compoundSet) { - this.compoundSet = compoundSet; - } - - @Override - public String toString() { - return getSequenceAsString(); - } - - private SequenceReader getSequenceStorage() { - if (sequenceStorage != null) { - return sequenceStorage; - } - if (parentSequence != null) { - - //return parentSequence.getSequenceStorage(); - - if ( this.compoundSet.equals(parentSequence.getCompoundSet())){ - sequenceStorage = new ArrayListSequenceReader(); - sequenceStorage.setCompoundSet(this.getCompoundSet()); - try { - sequenceStorage.setContents(parentSequence.getSequenceAsString()); - } catch (CompoundNotFoundException e) { - // TODO is there a better way to handle this exception? - logger.error("Problem setting contents from parent sequence, some unrecognised compound: {}",e.getMessage()); - } - return sequenceStorage; - } - - } - - return null; - } - - /** - * - * @param begin - * @param end - * @param strand - * @return - */ - public String getSequenceAsString(Integer bioStart, Integer bioEnd, Strand strand) { - - Location loc = new SimpleLocation(bioStart, bioEnd, strand); - return loc.getSubSequence(this).getSequenceAsString(); - } - - /** - * Default case is to assume strand is positive because only CDSSequence can be either positive or negative Strand. - * @return - */ - @Override - public String getSequenceAsString() { - return SequenceMixin.toString(this); - - } - - /** - * - * @return - */ - @Override - public List getAsList() { - return SequenceMixin.toList(this); - } - - /** - * - * @param position The 1-indexed position of the amino acid - * @return - */ - @Override - public C getCompoundAt(int position) { - - return getSequenceStorage().getCompoundAt(position); - } - - /** - * - * @param compound - * @return The first index of compound in this sequence (1-based) - */ - @Override - public int getIndexOf(C compound) { - return getSequenceStorage().getIndexOf(compound); - } - - /** - * - * @param compound - * @return The last index of compound in this sequence (1-based) - */ - @Override - public int getLastIndexOf(C compound) { - return getSequenceStorage().getLastIndexOf(compound); - } - - /** - * - * @return - */ - @Override - public int getLength() { - return getSequenceStorage().getLength(); - } - - /** - * - * @param bioStart - * @param bioEnd - * @return - */ - @Override - public SequenceView getSubSequence(final Integer bioStart, final Integer bioEnd) { - return new SequenceProxyView(this, bioStart, bioEnd); - } - - /** - * - * @return - */ - @Override - public Iterator iterator() { - return getSequenceStorage().iterator(); - } - - /** - * - * @param compounds - * @return - */ - @Override - public int countCompounds(C... compounds) { - return SequenceMixin.countCompounds(this, compounds); - } - - /** - * - * @return - */ - @Override - public SequenceView getInverse() { - return SequenceMixin.inverse(this); - } - - //TODO needs equals and hashcode + return null; + } + + public void setCompoundSet(CompoundSet compoundSet) { + this.compoundSet = compoundSet; + } + + @Override + public String toString() { + return getSequenceAsString(); + } + + private SequenceReader getSequenceStorage() { + if (sequenceStorage != null) { + return sequenceStorage; + } + if (parentSequence != null) { + + //return parentSequence.getSequenceStorage(); + + if (this.compoundSet.equals(parentSequence.getCompoundSet())) { + sequenceStorage = new ArrayListSequenceReader(); + sequenceStorage.setCompoundSet(this.getCompoundSet()); + try { + sequenceStorage.setContents(parentSequence.getSequenceAsString()); + } catch (CompoundNotFoundException e) { + // TODO is there a better way to handle this exception? + logger.error("Problem setting contents from parent sequence, some unrecognised compound: {}", e.getMessage()); + } + return sequenceStorage; + } + + } + + return null; + } + + public String getSequenceAsString(Integer bioStart, Integer bioEnd, Strand strand) { + + Location loc = new SimpleLocation(bioStart, bioEnd, strand); + return loc.getSubSequence(this).getSequenceAsString(); + } + + /** + * Default case is to assume strand is positive because only CDSSequence can be either positive or negative Strand. + * + * @return + */ + @Override + public String getSequenceAsString() { + return SequenceMixin.toString(this); + + } + + @Override + public List getAsList() { + return SequenceMixin.toList(this); + } + + /** + * @param position The 1-indexed position of the amino acid + * @return + */ + @Override + public C getCompoundAt(int position) { + + return getSequenceStorage().getCompoundAt(position); + } + + /** + * @param compound + * @return The first index of compound in this sequence (1-based) + */ + @Override + public int getIndexOf(C compound) { + return getSequenceStorage().getIndexOf(compound); + } + + /** + * @param compound + * @return The last index of compound in this sequence (1-based) + */ + @Override + public int getLastIndexOf(C compound) { + return getSequenceStorage().getLastIndexOf(compound); + } + + @Override + public int getLength() { + return getSequenceStorage().getLength(); + } + + @Override + public SequenceView getSubSequence(final Integer bioStart, final Integer bioEnd) { + return new SequenceProxyView(this, bioStart, bioEnd); + } + + @Override + public Iterator iterator() { + return getSequenceStorage().iterator(); + } + + @Override + public int countCompounds(C... compounds) { + return SequenceMixin.countCompounds(this, compounds); + } + + @Override + public SequenceView getInverse() { + return SequenceMixin.inverse(this); + } + + + public List getPublicationReference() { + return publicationReference; + } + + public void setPublicationReference(List publicationReference) { + this.publicationReference = publicationReference; + } + + public List getKeywords() { + return keywords; + } + + public void setKeywords(List keywords) { + this.keywords = keywords; + } + + public String getOrganism() { + return organism; + } + + public void setOrganism(String organism) { + this.organism = organism; + } + + public String getComment() { + return comment; + } + + public void setComment(String comment) { + this.comment = comment; + } } diff --git a/biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenBankReferenceTest.java b/biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenBankReferenceTest.java new file mode 100644 index 0000000000..faaa1b3daf --- /dev/null +++ b/biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenBankReferenceTest.java @@ -0,0 +1,170 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +package org.biojava.nbio.core.sequence.io; + +import org.biojava.nbio.core.sequence.DNASequence; +import org.biojava.nbio.core.sequence.compound.DNACompoundSet; +import org.biojava.nbio.core.sequence.compound.NucleotideCompound; +import org.biojava.nbio.core.sequence.features.FeatureInterface; +import org.biojava.nbio.core.sequence.features.PublicationReference; +import org.biojava.nbio.core.sequence.features.PublicationReferenceAuthor; +import org.biojava.nbio.core.sequence.template.AbstractSequence; +import org.junit.*; + +import java.io.InputStream; +import java.util.LinkedHashMap; +import java.util.List; + +import static org.junit.Assert.assertEquals; + +public class GenBankReferenceTest { + + public GenBankReferenceTest() { + } + + @BeforeClass + public static void setUpClass() throws Exception { + } + + @AfterClass + public static void tearDownClass() throws Exception { + } + + @Before + public void setUp() { + } + + @After + public void tearDown() { + } + + + @Test + public void testDirectSubmission() throws Exception { + + InputStream inStream = this.getClass().getResourceAsStream("/SCU49845.gb"); + + GenbankReader GenbankDNA = + new GenbankReader( + inStream, + new GenericGenbankHeaderParser(), + new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()) + ); + + LinkedHashMap dnaSequences = GenbankDNA.process(); + DNASequence result = dnaSequences.get("U49845"); + List publicationReferences = result.getPublicationReference(); + Assert.assertEquals(2, publicationReferences.size()); + PublicationReference publicationReference1 = publicationReferences.get(0); + Assert.assertEquals("8846915", publicationReference1.getId()); + Assert.assertEquals("Genes Dev. 10 (7), 777-793 (1996)", publicationReference1.getJournal()); + Assert.assertEquals(PublicationReference.ReferenceType.PUBMED, publicationReference1.getReferenceType()); + Assert.assertEquals("Selection of axial growth sites in yeast requires Axl2p, a novelplasma membrane glycoprotein", publicationReference1.getTitle()); + + List authors = publicationReference1.getAuthors(); + Assert.assertEquals(4, authors.size()); + + PublicationReferenceAuthor author1 = authors.get(0); + Assert.assertEquals("T.", author1.getFirstName()); + Assert.assertEquals("Roemer", author1.getLastName()); + Assert.assertEquals("T. Roemer", author1.getFullName()); + + PublicationReferenceAuthor author2 = authors.get(1); + Assert.assertEquals("K.", author2.getFirstName()); + Assert.assertEquals("Madden", author2.getLastName()); + Assert.assertEquals("K. Madden", author2.getFullName()); + + PublicationReferenceAuthor author3 = authors.get(2); + Assert.assertEquals("J.", author3.getFirstName()); + Assert.assertEquals("Chang", author3.getLastName()); + Assert.assertEquals("J. Chang", author3.getFullName()); + + PublicationReferenceAuthor author4 = authors.get(3); + Assert.assertEquals("M.", author4.getFirstName()); + Assert.assertEquals("Snyder", author4.getLastName()); + Assert.assertEquals("M. Snyder", author4.getFullName()); + + PublicationReference publicationReference2 = publicationReferences.get(1); + //Direct submission have no id + Assert.assertEquals(null, publicationReference2.getId()); + Assert.assertEquals("Submitted (22-FEB-1996) Biology, Yale University, New Haven, CT06520, USA", publicationReference2.getJournal()); + Assert.assertEquals(PublicationReference.ReferenceType.DIRECT_SUBMISSION, publicationReference2.getReferenceType()); + Assert.assertEquals("Direct Submission", publicationReference2.getTitle()); + + inStream.close(); + } + + @Test + public void testPatent() throws Exception { + + InputStream inStream = this.getClass().getResourceAsStream("/E01172.gb"); + + GenbankReader GenbankDNA = + new GenbankReader( + inStream, + new GenericGenbankHeaderParser(), + new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()) + ); + + + LinkedHashMap dnaSequences = GenbankDNA.process(); + + DNASequence result = dnaSequences.get("E01172"); + List publicationReferences = result.getPublicationReference(); + Assert.assertEquals(1, publicationReferences.size()); + PublicationReference publicationReference1 = publicationReferences.get(0); + Assert.assertEquals(null, publicationReference1.getId()); + Assert.assertEquals("Patent: JP 1987099398-A 1 08-MAY-1987;F HOFFMANN LA ROCHE & CO AG", publicationReference1.getJournal()); + Assert.assertEquals(PublicationReference.ReferenceType.PATENT, publicationReference1.getReferenceType()); + Assert.assertEquals("NOVEL POLYPEPTIDE", publicationReference1.getTitle()); + List authors = publicationReference1.getAuthors(); + + Assert.assertEquals(4, authors.size()); + + inStream.close(); + } + + @Test + public void testMore() throws Exception { + + InputStream inStream = this.getClass().getResourceAsStream("/HE608876.gb"); + + GenbankReader GenbankDNA = + new GenbankReader( + inStream, + new GenericGenbankHeaderParser(), + new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()) + ); + + LinkedHashMap dnaSequences = GenbankDNA.process(); + + assertEquals(1, dnaSequences.size()); + DNASequence result = dnaSequences.values().iterator().next(); + + inStream.close(); + } + + @Test + public void testKeywords() throws Exception { + + InputStream inStream = this.getClass().getResourceAsStream("/NM_000266.gb"); + + GenbankReader GenbankDNA = + new GenbankReader( + inStream, + new GenericGenbankHeaderParser(), + new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()) + ); + + LinkedHashMap dnaSequences = GenbankDNA.process(); + + assertEquals(1, dnaSequences.size()); + DNASequence result = dnaSequences.values().iterator().next(); + List, NucleotideCompound>> features = result.getFeatures(); + + inStream.close(); + } + +} \ No newline at end of file diff --git a/biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenbankCookbookTest.java b/biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenbankCookbookTest.java index d6018a5eaf..f9df8ed6b1 100644 --- a/biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenbankCookbookTest.java +++ b/biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenbankCookbookTest.java @@ -64,6 +64,7 @@ public void tearDown() { /** * Test of process method, of class GenbankReader. */ + @Ignore @Test public void testProcess() throws Throwable { /* diff --git a/biojava-core/src/test/resources/E01172.gb b/biojava-core/src/test/resources/E01172.gb new file mode 100644 index 0000000000..907d17cc33 --- /dev/null +++ b/biojava-core/src/test/resources/E01172.gb @@ -0,0 +1,51 @@ +LOCUS E01172 246 bp DNA linear PAT 04-NOV-2005 +DEFINITION Synthetic DNA encoding a polypeptide eliciting antibody against + AIDS virus and/or reacting with AIDS-antibody. +ACCESSION E01172 +VERSION E01172.1 GI:2169431 +KEYWORDS JP 1987099398-A/1. +SOURCE synthetic construct + ORGANISM synthetic construct + other sequences; artificial sequences. +REFERENCE 1 (bases 1 to 246) + AUTHORS Buiruherumu,B., Ururitsuhi,S., Yan,M. and Deietoritsuhi,S. + TITLE NOVEL POLYPEPTIDE + JOURNAL Patent: JP 1987099398-A 1 08-MAY-1987; + F HOFFMANN LA ROCHE & CO AG +COMMENT OS Artificial gene + OC Artificial sequence; Genes. + OS AIDS virus + PN JP 1987099398-A/1 + PD 08-MAY-1987 + PF 16-OCT-1986 JP 1986246454 + PR 17-OCT-1985 GB 85 8525615 + PI BUIRUHERUMU BANBUARUSU, URURITSUHI SERUTA, YAN MOUSU, PI + DEIETORITSUHI SUTEYUUBAA + PC C07K7/10,A61K35/74,A61K37/02,A61K39/21,C07H21/04,C07K15/12, PC + C12N1/20, + PC C12N15/00,C12P21/02,G01N33/569//G01N33/577,(C12P21/02, PC + C12R1:19),(C12P21/02, + PC C12R1:125),C07K99:00; + CC strandedness: Double; + CC topology: Linear; + CC hypothetical: No; + CC anti-sense: No; + FH Key Location/Qualifiers + FH + FT CDS >1..<245 + FT /gene='ENV(80)' + FT /product='polypeptide eliciting antibody FT + against AIDS virus + FT and/or reacting with AIDS-antibody'. +FEATURES Location/Qualifiers + source 1..246 + /organism="synthetic construct" + /mol_type="unassigned DNA" + /db_xref="taxon:32630" +ORIGIN + 1 gatccgaagc tcaacagcat ctgctgcaac tcactgtttg gggtatcaaa cagctccagg + 61 ctcgaattct ggctgttgaa cgttacctga aagatcaaca gctcctgggt atctggggct + 121 gcagtggtaa actcatctgc actactgctg ttccttggaa tgcttcttgg tctaataagc + 181 ttctggaaca gatctggaat aacatgactt ggatggagtg ggaccgtgaa atcaacaatt + 241 acactg +// \ No newline at end of file diff --git a/biojava-core/src/test/resources/HE608876.gb b/biojava-core/src/test/resources/HE608876.gb new file mode 100644 index 0000000000..6d60661bb8 --- /dev/null +++ b/biojava-core/src/test/resources/HE608876.gb @@ -0,0 +1,92 @@ +LOCUS HE608876 2257 bp mRNA linear INV 07-NOV-2011 +DEFINITION Cancer pagurus mRNA for putative DEAD-box ATP-dependent RNA + helicase (ddx gene). +ACCESSION HE608876 +VERSION HE608876.1 GI:356483014 +KEYWORDS . +SOURCE Cancer pagurus (edible crab) + ORGANISM Cancer pagurus + Eukaryota; Metazoa; Ecdysozoa; Arthropoda; Crustacea; Malacostraca; + Eumalacostraca; Eucarida; Decapoda; Pleocyemata; Brachyura; + Eubrachyura; Cancroidea; Cancridae; Cancer. +REFERENCE 1 + AUTHORS Hoppes,J.L., Hauton,C. and Hawkins,L.E. + TITLE Immune gene expression of Cancer pagurus haemocytes exposed to the + dinoflagellate parasite Hematodinium sp. in vitro + JOURNAL Unpublished +REFERENCE 2 (bases 1 to 2257) + AUTHORS Hoppes,J. + TITLE Direct Submission + JOURNAL Submitted (25-OCT-2011) to the INSDC. University of Southampton, + School of Ocean and Earth Sciences, European Way, SO14 3ZH, UNITED + KINGDOM +REFERENCE 3 (bases 1 to 2257) + AUTHORS Hoppes,J. and Kleen,M. + TITLE Direct Submission + JOURNAL Submitted (25-OCT-2011) to the INSDC. University of Southampton, + School of Ocean and Earth Sciences, European Way, SO14 3ZH, UNITED + KINGDOM +FEATURES Location/Qualifiers + source 1..2257 + /organism="Cancer pagurus" + /mol_type="mRNA" + /db_xref="taxon:6755" + /tissue_type="haemocytes" + gene 1..2257 + /gene="ddx" + CDS 176..1819 + /gene="ddx" + /codon_start=1 + /product="putative DEAD-box ATP-dependent RNA helicase" + /protein_id="CCE46010.1" + /db_xref="GI:356483015" + /translation="MSYRSRRSRSRSRSRSRDRDRRRDRDDWGGSRGGWGSGGGGRPS + LKGRQPGERLRKPRWDLTKLTPFEKNFYQPTPTVIARSPYEVEKYRTDKEITLRGRNI + PNPIQYFGDYNFPDYVMAEIRRQGYEHPTPIQGQGWPISLQGRDFVGIAQTGSGKTLG + YILPAIVHINHQPYLERGDGPIALILAPTRELAQQILTVAQDYGTSSKIRPTCVFGGA + PKGPQIRDLERGVEICIATPGRLIDFLEAGKTNLRRTTYLVLDEADRMLDMGFEPQIR + KIVDQIRPDRQTLMWSATWPKEVRNLAEDFLKDYIQLNVGSLSLAANHNILQIVDVYQ + EIEKDTKLRQLLNEMAQEKANKTIIFIETKRKVEDVTRGLRSTGWPAMCIHGDKSQQE + RDWVLSEFRSGRAPILVATDVAARGLDVDDVKFVINYDYPSCSEDYVHRIGRTGRSDK + TGTAYTFFTADNCKQAKDLIEVLKEANQVVNPRLYEIMDMARGGGGKGRNRWRGRDDD + RRGGFGRDRDRDRGRMGGSSSGSGGGSRNGYSNGNGYAY" +ORIGIN + 1 aagcagtggt atcaacgcag agtacgcggg ggacgacggt gagactgaac ggtggcagaa + 61 gctcccgact cacctctatt tcaacctgaa ttacacccta aattcataaa ggaaaagcat + 121 caactattta tttattcaaa caacgtgaac atatagtgta aaacaaacca acacgatgtc + 181 gtacagaagt cgcaggtctc ggtcccgttc tcgttctcgt tcccgtgacc gcgaccgtcg + 241 gcgtgatagg gatgattggg gcggctctcg aggaggctgg ggttctggtg gaggtggtcg + 301 gccatcactt aaagggagac agcctggtga gcgcctgcgt aaaccgagat gggacttgac + 361 caagctcacc ccctttgaga agaacttcta ccagcccact ccaactgtca ttgctcgttc + 421 accttatgag gtggaaaagt acagaactga caaagagatc actctacgag gaagaaatat + 481 ccccaacccg attcagtatt ttggtgacta caactttcct gattacgtca tggctgagat + 541 ccgaagacag ggatacgagc accccactcc aattcagggt cagggatggc ccatctctct + 601 ccaaggaagg gacttcgttg gcattgcaca gactggctca ggaaaaacat tgggttacat + 661 tctgcctgcc attgtacaca ttaaccatca gccatatcta gagcgtggag atggccccat + 721 tgccctgata ctagctccta ctagggaatt agcacagcag atcttgacag ttgcacaaga + 781 ctatggtacc tcatccaaga tccgacccac ctgcgtgttt ggaggtgcac caaagggacc + 841 acaaattcgt gacctcgaga gaggagttga gatctgcatt gctactccag gtcgacttat + 901 tgacttcctg gaagctggca agacaaacct tcggcgtact acgtacctgg ttttggatga + 961 agctgaccgt atgcttgata tgggttttga accacaaatt aggaaaattg tagatcagat + 1021 taggcctgac agacaaacac ttatgtggtc tgcaacatgg cccaaggagg tgaggaatct + 1081 cgctgaagac tttctcaagg actacatcca actgaatgtt ggctcccttt cccttgctgc + 1141 aaaccacaac attctccaga ttgtggatgt ataccaggaa atagaaaagg atacaaagct + 1201 gcgtcagttg ctcaatgaga tggcgcagga gaaggctaac aagactataa tctttatcga + 1261 aacgaagagg aaggtggagg atgttactcg ggggttgagg agtactggat ggcctgctat + 1321 gtgtatccat ggtgataaat cacaacaaga gcgagactgg gtcttaagtg aattccgatc + 1381 agggcgggcc ccaatcctag tggccactga tgtagctgct cgcggcttag atgtggatga + 1441 tgtgaagttt gtcatcaact acgactaccc atcatgctct gaggactatg ttcatcggat + 1501 tggtcgaact ggccgatcag acaagactgg aacagcctac acatttttca ctgctgacaa + 1561 ctgcaagcaa gccaaagatc tgattgaagt gttgaaagaa gcaaatcaag ttgttaatcc + 1621 acgactctac gagataatgg atatggctcg tggtggtgga ggcaaaggcc gtaatcgctg + 1681 gaggggccgt gatgacgaca ggcggggagg atttggacgg gaccgtgacc gtgatcgtgg + 1741 ccgcatgggt ggtagtagta gtggtagtgg tggtggttca aggaacggct acagcaatgg + 1801 aaatggatat gcttactgaa gcccgggccg gaggtgaccg cttaccgctg caccatcccc + 1861 tttctaggag attagaggtt gcgcacgtta ccactaagta cttgacaggc ctggttgtca + 1921 agggtaggga aaagttcggc tcaccccatc ataagtgtgc ggatacctgc gccattcctc + 1981 cttgtctacg ggactgctgt ctgcttaagc caacgacgtc attgttagta aacctcactg + 2041 tccaaaagcc agtcaagtca gcatgtcttt tttttcttta ttttttttgc ccccaaacca + 2101 aatattccat tgtctgtagg aaggaattct ttctagtttt tttgttacac tgtatagaca + 2161 ttatttttca atcattttga aggattttgt gtatggacca ataaactttg taaccctctg + 2221 aataaaataa cattgagtaa aaaaaaaaaa aaaaaaa +// \ No newline at end of file diff --git a/biojava-core/src/test/resources/NM_000266.gb b/biojava-core/src/test/resources/NM_000266.gb index a94c71f9f5..399c676cc2 100644 --- a/biojava-core/src/test/resources/NM_000266.gb +++ b/biojava-core/src/test/resources/NM_000266.gb @@ -120,7 +120,7 @@ FEATURES Location/Qualifiers /gene="NDP" /gene_synonym="EVR2; FEVR; ND" /note="Norrie disease (pseudoglioma)" - /db_xref="GeneID:4693" + /db_xref=" :4693" /db_xref="HGNC:7678" /db_xref="HPRD:02404" /db_xref="MIM:300658" diff --git a/biojava-core/src/test/resources/SCU49845.gb b/biojava-core/src/test/resources/SCU49845.gb new file mode 100644 index 0000000000..5f4d3a5de7 --- /dev/null +++ b/biojava-core/src/test/resources/SCU49845.gb @@ -0,0 +1,166 @@ +LOCUS SCU49845 5028 bp DNA linear PLN 23-MAR-2010 +DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds; and Axl2p + (AXL2) and Rev7p (REV7) genes, complete cds. +ACCESSION U49845 +VERSION U49845.1 GI:1293613 +KEYWORDS . +SOURCE Saccharomyces cerevisiae (baker's yeast) + ORGANISM Saccharomyces cerevisiae + Eukaryota; Fungi; Dikarya; Ascomycota; Saccharomycotina; + Saccharomycetes; Saccharomycetales; Saccharomycetaceae; + Saccharomyces. +REFERENCE 1 (bases 1 to 5028) + AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. + TITLE Selection of axial growth sites in yeast requires Axl2p, a novel + plasma membrane glycoprotein + JOURNAL Genes Dev. 10 (7), 777-793 (1996) + PUBMED 8846915 +REFERENCE 2 (bases 1 to 5028) + AUTHORS Roemer,T. + TITLE Direct Submission + JOURNAL Submitted (22-FEB-1996) Biology, Yale University, New Haven, CT + 06520, USA +FEATURES Location/Qualifiers + source 1..5028 + /organism="Saccharomyces cerevisiae" + /mol_type="genomic DNA" + /db_xref="taxon:4932" + /chromosome="IX" + mRNA <1..>206 + /product="TCP1-beta" + CDS <1..206 + /codon_start=3 + /product="TCP1-beta" + /protein_id="AAA98665.1" + /db_xref="GI:1293614" + /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA + AEVLLRVDNIIRARPRTANRQHM" + gene <687..>3158 + /gene="AXL2" + mRNA <687..>3158 + /gene="AXL2" + /product="Axl2p" + CDS 687..3158 + /gene="AXL2" + /note="plasma membrane glycoprotein" + /codon_start=1 + /product="Axl2p" + /protein_id="AAA98666.1" + /db_xref="GI:1293615" + /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF + TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN + VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE + VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE + TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV + YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG + DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ + DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA + NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA + CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN + NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ + SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS + YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK + HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL + VDFSNKSNVNVGQVKDIHGRIPEML" + gene complement(<3300..>4037) + /gene="REV7" + mRNA complement(<3300..>4037) + /gene="REV7" + /product="Rev7p" + CDS complement(3300..4037) + /gene="REV7" + /codon_start=1 + /product="Rev7p" + /protein_id="AAA98667.1" + /db_xref="GI:1293616" + /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ + FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD + KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR + RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK + LISGDDKILNGVYSQYEEGESIFGSLF" +ORIGIN + 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg + 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct + 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa + 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg + 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa + 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa + 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat + 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga + 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc + 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga + 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta + 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag + 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa + 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata + 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga + 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac + 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg + 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc + 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa + 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca + 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac + 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa + 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag + 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct + 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac + 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa + 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc + 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata + 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca + 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc + 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc + 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca + 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc + 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg + 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt + 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc + 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg + 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca + 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata + 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg + 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga + 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt + 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat + 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt + 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc + 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag + 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta + 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa + 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact + 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt + 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa + 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag + 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct + 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt + 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact + 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa + 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg + 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt + 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc + 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca + 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc + 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc + 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat + 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa + 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga + 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat + 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc + 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc + 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa + 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg + 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc + 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt + 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg + 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg + 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt + 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt + 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat + 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc + 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct + 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta + 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac + 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct + 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct + 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc +// \ No newline at end of file