X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2FFastaFile.java;h=adfa96711bdde05cf30160b74933c1b84f8a11e5;hb=ba0711d9cab4854f27589fc58ef3f0fb4cba3908;hp=4c1ba9ee3ec25e575de461095c6066f969bce4e9;hpb=6cbe1876d4a5fdd7d5b73e11bf2468fe4e75ce99;p=jalview.git diff --git a/src/jalview/io/FastaFile.java b/src/jalview/io/FastaFile.java index 4c1ba9e..adfa967 100755 --- a/src/jalview/io/FastaFile.java +++ b/src/jalview/io/FastaFile.java @@ -1,148 +1,186 @@ +/* +* Jalview - A Sequence Alignment Editor and Viewer +* Copyright (C) 2005 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 +* of the License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +*/ package jalview.io; -import jalview.datamodel.*; import jalview.analysis.*; +import jalview.datamodel.*; + import java.io.*; + import java.util.*; + public class FastaFile extends AlignFile { + public FastaFile() { + } - public FastaFile() - {} + public FastaFile(String inStr) { + super(inStr); + } - public FastaFile(String inStr) { - super(inStr); - } + public FastaFile(String inFile, String type) throws IOException { + super(inFile, type); + } - public FastaFile(String inFile, String type) throws IOException { - super(inFile,type); - } + public void parse() throws IOException { + String id = ""; + StringBuffer seq = new StringBuffer(); + int count = 0; + boolean flag = false; + + int sstart = 0; + int send = 0; + + String line; + + while ((line = nextLine()) != null) { + if (line.length() > 0) { + // Do we have an id line? + // JBPNote - this code needs to be standardised to EBI/whatever for the + // >dbref/dbref/dbref|refid1|refid2|refid3 'human-readable' style of naming (should it really exist) + + if (line.substring(0, 1).equals(">")) { + if (count != 0) { + if (sstart != 0) { + seqs.addElement(new Sequence(id, + seq.toString().toUpperCase(), sstart, send)); + } else { + seqs.addElement(new Sequence(id, + seq.toString().toUpperCase(), 1, + seq.length())); + } + } + + count++; + + StringTokenizer str = new StringTokenizer(line, " "); + + id = str.nextToken(); + id = id.substring(1); + + com.stevesoft.pat.Regex dbId = new com.stevesoft.pat.Regex( + "[A-Za-z-]+/?[A-Za-z-]+\\|(\\w+)\\|(.+)"); + // JBPNote At the moment - we don't get rid of the friendly names but this + // behaviour is probably wrong in the long run. + if (dbId.search(id)) { + String dbid = dbId.stringMatched(1); + String idname = dbId.stringMatched(2); + if ( (idname.length() > 0) && + (idname.indexOf("_") > -1)) { + id = idname; // use the friendly name - apparently no dbid + } else + if (dbid.length()>1) { + id = dbid; // ignore the friendly name - we lose uniprot accession ID otherwise + } + } + + if (id.indexOf("/") > 0) { + StringTokenizer st = new StringTokenizer(id, "/"); + + if (st.countTokens() == 2) { + id = st.nextToken(); + + String tmp = st.nextToken(); + + st = new StringTokenizer(tmp, "-"); + + if (st.countTokens() == 2) { + sstart = Integer.valueOf(st.nextToken()) + .intValue(); + send = Integer.valueOf(st.nextToken()).intValue(); + } + } + } + + seq = new StringBuffer(); + } else { + seq = seq.append(line); + } + } + } - public void parse() throws IOException - { + if (count > 0) { + if (!isValidProteinSequence(seq.toString().toUpperCase())) { + throw new IOException("Invalid protein sequence"); + } - String id = ""; - StringBuffer seq = new StringBuffer(); - int count = 0; - boolean flag = false; + if (sstart != 0) { + seqs.addElement(new Sequence(id, seq.toString().toUpperCase(), + sstart, send)); + } else { + seqs.addElement(new Sequence(id, seq.toString().toUpperCase(), + 1, seq.length())); + } + } + } - int sstart = 0; - int send = 0; + public static String print(SequenceI[] s) { + return print(s, 72); + } - String line; + public static String print(SequenceI[] s, int len) { + return print(s, len, true); + } - while ((line = nextLine()) != null) { + public static String print(SequenceI[] s, int len, boolean gaps) { + return print(s, len, gaps, true); + } - if (line.length() > 0) { + public static String print(SequenceI[] s, int len, boolean gaps, + boolean displayId) { + StringBuffer out = new StringBuffer(); + int i = 0; - // Do we have an id line? + while ((i < s.length) && (s[i] != null)) { + String seq = ""; - if (line.substring(0,1).equals(">")) { + if (gaps) { + seq = s[i].getSequence(); + } else { + seq = AlignSeq.extractGaps("-. ", s[i].getSequence()); + } - if (count != 0) { - if (sstart != 0) { - seqs.addElement(new Sequence(id,seq.toString().toUpperCase(),sstart,send)); - } else { - seqs.addElement(new Sequence(id,seq.toString().toUpperCase(),1,seq.length())); - } - } + // used to always put this here: + "/" + s[i].getStart() + "-" + s[i].getEnd() + + out.append(">" + + ((displayId) ? s[i].getDisplayId() : s[i].getName()) + "\n"); - count++; + int nochunks = (seq.length() / len) + 1; - StringTokenizer str = new StringTokenizer(line," "); + for (int j = 0; j < nochunks; j++) { + int start = j * len; + int end = start + len; - id = str.nextToken(); - id = id.substring(1); - if(id.indexOf("UniProt/Swiss-Prot")>-1) - { - id = id.substring(id.indexOf("UniProt/Swiss-Prot|") + 19); - if(id.indexOf("|")>-1) - id = id.substring(id.indexOf("|") + 1); + if (end < seq.length()) { + out.append(seq.substring(start, end) + "\n"); + } else if (start < seq.length()) { + out.append(seq.substring(start) + "\n"); + } } - if (id.indexOf("/") > 0 ) { - - StringTokenizer st = new StringTokenizer(id,"/"); - if (st.countTokens() == 2) { - id = st.nextToken(); - String tmp = st.nextToken(); - - st = new StringTokenizer(tmp,"-"); - - if (st.countTokens() == 2) { - sstart = Integer.valueOf(st.nextToken()).intValue(); - send = Integer.valueOf(st.nextToken()).intValue(); - } - } - } - - seq = new StringBuffer(); - - } else { - seq = seq.append(line); - } - } - } - if (count > 0) { - - if(!isValidProteinSequence(seq.toString().toUpperCase())) - throw new IOException("Invalid protein sequence"); - - if (sstart != 0) { - seqs.addElement(new Sequence(id,seq.toString().toUpperCase(),sstart,send)); - } else { - seqs.addElement(new Sequence(id,seq.toString().toUpperCase(),1,seq.length())); - } - } - - } - - public static String print(SequenceI[] s) { - return print(s,72); - } - public static String print(SequenceI[] s, int len) { - return print(s,len,true); - } - - public static String print(SequenceI[] s, int len,boolean gaps) { - return print(s,len,gaps,true); - } - - public static String print(SequenceI[] s, int len,boolean gaps, boolean displayId) { - StringBuffer out = new StringBuffer(); - int i = 0; - while (i < s.length && s[i] != null) { - String seq = ""; - if (gaps) { - seq = s[i].getSequence(); - } else { - seq = AlignSeq.extractGaps("-. ",s[i].getSequence()); - } - // used to always put this here: + "/" + s[i].getStart() + "-" + s[i].getEnd() + - out.append(">" + ((displayId) ? s[i].getDisplayId() : s[i].getName())+"\n"); - - int nochunks = seq.length() / len + 1; - - for (int j = 0; j < nochunks; j++) { - int start = j*len; - int end = start + len; - - if (end < seq.length()) { - out.append(seq.substring(start,end) + "\n"); - } else if (start < seq.length()) { - out.append(seq.substring(start) + "\n"); + i++; } - } - i++; + + return out.toString(); } - return out.toString(); - } - public String print() { - return print(getSeqsAsArray()); - } + public String print() { + return print(getSeqsAsArray()); + } } - - -