From: Jim Procter Date: Fri, 30 Oct 2015 12:27:39 +0000 (+0000) Subject: JAL-1954 rejigged PFAMFile to split on ‘\t’ if no space present and optimised for... X-Git-Tag: Release_2_10_0~343^2^2 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=abad83613306b9afe956746610e3d62b47c4c0fd;p=jalview.git JAL-1954 rejigged PFAMFile to split on ‘\t’ if no space present and optimised for long line searches. Will not cope with mixed space/tab separators. --- diff --git a/src/jalview/io/PfamFile.java b/src/jalview/io/PfamFile.java index 07ba3f5..71cc7f0 100755 --- a/src/jalview/io/PfamFile.java +++ b/src/jalview/io/PfamFile.java @@ -26,9 +26,8 @@ import jalview.util.Format; import jalview.util.MessageManager; import java.io.IOException; -import java.util.Hashtable; -import java.util.StringTokenizer; -import java.util.Vector; +import java.util.ArrayList; +import java.util.HashMap; public class PfamFile extends AlignFile { @@ -47,57 +46,70 @@ public class PfamFile extends AlignFile super(source); } + @Override public void initData() { super.initData(); } + @Override public void parse() throws IOException { int i = 0; String line; - Hashtable seqhash = new Hashtable(); - Vector headers = new Vector(); - + HashMap<String, StringBuffer> seqhash = new HashMap<String, StringBuffer>(); + ArrayList<String> headers = new ArrayList<String>(); + boolean useTabs = false; + int spces; while ((line = nextLine()) != null) { - if (line.indexOf(" ") != 0) + if (line.indexOf("#") == 0) + { + // skip comment lines + continue; + } + // locate first space or (if already checked), tab + if (useTabs) + { + spces = line.indexOf("\t"); + } + else { - if (line.indexOf("#") != 0) + spces = line.indexOf(" "); + // check to see if we ought to split on tabs instead. + if (!useTabs && spces == -1) { - // TODO: verify pfam format requires spaces and not tab characters - - // if not upgrade to use stevesoft regex and look for whitespace. 
- StringTokenizer str = new StringTokenizer(line, " "); - String id = ""; - - if (str.hasMoreTokens()) - { - id = str.nextToken(); - - StringBuffer tempseq; - - if (seqhash.containsKey(id)) - { - tempseq = (StringBuffer) seqhash.get(id); - } - else - { - tempseq = new StringBuffer(); - seqhash.put(id, tempseq); - } - - if (!(headers.contains(id))) - { - headers.addElement(id); - } - if (str.hasMoreTokens()) - { - tempseq.append(str.nextToken()); - } - } + useTabs = true; + spces = line.indexOf("\t"); } } + if (spces <= 0) + { + // no sequence data to split on + continue; + } + String id = line.substring(0, spces); + StringBuffer tempseq; + + if (seqhash.containsKey(id)) + { + tempseq = seqhash.get(id); + } + else + { + tempseq = new StringBuffer(); + seqhash.put(id, tempseq); + } + + if (!(headers.contains(id))) + { + headers.add(id); + } + if (spces + 1 < line.length()) + { + tempseq.append(line.substring(spces + 1)); + } } this.noSeqs = headers.size(); @@ -110,23 +122,23 @@ public class PfamFile extends AlignFile for (i = 0; i < headers.size(); i++) { - if (seqhash.get(headers.elementAt(i)) != null) + if (seqhash.get(headers.get(i)) != null) { - if (maxLength < seqhash.get(headers.elementAt(i)).toString() + if (maxLength < seqhash.get(headers.get(i)).toString() .length()) { - maxLength = seqhash.get(headers.elementAt(i)).toString().length(); + maxLength = seqhash.get(headers.get(i)).toString().length(); } - Sequence newSeq = parseId(headers.elementAt(i).toString()); - newSeq.setSequence(seqhash.get(headers.elementAt(i).toString()) + Sequence newSeq = parseId(headers.get(i).toString()); + newSeq.setSequence(seqhash.get(headers.get(i).toString()) .toString()); seqs.addElement(newSeq); } else { System.err.println("PFAM File reader: Can't find sequence for " - + headers.elementAt(i)); + + headers.get(i)); } } } @@ -178,6 +190,7 @@ public class PfamFile extends AlignFile return out.toString(); } + @Override public String print() { return print(getSeqsAsArray());