(no commit message)
[jalview.git] / forester / java / src / org / forester / io / parsers / FastaParser.java
index 4c6845c..dc4fc46 100644 (file)
@@ -6,7 +6,7 @@
 // Copyright (C) 2010 Christian M Zmasek
 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
 // All rights reserved
-// 
+//
 // This library is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
 // License as published by the Free Software Foundation; either
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 // Lesser General Public License for more details.
-// 
+//
 // You should have received a copy of the GNU Lesser General Public
 // License along with this library; if not, write to the Free Software
 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 //
 // Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org/forester
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
 
 package org.forester.io.parsers;
 
 import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -39,13 +41,8 @@ import java.util.regex.Pattern;
 import org.forester.msa.BasicMsa;
 import org.forester.msa.Msa;
 import org.forester.msa.MsaFormatException;
-import org.forester.phylogeny.Phylogeny;
-import org.forester.phylogeny.PhylogenyNode;
-import org.forester.phylogeny.data.Accession;
-import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
 import org.forester.sequence.BasicSequence;
-import org.forester.sequence.Sequence;
-import org.forester.util.ForesterUtil;
+import org.forester.sequence.MolecularSequence;
 
 public class FastaParser {
 
@@ -53,8 +50,8 @@ public class FastaParser {
     private static final Pattern SEQ_REGEX       = Pattern.compile( "^\\s*(.+)" );
     private static final Pattern ANYTHING_REGEX  = Pattern.compile( "[\\d\\s]+" );
     //>gi|71834668|ref|NP_001025424.1| Bcl2 [Danio rerio]
-    private static final Pattern FASTA_DESC_LINE = Pattern
-                                                         .compile( ">?\\s*([^|]+)\\|([^|]+)\\S*\\s+(.+)\\s+\\[(.+)\\]" );
+    public static final Pattern  FASTA_DESC_LINE = Pattern
+            .compile( ">?\\s*([^|]+)\\|([^|]+)\\S*\\s+(.+)\\s+\\[(.+)\\]" );
 
     public static void main( final String[] args ) {
         final String a = ">gi|71834668|ref|NP_001025424.1| Bcl2 [Danio rerio]";
@@ -71,6 +68,25 @@ public class FastaParser {
         }
     }
 
+    /**
+     * Reports whether the given file looks like FASTA, as judged by
+     * {@link #isLikelyFasta(InputStream)} on the file's contents.
+     *
+     * @param f the file to inspect
+     * @return the verdict of the stream-based check
+     * @throws IOException if the file cannot be opened or read
+     */
+    static public boolean isLikelyFasta( final File f ) throws IOException {
+        final InputStream is = new FileInputStream( f );
+        try {
+            return isLikelyFasta( is );
+        }
+        finally {
+            // The stream is opened here, so release it here even when the check throws.
+            is.close();
+        }
+    }
+
     static public boolean isLikelyFasta( final InputStream is ) throws IOException {
         final BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) );
         String line = null;
@@ -92,6 +93,24 @@ public class FastaParser {
         return false;
     }
 
+    /**
+     * Parses the given file into an Msa via {@link #parseMsa(InputStream)}.
+     *
+     * @param f the file to read
+     * @return the Msa built from the sequences parsed out of the file
+     * @throws IOException if the file cannot be opened or read
+     */
+    static public Msa parseMsa( final File f ) throws IOException {
+        final InputStream is = new FileInputStream( f );
+        try {
+            return parseMsa( is );
+        }
+        finally {
+            // This overload opened the stream, so it releases it on every path, including exceptions.
+            is.close();
+        }
+    }
+
     static public Msa parseMsa( final InputStream is ) throws IOException {
         return BasicMsa.createInstance( parse( is ) );
     }
@@ -104,7 +109,25 @@ public class FastaParser {
         return parseMsa( new ByteArrayInputStream( bytes ) );
     }
 
-    static public List<Sequence> parse( final InputStream is ) throws IOException {
+    /**
+     * Parses the given file into a list of sequences via {@link #parse(InputStream)}.
+     *
+     * @param f the file to read
+     * @return the sequences read from the file
+     * @throws IOException if the file cannot be opened or read
+     */
+    static public List<MolecularSequence> parse( final File f ) throws IOException {
+        final InputStream is = new FileInputStream( f );
+        try {
+            return parse( is );
+        }
+        finally {
+            // This overload opened the stream, so it releases it on every path, including exceptions.
+            is.close();
+        }
+    }
+
+    static public List<MolecularSequence> parse( final InputStream is ) throws IOException {
         final BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) );
         String line = null;
         int line_counter = 0;
@@ -142,10 +151,10 @@ public class FastaParser {
         }
         addSeq( name, current_seq, temp_msa );
         reader.close();
-        final List<Sequence> seqs = new ArrayList<Sequence>();
+        final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
         for( int i = 0; i < temp_msa.size(); ++i ) {
-            seqs.add( BasicSequence.createAaSequence( temp_msa.get( i )[ 0 ].toString(), temp_msa.get( i )[ 1 ]
-                    .toString() ) );
+            seqs.add( BasicSequence.createAaSequence( temp_msa.get( i )[ 0 ].toString(),
+                                                      temp_msa.get( i )[ 1 ].toString() ) );
         }
         return seqs;
     }
@@ -175,36 +184,4 @@ public class FastaParser {
         }
         return line;
     }
-
-    public static void extractFastaInformation( final Phylogeny phy ) {
-        for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
-            final PhylogenyNode node = iter.next();
-            if ( !ForesterUtil.isEmpty( node.getName() ) ) {
-                final Matcher name_m = FASTA_DESC_LINE.matcher( node.getName() );
-                if ( name_m.lookingAt() ) {
-                    System.out.println();
-                    // System.out.println( name_m.group( 1 ) );
-                    // System.out.println( name_m.group( 2 ) );
-                    // System.out.println( name_m.group( 3 ) );
-                    // System.out.println( name_m.group( 4 ) );
-                    final String acc_source = name_m.group( 1 );
-                    final String acc = name_m.group( 2 );
-                    final String seq_name = name_m.group( 3 );
-                    final String tax_sn = name_m.group( 4 );
-                    if ( !ForesterUtil.isEmpty( acc_source ) && !ForesterUtil.isEmpty( acc ) ) {
-                        ForesterUtil.ensurePresenceOfSequence( node );
-                        node.getNodeData().getSequence( 0 ).setAccession( new Accession( acc, acc_source ) );
-                    }
-                    if ( !ForesterUtil.isEmpty( seq_name ) ) {
-                        ForesterUtil.ensurePresenceOfSequence( node );
-                        node.getNodeData().getSequence( 0 ).setName( seq_name );
-                    }
-                    if ( !ForesterUtil.isEmpty( tax_sn ) ) {
-                        ForesterUtil.ensurePresenceOfTaxonomy( node );
-                        node.getNodeData().getTaxonomy( 0 ).setScientificName( tax_sn );
-                    }
-                }
-            }
-        }
-    }
 }