// Copyright (C) 2010 Christian M Zmasek
// Copyright (C) 2010 Sanford-Burnham Medical Research Institute
// All rights reserved
-//
+//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
-//
+//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org/forester
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
package org.forester.io.parsers;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.forester.msa.BasicMsa;
import org.forester.msa.Msa;
import org.forester.msa.MsaFormatException;
-import org.forester.phylogeny.Phylogeny;
-import org.forester.phylogeny.PhylogenyNode;
-import org.forester.phylogeny.data.Accession;
-import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.sequence.BasicSequence;
import org.forester.sequence.Sequence;
-import org.forester.util.ForesterUtil;
public class FastaParser {
private static final Pattern SEQ_REGEX = Pattern.compile( "^\\s*(.+)" );
private static final Pattern ANYTHING_REGEX = Pattern.compile( "[\\d\\s]+" );
//>gi|71834668|ref|NP_001025424.1| Bcl2 [Danio rerio]
- private static final Pattern FASTA_DESC_LINE = Pattern
+ public static final Pattern FASTA_DESC_LINE = Pattern
.compile( ">?\\s*([^|]+)\\|([^|]+)\\S*\\s+(.+)\\s+\\[(.+)\\]" );
public static void main( final String[] args ) {
}
}
+    /**
+     * Checks whether the given file looks like FASTA data by delegating to
+     * {@link #isLikelyFasta(InputStream)}.
+     *
+     * @param f the file to inspect
+     * @return the result of the stream-based check
+     * @throws IOException if the file cannot be opened or read
+     */
+    static public boolean isLikelyFasta( final File f ) throws IOException {
+        final InputStream is = new FileInputStream( f );
+        try {
+            return isLikelyFasta( is );
+        }
+        finally {
+            // This method opened the stream, so it releases it here whether
+            // the delegate returned normally or threw.
+            is.close();
+        }
+    }
+
static public boolean isLikelyFasta( final InputStream is ) throws IOException {
final BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) );
String line = null;
return false;
}
+    /**
+     * Reads the given file and builds an {@link Msa} from it by delegating to
+     * {@link #parseMsa(InputStream)}.
+     *
+     * @param f the file to read
+     * @return the Msa produced by the stream-based overload
+     * @throws IOException if the file cannot be opened or read
+     */
+    static public Msa parseMsa( final File f ) throws IOException {
+        final InputStream is = new FileInputStream( f );
+        try {
+            return parseMsa( is );
+        }
+        finally {
+            // Guarantees the file handle is released even when parsing fails;
+            // closing an already-closed stream has no effect.
+            is.close();
+        }
+    }
+
static public Msa parseMsa( final InputStream is ) throws IOException {
return BasicMsa.createInstance( parse( is ) );
}
return parseMsa( new ByteArrayInputStream( bytes ) );
}
+    /**
+     * Reads the given file and returns its sequences by delegating to
+     * {@link #parse(InputStream)}.
+     *
+     * @param f the file to read
+     * @return the sequences produced by the stream-based overload
+     * @throws IOException if the file cannot be opened or read
+     */
+    static public List<Sequence> parse( final File f ) throws IOException {
+        final InputStream is = new FileInputStream( f );
+        try {
+            return parse( is );
+        }
+        finally {
+            // The delegate closes its reader only on the normal path; this
+            // also covers the case where parsing throws part-way through.
+            is.close();
+        }
+    }
+
static public List<Sequence> parse( final InputStream is ) throws IOException {
final BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) );
String line = null;
reader.close();
final List<Sequence> seqs = new ArrayList<Sequence>();
for( int i = 0; i < temp_msa.size(); ++i ) {
- seqs.add( BasicSequence.createAaSequence( temp_msa.get( i )[ 0 ].toString(), temp_msa.get( i )[ 1 ]
- .toString() ) );
+ seqs.add( BasicSequence.createAaSequence( temp_msa.get( i )[ 0 ].toString(),
+ temp_msa.get( i )[ 1 ].toString() ) );
}
return seqs;
}
}
return line;
}
-
- public static void extractFastaInformation( final Phylogeny phy ) {
- for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
- final PhylogenyNode node = iter.next();
- if ( !ForesterUtil.isEmpty( node.getName() ) ) {
- final Matcher name_m = FASTA_DESC_LINE.matcher( node.getName() );
- if ( name_m.lookingAt() ) {
- System.out.println();
- // System.out.println( name_m.group( 1 ) );
- // System.out.println( name_m.group( 2 ) );
- // System.out.println( name_m.group( 3 ) );
- // System.out.println( name_m.group( 4 ) );
- final String acc_source = name_m.group( 1 );
- final String acc = name_m.group( 2 );
- final String seq_name = name_m.group( 3 );
- final String tax_sn = name_m.group( 4 );
- if ( !ForesterUtil.isEmpty( acc_source ) && !ForesterUtil.isEmpty( acc ) ) {
- ForesterUtil.ensurePresenceOfSequence( node );
- node.getNodeData().getSequence( 0 ).setAccession( new Accession( acc, acc_source ) );
- }
- if ( !ForesterUtil.isEmpty( seq_name ) ) {
- ForesterUtil.ensurePresenceOfSequence( node );
- node.getNodeData().getSequence( 0 ).setName( seq_name );
- }
- if ( !ForesterUtil.isEmpty( tax_sn ) ) {
- ForesterUtil.ensurePresenceOfTaxonomy( node );
- node.getNodeData().getTaxonomy( 0 ).setScientificName( tax_sn );
- }
- }
- }
- }
- }
}