3 // forester -- software libraries and applications
4 // for genomics and evolutionary biology research.
6 // Copyright (C) 2010 Christian M Zmasek
7 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 // Contact: phylosoft @ gmail . com
25 // WWW: www.phylosoft.org/forester
27 package org.forester.io.parsers;
29 import java.io.BufferedReader;
30 import java.io.ByteArrayInputStream;
32 import java.io.FileInputStream;
33 import java.io.IOException;
34 import java.io.InputStream;
35 import java.io.InputStreamReader;
36 import java.util.ArrayList;
37 import java.util.List;
38 import java.util.regex.Matcher;
39 import java.util.regex.Pattern;
41 import org.forester.archaeopteryx.Util;
42 import org.forester.msa.BasicMsa;
43 import org.forester.msa.Msa;
44 import org.forester.msa.MsaFormatException;
45 import org.forester.phylogeny.Phylogeny;
46 import org.forester.phylogeny.PhylogenyNode;
47 import org.forester.phylogeny.data.Accession;
48 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
49 import org.forester.sequence.BasicSequence;
50 import org.forester.sequence.Sequence;
51 import org.forester.util.ForesterUtil;
53 public class FastaParser {
// Header ("name") line: optional leading whitespace, '>', optional whitespace, then the
// rest of the line (at least one character) captured as group 1.
55 private static final Pattern NAME_REGEX = Pattern.compile( "^\\s*>\\s*(.+)" );
// Sequence line: optional leading whitespace followed by at least one character,
// captured as group 1.
56 private static final Pattern SEQ_REGEX = Pattern.compile( "^\\s*(.+)" );
// NOTE(review): despite its name this matches only runs of digits and whitespace;
// canIgnore applies it with matches(), i.e. to the whole line.
57 private static final Pattern ANYTHING_REGEX = Pattern.compile( "[\\d\\s]+" );
// Example of a description line this pattern is meant for:
58 //>gi|71834668|ref|NP_001025424.1| Bcl2 [Danio rerio]
// Groups as consumed by extractFastaInformation: 1 = accession source, 2 = accession,
// 3 = sequence name, 4 = text inside the square brackets (used as scientific name).
// The "\\S*" after group 2 skips any further '|'-separated fields up to the first whitespace.
59 private static final Pattern FASTA_DESC_LINE = Pattern
60 .compile( ">?\\s*([^|]+)\\|([^|]+)\\S*\\s+(.+)\\s+\\[(.+)\\]" );
62 public static void main( final String[] args ) {
63 final String a = ">gi|71834668|ref|NP_001025424.1| Bcl2 [Danio rerio]";
64 final Matcher name_m = FASTA_DESC_LINE.matcher( a );
65 if ( name_m.lookingAt() ) {
67 System.out.println( name_m.group( 1 ) );
68 System.out.println( name_m.group( 2 ) );
69 System.out.println( name_m.group( 3 ) );
70 System.out.println( name_m.group( 4 ) );
73 System.out.println( "Does not match." );
/**
 * Heuristic check of whether the content of a stream looks like FASTA.
 * <p>
 * The stream is decoded as UTF-8 and read line by line. Each line is classified as
 * ignorable (see canIgnore), as a name line (matches NAME_REGEX), or as any other
 * non-empty line (matches SEQ_REGEX).
 * <p>
 * NOTE(review): the declaration of {@code line}, the bodies of the three branches and
 * the final return are not visible in this excerpt. Presumably the first name line
 * yields {@code true} and a sequence line encountered first yields {@code false} --
 * TODO confirm against the full file, including what happens at end of stream and
 * whether the reader is closed.
 *
 * @param is the stream to inspect
 * @return presumably whether the first significant line is a FASTA name line -- TODO confirm
 * @throws IOException if reading from the stream fails
 */
77 static public boolean isLikelyFasta( final InputStream is ) throws IOException {
78 final BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) );
80 while ( ( line = reader.readLine() ) != null ) {
81 final boolean is_name_line = NAME_REGEX.matcher( line ).lookingAt();
// Called with saw_first_seq = true, so only canIgnore's empty / digits-and-whitespace
// test can apply here; its "before the first name line" test is disabled.
82 if ( canIgnore( line, true, false ) ) {
85 else if ( is_name_line ) {
89 else if ( SEQ_REGEX.matcher( line ).lookingAt() ) {
98 static public Msa parseMsa( final File f ) throws IOException {
99 return parseMsa( new FileInputStream( f ) );
102 static public Msa parseMsa( final InputStream is ) throws IOException {
103 return BasicMsa.createInstance( parse( is ) );
106 static public Msa parseMsa( final String s ) throws IOException {
107 return parseMsa( s.getBytes() );
110 static public Msa parseMsa( final byte[] bytes ) throws IOException {
111 return parseMsa( new ByteArrayInputStream( bytes ) );
114 static public List<Sequence> parse( final File f ) throws IOException {
115 return parse( new FileInputStream( f ) );
/**
 * Parses FASTA-formatted text from a stream into a list of sequences.
 * <p>
 * The stream is decoded as UTF-8 and read line by line. A line matching NAME_REGEX
 * starts a new record: the record collected so far is passed to addSeq and fresh
 * name / sequence buffers are created. A non-name line matching SEQ_REGEX is appended
 * to the current sequence with all whitespace removed. After the loop the last record
 * is passed to addSeq and every collected (name, sequence) pair is converted with
 * BasicSequence.createAaSequence.
 * <p>
 * NOTE(review): several statements are not visible in this excerpt: the declaration
 * of {@code line}, any update of {@code line_counter}, the body of the canIgnore
 * branch, any closing of the reader, and the return statement. The comments below
 * describe only what is visible.
 *
 * @param is stream containing FASTA text
 * @return presumably the list built in {@code seqs} -- TODO confirm, the return is not visible here
 * @throws IOException if reading from the stream fails
 * @throws MsaFormatException if a sequence line belongs to a record whose name is empty
 *             (a second throw with the same message is visible further down; its
 *             enclosing branch is not -- TODO confirm when it fires)
 */
118 static public List<Sequence> parse( final InputStream is ) throws IOException {
119 final BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) );
// Used only in the error messages below.
121 int line_counter = 0;
122 boolean saw_first_seq = false;
123 StringBuilder current_seq = null;
124 StringBuilder name = null;
// Each element is a two-slot array; slot 0 is read back as the name and slot 1 as the sequence.
125 final List<StringBuilder[]> temp_msa = new ArrayList<StringBuilder[]>();
126 while ( ( line = reader.readLine() ) != null ) {
128 final Matcher name_m = NAME_REGEX.matcher( line );
129 final boolean is_name_line = name_m.lookingAt();
// Decide whether this line should be skipped (branch body not visible here).
130 if ( canIgnore( line, saw_first_seq, is_name_line ) ) {
133 final Matcher seq_m = SEQ_REGEX.matcher( line );
134 if ( is_name_line ) {
135 saw_first_seq = true;
// Hand over the previous record before starting a new one; on the first header both
// buffers are still null and addSeq's guard rejects them.
136 addSeq( name, current_seq, temp_msa );
137 name = new StringBuilder( name_m.group( 1 ).trim() );
138 current_seq = new StringBuilder();
140 else if ( seq_m.lookingAt() ) {
// A header that has only whitespace after '>' leaves the name empty after trim();
// sequence data for such a record is rejected.
141 if ( name.length() < 1 ) {
143 throw new MsaFormatException( "illegally formatted fasta msa (line: " + line_counter + "):\n\""
144 + trim( line ) + "\"" );
// Whitespace inside a sequence line is dropped before appending.
146 current_seq.append( seq_m.group( 1 ).replaceAll( "\\s+", "" ) );
150 throw new MsaFormatException( "illegally formatted fasta msa (line: " + line_counter + "):\n\""
151 + trim( line ) + "\"" );
// Hand over the last record, which no following header line would trigger.
154 addSeq( name, current_seq, temp_msa );
156 final List<Sequence> seqs = new ArrayList<Sequence>();
157 for( int i = 0; i < temp_msa.size(); ++i ) {
// NOTE(review): every record goes through createAaSequence -- presumably an
// amino-acid sequence factory; confirm how nucleotide input is meant to be handled.
158 seqs.add( BasicSequence.createAaSequence( temp_msa.get( i )[ 0 ].toString(),
159 temp_msa.get( i )[ 1 ].toString() ) );
/**
 * Decides whether the parser may skip a line.
 * <p>
 * Two conditions are tested in order: (1) the line is empty or consists entirely of
 * digits and whitespace; (2) no name line has been seen yet and this line is not a
 * name line either.
 * <p>
 * NOTE(review): the return statements are not visible in this excerpt; presumably
 * each condition returns {@code true} and the fall-through returns {@code false} --
 * TODO confirm.
 *
 * @param line the line under inspection
 * @param saw_first_seq whether a name line has already been encountered
 * @param is_name_line whether {@code line} itself is a name line
 * @return presumably {@code true} if the line can be skipped -- TODO confirm
 */
164 static private boolean canIgnore( final String line, final boolean saw_first_seq, final boolean is_name_line ) {
165 if ( ( line.length() < 1 ) || ANYTHING_REGEX.matcher( line ).matches() ) {
168 if ( !saw_first_seq && !is_name_line ) {
/**
 * Handles a finished (name, sequence) record.
 * <p>
 * The record is considered only if both buffers are non-null and non-empty; in that
 * case a two-element StringBuilder array is allocated for it.
 * <p>
 * NOTE(review): the statements that fill the array and use {@code temp_msa} are not
 * visible in this excerpt. parse reads element 0 back as the name and element 1 as the
 * sequence, so presumably {@code ary[0] = name}, {@code ary[1] = seq} and the array is
 * added to {@code temp_msa} -- TODO confirm.
 *
 * @param name name of the record, may be null
 * @param seq residues of the record, may be null
 * @param temp_msa collection of the records gathered so far
 */
174 private static void addSeq( final StringBuilder name, final StringBuilder seq, final List<StringBuilder[]> temp_msa ) {
// Records without a name or without any residues are skipped.
175 if ( ( name != null ) && ( seq != null ) && ( name.length() > 0 ) && ( seq.length() > 0 ) ) {
176 final StringBuilder[] ary = new StringBuilder[ 2 ];
183 private static String trim( final String line ) {
184 if ( line.length() > 100 ) {
185 return line.substring( 0, 100 ) + " ...";
/**
 * Copies information encoded in FASTA-style node names into the node data of a phylogeny.
 * <p>
 * For every node returned by {@code phy.iteratorExternalForward()} whose name is
 * non-empty and matches FASTA_DESC_LINE, the capture groups are applied as follows
 * (each only if non-empty):
 * groups 1 and 2 become the accession (source and value) of sequence 0,
 * group 3 becomes the name of sequence 0,
 * group 4 becomes the scientific name of taxonomy 0.
 * <p>
 * NOTE(review): the end of this method lies outside the visible excerpt.
 *
 * @param phy the phylogeny whose nodes are updated in place
 */
190 public static void extractFastaInformation( final Phylogeny phy ) {
191 for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
192 final PhylogenyNode node = iter.next();
193 if ( !ForesterUtil.isEmpty( node.getName() ) ) {
194 final Matcher name_m = FASTA_DESC_LINE.matcher( node.getName() );
195 if ( name_m.lookingAt() ) {
// NOTE(review): prints an empty line to stdout for every matching node; together with
// the commented-out prints below this looks like leftover debugging output -- consider removing.
196 System.out.println();
197 // System.out.println( name_m.group( 1 ) );
198 // System.out.println( name_m.group( 2 ) );
199 // System.out.println( name_m.group( 3 ) );
200 // System.out.println( name_m.group( 4 ) );
201 final String acc_source = name_m.group( 1 );
202 final String acc = name_m.group( 2 );
203 final String seq_name = name_m.group( 3 );
204 final String tax_sn = name_m.group( 4 );
// An accession is set only when both its source and its value are present.
205 if ( !ForesterUtil.isEmpty( acc_source ) && !ForesterUtil.isEmpty( acc ) ) {
// Presumably creates sequence 0 if the node has none yet -- verify in Util.
206 Util.ensurePresenceOfSequence( node );
207 node.getNodeData().getSequence( 0 ).setAccession( new Accession( acc, acc_source ) );
209 if ( !ForesterUtil.isEmpty( seq_name ) ) {
210 Util.ensurePresenceOfSequence( node );
211 node.getNodeData().getSequence( 0 ).setName( seq_name );
213 if ( !ForesterUtil.isEmpty( tax_sn ) ) {
// Presumably creates taxonomy 0 if the node has none yet -- verify in Util.
214 Util.ensurePresenceOfTaxonomy( node );
215 node.getNodeData().getTaxonomy( 0 ).setScientificName( tax_sn );