initial commit
[jalview.git] / forester / java / src / org / forester / go / OBOparser.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 // 
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 // 
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
25
26 package org.forester.go;
27
28 import java.io.BufferedReader;
29 import java.io.File;
30 import java.io.FileReader;
31 import java.io.IOException;
32 import java.util.ArrayList;
33 import java.util.HashSet;
34 import java.util.List;
35 import java.util.Set;
36
37 import org.forester.util.ForesterUtil;
38
39 public class OBOparser {
40
41     private final File       _input_file;    ;
42     private final ReturnType _return_type;
43     private int              _go_term_count;
44
45     public OBOparser( final File input_file, final ReturnType return_type ) {
46         switch ( return_type ) {
47             case BASIC_GO_TERM:
48                 break;
49             default:
50                 throw new IllegalArgumentException( "unknown return type: " + return_type );
51         }
52         _input_file = input_file;
53         _return_type = return_type;
54         init();
55     }
56
57     private GoTerm createNewBasicGoTerm( final String id,
58                                          final String name,
59                                          final String namespace,
60                                          final String is_obsolete,
61                                          final String comment,
62                                          final String definition,
63                                          final Set<String> alt_ids,
64                                          final List<GoXRef> go_xrefs,
65                                          final List<GoId> super_go_ids,
66                                          final List<GoRelationship> go_relationships,
67                                          final List<GoSubset> go_subsets ) {
68         final GoTerm gt = new BasicGoTerm( id, name, namespace, is_obsolete.trim().toLowerCase().equals( "true" ) );
69         ( ( BasicGoTerm ) gt ).setComment( comment );
70         ( ( BasicGoTerm ) gt ).setDefinition( definition );
71         for( final GoXRef x : go_xrefs ) {
72             gt.getGoXRefs().add( x );
73         }
74         for( final GoId s : super_go_ids ) {
75             gt.getSuperGoIds().add( s );
76         }
77         for( final GoRelationship r : go_relationships ) {
78             gt.getGoRelationships().add( r );
79         }
80         for( final GoSubset sub : go_subsets ) {
81             gt.getGoSubsets().add( sub );
82         }
83         for( final String alt_id : alt_ids ) {
84             gt.getAltIds().add( new GoId( alt_id ) );
85         }
86         ++_go_term_count;
87         return gt;
88     }
89
90     private void createNewGoTerm( final List<GoTerm> go_terms,
91                                   final String id,
92                                   final String name,
93                                   final String namespace,
94                                   final String is_obsolete,
95                                   final String comment,
96                                   final String definition,
97                                   final Set<String> alt_ids,
98                                   final List<GoXRef> go_xrefs,
99                                   final List<GoId> super_go_ids,
100                                   final List<GoRelationship> go_relationships,
101                                   final List<GoSubset> go_subsets ) {
102         GoTerm gt;
103         switch ( getReturnType() ) {
104             case BASIC_GO_TERM:
105                 gt = createNewBasicGoTerm( id,
106                                            name,
107                                            namespace,
108                                            is_obsolete,
109                                            comment,
110                                            definition,
111                                            alt_ids,
112                                            go_xrefs,
113                                            super_go_ids,
114                                            go_relationships,
115                                            go_subsets );
116                 break;
117             default:
118                 throw new AssertionError( "unknown return type: " + getReturnType() );
119         }
120         go_terms.add( gt );
121     }
122
123     public int getGoTermCount() {
124         return _go_term_count;
125     }
126
127     private File getInputFile() {
128         return _input_file;
129     }
130
131     private ReturnType getReturnType() {
132         return _return_type;
133     }
134
135     private void init() {
136         setGoTermCount( 0 );
137     }
138
139     public List<GoTerm> parse() throws IOException {
140         final String error = ForesterUtil.isReadableFile( getInputFile() );
141         if ( !ForesterUtil.isEmpty( error ) ) {
142             throw new IOException( error );
143         }
144         final BufferedReader br = new BufferedReader( new FileReader( getInputFile() ) );
145         String line;
146         final List<GoTerm> go_terms = new ArrayList<GoTerm>();
147         int line_number = 0;
148         boolean in_term = false;
149         String id = "";
150         String name = "";
151         String namespace = "";
152         String def = "";
153         String comment = "";
154         String is_obsolete = "";
155         HashSet<String> alt_ids = new HashSet<String>();
156         List<GoId> super_go_ids = new ArrayList<GoId>();
157         List<GoXRef> go_xrefs = new ArrayList<GoXRef>();
158         List<GoRelationship> go_relationships = new ArrayList<GoRelationship>();
159         List<GoSubset> go_subsets = new ArrayList<GoSubset>();
160         try {
161             while ( ( line = br.readLine() ) != null ) {
162                 line_number++;
163                 line = line.trim();
164                 if ( line.length() < 1 ) {
165                     if ( in_term ) {
166                         in_term = false;
167                     }
168                 }
169                 else if ( line.startsWith( "[Term]" ) ) {
170                     in_term = true;
171                     if ( id.length() > 0 ) {
172                         createNewGoTerm( go_terms,
173                                          id,
174                                          name,
175                                          namespace,
176                                          is_obsolete,
177                                          comment,
178                                          def,
179                                          alt_ids,
180                                          go_xrefs,
181                                          super_go_ids,
182                                          go_relationships,
183                                          go_subsets );
184                     }
185                     id = "";
186                     name = "";
187                     namespace = "";
188                     alt_ids = new HashSet<String>();
189                     def = "";
190                     comment = "";
191                     is_obsolete = "";
192                     super_go_ids = new ArrayList<GoId>();
193                     go_xrefs = new ArrayList<GoXRef>();
194                     go_relationships = new ArrayList<GoRelationship>();
195                     go_subsets = new ArrayList<GoSubset>();
196                 }
197                 else if ( in_term && line.startsWith( "id:" ) ) {
198                     id = line.substring( 3 ).trim();
199                 }
200                 else if ( in_term && line.startsWith( "name:" ) ) {
201                     name = line.substring( 5 ).trim();
202                 }
203                 else if ( in_term && line.startsWith( "namespace:" ) ) {
204                     namespace = line.substring( 10 ).trim();
205                 }
206                 else if ( in_term && line.startsWith( "alt_id:" ) ) {
207                     alt_ids.add( line.substring( 7 ).trim() );
208                 }
209                 else if ( in_term && line.startsWith( "def:" ) ) {
210                     def = line.substring( 4 ).trim();
211                 }
212                 else if ( in_term && line.startsWith( "is_obsolete:" ) ) {
213                     is_obsolete = line.substring( 12 ).trim();
214                 }
215                 else if ( in_term && line.startsWith( "comment:" ) ) {
216                     comment = line.substring( 8 ).trim();
217                 }
218                 else if ( in_term && line.startsWith( "xref:" ) ) {
219                     final String s = trimOffComment( line.substring( 5 ).trim() );
220                     go_xrefs.add( new BasicGoXRef( s ) );
221                 }
222                 else if ( in_term && line.startsWith( "is_a:" ) ) {
223                     final String s = trimOffComment( line.substring( 5 ).trim() );
224                     super_go_ids.add( new GoId( s ) );
225                 }
226                 else if ( in_term && line.startsWith( "relationship:" ) ) {
227                     final String s = trimOffComment( line.substring( 13 ).trim() );
228                     go_relationships.add( new BasicGoRelationship( s ) );
229                 }
230                 else if ( in_term && line.startsWith( "subset:" ) ) {
231                     final String s = line.substring( 8 ).trim();
232                     go_subsets.add( new BasicGoSubset( s ) );
233                 }
234             } // while ( ( line = br.readLine() ) != null )
235         }
236         catch ( final Exception e ) {
237             throw new IOException( "parsing problem: " + e.getMessage() + " [at line " + line_number + "]" );
238         }
239         if ( id.length() > 0 ) {
240             createNewGoTerm( go_terms,
241                              id,
242                              name,
243                              namespace,
244                              is_obsolete,
245                              comment,
246                              def,
247                              alt_ids,
248                              go_xrefs,
249                              super_go_ids,
250                              go_relationships,
251                              go_subsets );
252         }
253         return go_terms;
254     }
255
256     private void setGoTermCount( final int go_term_count ) {
257         _go_term_count = go_term_count;
258     }
259
260     private String trimOffComment( String xref ) {
261         final int i = xref.indexOf( '!' );
262         if ( i > 0 ) {
263             xref = xref.substring( 0, xref.indexOf( '!' ) ).trim();
264         }
265         return xref;
266     }
267
268     public static enum ReturnType {
269         BASIC_GO_TERM
270     }
271 }