fixed issue with UTF8 encoding.
[jalview.git] / forester_applications / src / org / forester / applications / tax_code_cleaner.java
1 // javac -cp ~/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester/java/forester.jar
2 // ~/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester_applications/src/org/forester/applications/tax_code_cleaner.java
3 // java -Xmx2048m -cp
4 // /home/czmasek/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester_applications/src/:/home/czmasek/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester/java/forester.jar
5 // org.forester.applications.tax_code_cleaner
6
7 package org.forester.applications;
8
9 import java.io.File;
10 import java.util.regex.Pattern;
11
12 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
13 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
14 import org.forester.io.writers.PhylogenyWriter;
15 import org.forester.phylogeny.Phylogeny;
16 import org.forester.phylogeny.PhylogenyNode;
17 import org.forester.phylogeny.data.Taxonomy;
18 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
19 import org.forester.phylogeny.factories.PhylogenyFactory;
20 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
21 import org.forester.util.CommandLineArguments;
22 import org.forester.util.ForesterUtil;
23
24 public class tax_code_cleaner {
25
26     private final static String BASE = "b_";
27
28     public static void main( final String args[] ) {
29         File in = null;
30         File out = null;
31         try {
32             CommandLineArguments cla = null;
33             cla = new CommandLineArguments( args );
34             in = cla.getFile( 0 );
35             out = cla.getFile( 1 );
36             // if ( out.exists() ) {
37             //      System.out.println( out + " already exists" );
38             //      System.exit( -1 );
39             //  }
40             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
41             final PhyloXmlParser xml_parser = PhyloXmlParser.createPhyloXmlParserXsdValidating();
42             final Phylogeny[] phylogenies_0 = factory.create( in, xml_parser );
43             final Phylogeny phylogeny_0 = phylogenies_0[ 0 ];
44             final PhylogenyNodeIterator it = phylogeny_0.iteratorPostorder();
45             int i = 0;
46             while ( it.hasNext() ) {
47                 final PhylogenyNode node = it.next();
48                 processNode( node, i );
49                 i++;
50             }
51             final PhylogenyWriter writer = new PhylogenyWriter();
52             writer.toPhyloXML( out, phylogeny_0, 0 );
53         }
54         catch ( final Exception e ) {
55             System.out.println( e.getLocalizedMessage() );
56             e.printStackTrace();
57             System.exit( -1 );
58         }
59     }
60
61     private static void processNode( final PhylogenyNode node, final int i ) throws PhyloXmlDataFormatException {
62         if ( node.isExternal() ) {
63             if ( node.getNodeData().isHasTaxonomy() ) {
64                 final Taxonomy t = node.getNodeData().getTaxonomy();
65                 if ( !ForesterUtil.isEmpty( t.getTaxonomyCode() ) ) {
66                     final String tc = t.getTaxonomyCode();
67                     if ( tc.equals( "ACRALC" ) ) {
68                         t.setScientificName( "Acremonium alcalophilum" );
69                         t.setTaxonomyCode( "AALXX" );
70                     }
71                     else if ( tc.equals( "AMPQU" ) ) {
72                         t.setScientificName( "Amphimedon queenslandica" );
73                         t.setTaxonomyCode( "AMPQE" );
74                     }
75                     else if ( tc.equals( "AQUAE" ) ) {
76                         t.setScientificName( "Aquifex aeolicus (strain VF5)" );
77                     }
78                     else if ( tc.equals( "ASTSPC" ) ) {
79                         t.setScientificName( "Asterochloris sp. Cgr/DA1pho" );
80                         t.setTaxonomyCode( "ASCXX" );
81                     }
82                     else if ( tc.equals( "BAUCOM" ) ) {
83                         t.setScientificName( "Baudoinia compniacensis" );
84                         t.setTaxonomyCode( "BCOXX" );
85                     }
86                     else if ( tc.equals( "CAP" ) ) {
87                         t.setScientificName( "Capitella sp.1" );
88                         t.setTaxonomyCode( "CTEXX" );
89                     }
90                     else if ( tc.equals( "CAPOWC" ) ) {
91                         t.setScientificName( "Capsaspora owczarzaki (strain ATCC 30864)" );
92                         t.setTaxonomyCode( "CAPO3" );
93                     }
94                     else if ( tc.equals( "CHLVUL" ) ) {
95                         t.setScientificName( "Chlorella variabilis" );
96                         t.setTaxonomyCode( "CHLVA" );
97                     }
98                     else if ( tc.equals( "CITCLE" ) ) {
99                         t.setScientificName( "Citrus clementina" );
100                         t.setTaxonomyCode( "CCLXX" );
101                     }
102                     else if ( tc.equals( "CLAGRA" ) ) {
103                         t.setScientificName( "Cladonia grayi" );
104                         t.setTaxonomyCode( "" );
105                     }
106                     else if ( tc.equals( "COEREV" ) ) {
107                         t.setScientificName( "Coemansia reversa" );
108                         t.setTaxonomyCode( "CREXX" );
109                     }
110                     else if ( tc.equals( "CONPUT" ) ) {
111                         t.setScientificName( "Coniophora puteana" );
112                         t.setTaxonomyCode( "CPUXX" );
113                     }
114                     else if ( tc.equals( "DICSQU" ) ) {
115                         t.setScientificName( "Dichomitus squalens" );
116                         t.setTaxonomyCode( "DICSQ" );
117                     }
118                     else if ( tc.equals( "FOMPIN" ) ) {
119                         t.setScientificName( "Fomitopsis pinicola" );
120                         t.setTaxonomyCode( "FPIXX" );
121                     }
122                     else if ( tc.equals( "GONPRO" ) ) {
123                         t.setScientificName( "Gonapodya prolifera" );
124                         t.setTaxonomyCode( "GONPR" );
125                     }
126                     else if ( tc.equals( "GYMLUX" ) ) {
127                         t.setScientificName( "Gymnopus luxurians" );
128                         t.setTaxonomyCode( "" );
129                     }
130                     else if ( tc.equals( "HYDPIN" ) ) {
131                         t.setScientificName( "Hydnomerulius pinastri" );
132                         t.setTaxonomyCode( "" );
133                     }
134                     else if ( tc.equals( "JAAARG" ) ) {
135                         t.setScientificName( "Jaapia argillacea" );
136                         t.setTaxonomyCode( "" );
137                     }
138                     else if ( tc.equals( "MYCPOP" ) ) {
139                         t.setScientificName( "Mycosphaerella populorum" );
140                         t.setTaxonomyCode( "MYCPS" );
141                     }
142                     else if ( tc.equals( "MYCTHE" ) ) {
143                         t.setScientificName( "Myceliophthora thermophila" );
144                         t.setTaxonomyCode( "THIHA" );
145                     }
146                     else if ( tc.equals( "OIDMAI" ) ) {
147                         t.setScientificName( "Oidiodendron maius" );
148                         t.setTaxonomyCode( "" );
149                     }
150                     else if ( tc.equals( "PANVIR" ) ) {
151                         t.setScientificName( "Panicum virgatum" );
152                         t.setTaxonomyCode( "PANVG" );
153                     }
154                     else if ( tc.equals( "PIRSPE" ) ) {
155                         t.setScientificName( "Piromyces sp. E2" );
156                         t.setTaxonomyCode( "PIRSE" );
157                     }
158                     else if ( tc.equals( "SAICOM" ) ) {
159                         t.setScientificName( "Saitoella complicata" );
160                         t.setTaxonomyCode( "" );
161                     }
162                     else if ( tc.equals( "SERLAC" ) ) {
163                         t.setScientificName( "Serpula lacrymans" );
164                         t.setTaxonomyCode( "SERL9" );
165                     }
166                     else if ( tc.equals( "SPHARC" ) ) {
167                         t.setScientificName( "Sphaeroforma arctica" );
168                         t.setTaxonomyCode( "SARXX" );
169                     }
170                     else if ( tc.equals( "THETRA" ) ) {
171                         t.setScientificName( "Thecamonas trahens" );
172                         t.setTaxonomyCode( "TTRXX" );
173                     }
174                     else if ( tc.equals( "THITER" ) ) {
175                         t.setScientificName( "Thielavia terrestris (strain ATCC 38088 / NRRL 8126)" );
176                         t.setTaxonomyCode( "THITE" );
177                     }
178                     else if ( tc.equals( "WOLCOC" ) ) {
179                         t.setScientificName( "Wolfiporia cocos MD-104 SS10" );
180                         t.setTaxonomyCode( "WOLCO" );
181                     }
182                     else if ( tc.equals( "XANPAR" ) ) {
183                         t.setScientificName( "Xanthoria parietina 46-1" );
184                         t.setTaxonomyCode( "" );
185                     }
186                     else if ( tc.length() == 6 ) {
187                         final Pattern p = Pattern.compile( "[A-Z9][A-Z]{2}[A-Z0-9]{2}\\d" );
188                         if ( p.matcher( tc ).matches() ) {
189                             t.setTaxonomyCode( tc.substring( 0, 5 ) );
190                         }
191                     }
192                 }
193             }
194         }
195     }
196 }