package org.forester.application;
import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
+import org.forester.io.parsers.FastaParser;
import org.forester.io.parsers.PhylogenyParser;
import org.forester.io.parsers.util.ParserUtils;
import org.forester.io.writers.PhylogenyWriter;
import org.forester.phylogeny.data.Identifier;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
import org.forester.phylogeny.factories.PhylogenyFactory;
+import org.forester.sequence.Sequence;
import org.forester.tools.PhylogenyDecorator;
import org.forester.tools.PhylogenyDecorator.FIELD;
import org.forester.util.BasicTable;
public final class decorator {
private static final String SEQUENCE_NAME_FIELD = "s";
+ private static final String MOL_SEQ = "m";
private static final String TAXONOMY_CODE_FIELD = "c";
private static final String TAXONOMY_SCIENTIFIC_NAME_FIELD = "sn";
private static final String DS_FILED = "d";
final static private String PICKY_OPTION = "p";
final static private String FIELD_OPTION = "f";
final static private String TRIM_AFTER_TILDE_OPTION = "t";
+ final static private String VERBOSE_OPTION = "ve";
final static private String TREE_NAME_OPTION = "pn";
final static private String TREE_ID_OPTION = "pi";
final static private String TREE_DESC_OPTION = "pd";
final static private String MAPPING_FILE_SEPARATOR_OPTION = "s";
final static private char MAPPING_FILE_SEPARATOR_DEFAULT = '\t';
final static private String PRG_NAME = "decorator";
- final static private String PRG_VERSION = "1.14";
- final static private String PRG_DATE = "130426";
+ final static private String PRG_VERSION = "1.16";
+ final static private String PRG_DATE = "131113";
public static void main( final String args[] ) {
ForesterUtil.printProgramInformation( decorator.PRG_NAME, decorator.PRG_VERSION, decorator.PRG_DATE );
allowed_options.add( decorator.TRIM_AFTER_TILDE_OPTION );
allowed_options.add( decorator.ORDER_TREE_OPTION );
allowed_options.add( decorator.MIDPOINT_ROOT_OPTION );
+ allowed_options.add( decorator.VERBOSE_OPTION );
final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
if ( dissallowed_options.length() > 0 ) {
ForesterUtil.fatalError( decorator.PRG_NAME, "unknown option(s): " + dissallowed_options );
boolean trim_after_tilde = false;
boolean order_tree = false;
boolean midpoint_root = false;
+ boolean verbose = false;
String tree_name = "";
String tree_id = "";
String tree_desc = "";
if ( cla.isOptionSet( decorator.ORDER_TREE_OPTION ) ) {
order_tree = true;
}
+ if ( cla.isOptionSet( decorator.VERBOSE_OPTION ) ) {
+ verbose = true;
+ }
if ( cla.isOptionSet( decorator.FIELD_OPTION ) ) {
field_str = cla.getOptionValue( decorator.FIELD_OPTION );
if ( field_str.equals( NODE_NAME_FIELD ) ) {
else if ( field_str.equals( SEQUENCE_NAME_FIELD ) ) {
field = FIELD.SEQUENCE_NAME;
}
+ else if ( field_str.equals( MOL_SEQ ) ) {
+ field = FIELD.MOL_SEQ;
+ }
else if ( field_str.equals( TAXONOMY_SCIENTIFIC_NAME_FIELD ) ) {
field = FIELD.TAXONOMY_SCIENTIFIC_NAME;
extract_bracketed_scientific_name = false;
}
Map<String, String> map = null;
if ( !advanced_table ) {
- BasicTable<String> mapping_table = null;
- try {
- mapping_table = BasicTableParser.parse( mapping_infile, separator, true, false );
- }
- catch ( final Exception e ) {
- ForesterUtil.fatalError( decorator.PRG_NAME,
- "failed to read [" + mapping_infile + "] [" + e.getMessage() + "]" );
- }
- if ( ( key_column < 0 ) || ( key_column >= mapping_table.getNumberOfColumns() ) ) {
- ForesterUtil.fatalError( decorator.PRG_NAME, "illegal value for key column" );
- }
- if ( ( value_column < 0 ) || ( value_column >= mapping_table.getNumberOfColumns() ) ) {
- ForesterUtil.fatalError( decorator.PRG_NAME, "illegal value for value column" );
- }
- if ( mapping_table.isEmpty() || ( mapping_table.getNumberOfColumns() < 1 ) ) {
- ForesterUtil.fatalError( decorator.PRG_NAME, "mapping table is empty" );
- }
- if ( mapping_table.getNumberOfColumns() == 1 ) {
- ForesterUtil.fatalError( decorator.PRG_NAME, "mapping table has only one column" );
+ if ( field != FIELD.MOL_SEQ ) {
+ BasicTable<String> mapping_table = null;
+ try {
+ mapping_table = BasicTableParser.parse( mapping_infile, separator, true, false );
+ }
+ catch ( final Exception e ) {
+ ForesterUtil.fatalError( decorator.PRG_NAME,
+ "failed to read [" + mapping_infile + "] [" + e.getMessage() + "]" );
+ }
+ if ( ( key_column < 0 ) || ( key_column >= mapping_table.getNumberOfColumns() ) ) {
+ ForesterUtil.fatalError( decorator.PRG_NAME, "illegal value for key column" );
+ }
+ if ( ( value_column < 0 ) || ( value_column >= mapping_table.getNumberOfColumns() ) ) {
+ ForesterUtil.fatalError( decorator.PRG_NAME, "illegal value for value column" );
+ }
+ if ( mapping_table.isEmpty() || ( mapping_table.getNumberOfColumns() < 1 ) ) {
+ ForesterUtil.fatalError( decorator.PRG_NAME, "mapping table is empty" );
+ }
+ if ( mapping_table.getNumberOfColumns() == 1 ) {
+ ForesterUtil.fatalError( decorator.PRG_NAME, "mapping table has only one column" );
+ }
+ map = mapping_table.getColumnsAsMap( key_column, value_column );
+ if ( verbose ) {
+ final Iterator<Entry<String, String>> iter = map.entrySet().iterator();
+ System.out.println();
+ while ( iter.hasNext() ) {
+ final Entry<String, String> e = iter.next();
+ System.out.println( e.getKey() + " => " + e.getValue() );
+ }
+ System.out.println();
+ }
}
- map = mapping_table.getColumnsAsMap( key_column, value_column );
- final Iterator<Entry<String, String>> iter = map.entrySet().iterator();
- System.out.println();
- while ( iter.hasNext() ) {
- final Entry<String, String> e = iter.next();
- System.out.println( e.getKey() + " => " + e.getValue() );
+ else {
+ map = readFastaFileIntoMap( mapping_infile, verbose );
}
- System.out.println();
}
if ( !ForesterUtil.isEmpty( tree_name ) || !ForesterUtil.isEmpty( tree_id )
|| !ForesterUtil.isEmpty( tree_desc ) ) {
process_name_intelligently,
process_similar_to,
numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- trim_after_tilde );
+ trim_after_tilde,
+ verbose );
}
}
catch ( final NullPointerException e ) {
ForesterUtil.programMessage( PRG_NAME, "OK." );
}
+    /**
+     * Reads a FASTA file and returns its sequences as a map from sequence
+     * identifier to molecular sequence string.
+     * <p>
+     * Problems are reported through ForesterUtil.fatalError (defined outside
+     * this file; presumably it terminates the program): the file cannot be
+     * read, it yields no sequences, or a sequence has an empty identifier, a
+     * duplicated identifier, or a length below one.
+     *
+     * @param mapping_infile the FASTA file to read
+     * @param verbose if true, each "identifier => sequence" pair is printed to standard out
+     * @return map of sequence identifier to molecular sequence string
+     */
+    private static Map<String, String> readFastaFileIntoMap( final File mapping_infile, final boolean verbose ) {
+        List<Sequence> seqs = null;
+        FileInputStream is = null;
+        try {
+            is = new FileInputStream( mapping_infile );
+            seqs = FastaParser.parse( is );
+        }
+        catch ( final IOException e ) {
+            ForesterUtil.fatalError( decorator.PRG_NAME, "failed to read fasta-file from [" + mapping_infile + "] ["
+                    + e.getMessage() + "]" );
+        }
+        finally {
+            // Release the file handle here rather than relying on the parser to
+            // do it; closing an already closed FileInputStream is harmless.
+            if ( is != null ) {
+                try {
+                    is.close();
+                }
+                catch ( final IOException ignored ) {
+                    // The stream was only read from; a failed close loses no data.
+                }
+            }
+        }
+        if ( ForesterUtil.isEmpty( seqs ) ) {
+            ForesterUtil.fatalError( decorator.PRG_NAME, "fasta-file [" + mapping_infile
+                    + "] is devoid of fasta-formatted sequences" );
+        }
+        final Map<String, String> map = new HashMap<String, String>();
+        for( final Sequence seq : seqs ) {
+            if ( ForesterUtil.isEmpty( seq.getIdentifier() ) ) {
+                ForesterUtil.fatalError( decorator.PRG_NAME, "fasta-file [" + mapping_infile
+                        + "] contains sequence with empty identifier" );
+            }
+            // Identifiers become map keys, so a repeated one would silently
+            // overwrite an earlier sequence; reject it instead.
+            if ( map.containsKey( seq.getIdentifier() ) ) {
+                ForesterUtil.fatalError( decorator.PRG_NAME, "sequence identifier [" + seq.getIdentifier()
+                        + "] is not unique" );
+            }
+            if ( seq.getLength() < 1 ) {
+                ForesterUtil.fatalError( decorator.PRG_NAME, "sequence [" + seq.getIdentifier() + "] is empty" );
+            }
+            map.put( seq.getIdentifier(), seq.getMolecularSequenceAsString() );
+            if ( verbose ) {
+                System.out.println( seq.getIdentifier() + " => " + seq.getMolecularSequenceAsString() );
+            }
+        }
+        return map;
+    }
+
private static void argumentsError() {
System.out.println();
System.out.println( decorator.PRG_NAME + " -" + ADVANCED_TABLE_OPTION + " | -f=<c> <phylogenies infile> "
- + "[mapping table file] <phylogenies outfile>" );
+ + "[mapping table file|fasta-file] <phylogenies outfile>" );
System.out.println();
System.out.println( "options:" );
System.out.println();
System.out.println( " " + TAXONOMY_SCIENTIFIC_NAME_FIELD
+ ": taxonomy scientific name" );
System.out.println( " " + SEQUENCE_NAME_FIELD + " : sequence name" );
+ System.out.println( " " + MOL_SEQ + " : molecular sequence" );
System.out.println( " -k=<n> : key column in mapping table (0 based)," );
System.out.println( " names of the node to be decorated - default is 0" );
System.out.println( " -v=<n> : value column in mapping table (0 based)," );
System.out.println( " -c : cut name after first space (only for -f=n)" );
System.out.println( " -" + decorator.TRIM_AFTER_TILDE_OPTION
+ " : trim node name to be replaced after tilde" );
- System.out.println( " -" + decorator.MIDPOINT_ROOT_OPTION + " : to midpoint-root the tree" );
- System.out.println( " -" + decorator.ORDER_TREE_OPTION + " : to order tree branches" );
+ System.out.println( " -" + decorator.MIDPOINT_ROOT_OPTION + " : to midpoint-root the tree" );
+ System.out.println( " -" + decorator.ORDER_TREE_OPTION + " : to order tree branches" );
+ System.out.println( " -" + decorator.VERBOSE_OPTION + " : verbose" );
System.out.println();
System.exit( -1 );
}
public final class PhylogenyDecorator {
- // From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb:
- final private static String TP_TAXONOMY_CODE = "TAXONOMY_CODE";
- final private static String TP_TAXONOMY_ID = "TAXONOMY_ID";
- final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER";
- final private static String TP_TAXONOMY_SN = "TAXONOMY_SN";
- final private static String TP_TAXONOMY_CN = "TAXONOMY_CN";
- final private static String TP_TAXONOMY_SYN = "TAXONOMY_SYN";
- final private static String TP_SEQ_SYMBOL = "SEQ_SYMBOL";
+ public final static boolean SANITIZE = false;
+ final private static String TP_NODE_NAME = "NODE_NAME";
final private static String TP_SEQ_ACCESSION = "SEQ_ACCESSION";
final private static String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE";
final private static String TP_SEQ_ANNOTATION_DESC = "SEQ_ANNOTATION_DESC";
final private static String TP_SEQ_ANNOTATION_REF = "SEQ_ANNOTATION_REF";
final private static String TP_SEQ_MOL_SEQ = "SEQ_MOL_SEQ";
final private static String TP_SEQ_NAME = "SEQ_NAME";
- final private static String TP_NODE_NAME = "NODE_NAME";
- public final static boolean SANITIZE = false;
- public final static boolean VERBOSE = true;
+ final private static String TP_SEQ_SYMBOL = "SEQ_SYMBOL";
+ final private static String TP_TAXONOMY_CN = "TAXONOMY_CN";
+ // From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb:
+ final private static String TP_TAXONOMY_CODE = "TAXONOMY_CODE";
+ final private static String TP_TAXONOMY_ID = "TAXONOMY_ID";
+ final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER";
+ final private static String TP_TAXONOMY_SN = "TAXONOMY_SN";
+ final private static String TP_TAXONOMY_SYN = "TAXONOMY_SYN";
private PhylogenyDecorator() {
// Not needed.
final boolean process_name_intelligently,
final boolean process_similar_to,
final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
+ final boolean trim_after_tilde,
+ final boolean verbose ) throws IllegalArgumentException, NHXFormatException,
PhyloXmlDataFormatException {
PhylogenyDecorator.decorate( phylogeny,
map,
process_name_intelligently,
process_similar_to,
numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- trim_after_tilde );
+ trim_after_tilde,
+ verbose );
}
/**
final boolean process_name_intelligently,
final boolean process_similar_to,
final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- final boolean trim_after_tilde ) throws IllegalArgumentException,
- PhyloXmlDataFormatException {
+ final boolean trim_after_tilde,
+ final boolean verbose ) throws IllegalArgumentException, PhyloXmlDataFormatException {
if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) {
throw new IllegalArgumentException( "attempt to extract bracketed scientific name together with data field pointing to scientific name" );
}
}
if ( !ForesterUtil.isEmpty( name ) ) {
if ( intermediate_map != null ) {
- name = PhylogenyDecorator.extractIntermediate( intermediate_map, name );
+ name = PhylogenyDecorator.extractIntermediate( intermediate_map, name, verbose );
}
if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
String new_value = map.get( name );
new_value = extractBracketedScientificNames( node, new_value );
}
else if ( extract_bracketed_tax_code ) {
- if ( ParserUtils.TAXOMONY_CODE_PATTERN_4.matcher( new_value ).find() ) {
+ if ( ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED.matcher( new_value ).find() ) {
new_value = extractBracketedTaxCodes( node, new_value );
}
- else if ( ParserUtils.TAXOMONY_CODE_PATTERN_6.matcher( new_value ).find() ) {
- new_value = extractBracketedTaxCodes6( node, new_value );
- }
else if ( picky ) {
throw new IllegalArgumentException( " could not get taxonomy from \"" + new_value
+ "\"" );
}
}
switch ( field ) {
+ case MOL_SEQ:
+ if ( verbose ) {
+ System.out.println( name + ": " + new_value );
+ }
+ if ( !node.getNodeData().isHasSequence() ) {
+ node.getNodeData().setSequence( new Sequence() );
+ }
+ node.getNodeData().getSequence().setMolecularSequence( new_value );
+ break;
case SEQUENCE_ANNOTATION_DESC:
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.println( name + ": " + new_value );
}
if ( !node.getNodeData().isHasSequence() ) {
node.getNodeData().getSequence().addAnnotation( annotation );
break;
case DOMAIN_STRUCTURE:
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.println( name + ": " + new_value );
}
if ( !node.getNodeData().isHasSequence() ) {
.setDomainArchitecture( new DomainArchitecture( new_value ) );
break;
case TAXONOMY_CODE:
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.println( name + ": " + new_value );
}
ForesterUtil.ensurePresenceOfTaxonomy( node );
node.getNodeData().getTaxonomy().setTaxonomyCode( new_value );
break;
case TAXONOMY_SCIENTIFIC_NAME:
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.println( name + ": " + new_value );
}
ForesterUtil.ensurePresenceOfTaxonomy( node );
if ( trim_after_tilde ) {
new_value = addTildeAnnotation( tilde_annotation, new_value );
}
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.println( name + ": " + new_value );
}
if ( !node.getNodeData().isHasSequence() ) {
node.getNodeData().getSequence().setName( new_value );
break;
case NODE_NAME:
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.print( name + " -> " );
}
if ( cut_name_after_space ) {
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.print( new_value + " -> " );
}
new_value = PhylogenyDecorator.deleteAtFirstSpace( new_value );
}
else if ( process_name_intelligently ) {
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.print( new_value + " -> " );
}
new_value = PhylogenyDecorator.processNameIntelligently( new_value );
}
else if ( process_similar_to ) {
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.print( new_value + " -> " );
}
new_value = PhylogenyDecorator.processSimilarTo( new_value );
if ( trim_after_tilde ) {
new_value = addTildeAnnotation( tilde_annotation, new_value );
}
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.println( new_value );
}
node.setName( new_value );
}
}
- private final static String addTildeAnnotation( final String tilde_annotation, final String new_value ) {
- if ( ForesterUtil.isEmpty( tilde_annotation ) ) {
- return new_value;
- }
- return new_value + tilde_annotation;
- }
-
public static void decorate( final Phylogeny[] phylogenies,
final Map<String, Map<String, String>> map,
final boolean picky,
final boolean process_name_intelligently,
final boolean process_similar_to,
final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
+ final boolean trim_after_tilde,
+ final boolean verbose ) throws IllegalArgumentException, NHXFormatException,
PhyloXmlDataFormatException {
for( final Phylogeny phylogenie : phylogenies ) {
PhylogenyDecorator.decorate( phylogenie,
process_name_intelligently,
process_similar_to,
numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- trim_after_tilde );
+ trim_after_tilde,
+ verbose );
}
}
final boolean process_name_intelligently,
final boolean process_similar_to,
final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
+ final boolean trim_after_tilde,
+ final boolean verbose ) throws IllegalArgumentException, NHXFormatException,
PhyloXmlDataFormatException {
for( final Phylogeny phylogenie : phylogenies ) {
PhylogenyDecorator.decorate( phylogenie,
process_name_intelligently,
process_similar_to,
numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- trim_after_tilde );
+ trim_after_tilde,
+ verbose );
}
}
return map;
}
+    /**
+     * Appends tilde_annotation to new_value. When ForesterUtil.isEmpty reports
+     * the annotation as empty, new_value is returned as is.
+     */
+    private final static String addTildeAnnotation( final String tilde_annotation, final String new_value ) {
+        return ForesterUtil.isEmpty( tilde_annotation ) ? new_value : new_value + tilde_annotation;
+    }
+
private static String deleteAtFirstSpace( final String name ) {
final int first_space = name.indexOf( " " );
if ( first_space > 1 ) {
}
+ /**
+ * Looks for a bracketed taxonomy code in new_value and, if one is found,
+ * stores it as the taxonomy code of node.
+ * <p>
+ * The search itself is delegated to the StringBuilder overload, which also
+ * removes the matched code from the working copy of the text.
+ *
+ * @param node the node whose taxonomy receives the code
+ * @param new_value the text to search
+ * @return the trimmed text with the matched code removed, or new_value
+ * unchanged when no code was found
+ * @throws IllegalArgumentException if the taxonomy rejects the code with a
+ * PhyloXmlDataFormatException
+ */
private static String extractBracketedTaxCodes( final PhylogenyNode node, final String new_value ) {
- final Matcher m = ParserUtils.TAXOMONY_CODE_PATTERN_4.matcher( new_value );
- String tc = "?";
- if ( m.find() ) {
- tc = m.group( 1 );
- }
- ForesterUtil.ensurePresenceOfTaxonomy( node );
- try {
- node.getNodeData().getTaxonomy().setTaxonomyCode( tc );
- }
- catch ( final PhyloXmlDataFormatException e ) {
- throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc );
+ // Work on a mutable copy so the overload can cut the code out of the text.
+ final StringBuilder sb = new StringBuilder();
+ sb.append( new_value );
+ final String tc = extractBracketedTaxCodes( sb );
+ if ( !ForesterUtil.isEmpty( tc ) ) {
+ ForesterUtil.ensurePresenceOfTaxonomy( node );
+ try {
+ node.getNodeData().getTaxonomy().setTaxonomyCode( tc );
+ }
+ catch ( final PhyloXmlDataFormatException e ) {
+ throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc );
+ }
+ return sb.toString().trim();
}
- return new_value; //TODO //FIXME
+ return new_value;
}
- private static String extractBracketedTaxCodes6( final PhylogenyNode node, final String new_value ) {
- final Matcher m = ParserUtils.TAXOMONY_CODE_PATTERN_6.matcher( new_value );
- String tc = "?";
+ /**
+ * Finds the first match of ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED in sb,
+ * removes it from sb, and returns capture group 1 of that match.
+ *
+ * @param sb the text to search; modified in place when a match is found
+ * @return capture group 1 of the first match, or null if nothing matched
+ */
+ private static String extractBracketedTaxCodes( final StringBuilder sb ) {
+ final Matcher m = ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED.matcher( sb );
if ( m.find() ) {
- tc = m.group( 1 );
- }
- ForesterUtil.ensurePresenceOfTaxonomy( node );
- try {
- if ( tc.length() == 6 ) {
- final String t = tc.substring( 0, 5 );
- System.out.println( "WARNING: taxonomy code " + tc + " -> " + t );
- tc = t;
- }
- else {
- throw new IllegalArgumentException();
- }
- node.getNodeData().getTaxonomy().setTaxonomyCode( tc );
- }
- catch ( final PhyloXmlDataFormatException e ) {
- throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc );
+ final String tc = m.group( 1 );
+ // Deletes group 1 plus one character on each side.
+ // NOTE(review): presumably those two characters are the enclosing
+ // brackets -- confirm against the pattern, which is defined elsewhere.
+ sb.delete( m.start( 1 ) - 1, m.end( 1 ) + 1 );
+ return tc;
}
- return new_value; //TODO //FIXME
+ return null;
}
- private static String extractIntermediate( final Map<String, String> intermediate_map, final String name ) {
+ private static String extractIntermediate( final Map<String, String> intermediate_map,
+ final String name,
+ final boolean verbose ) {
String new_name = null;
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.print( name + " => " );
}
if ( intermediate_map.containsKey( name ) ) {
else {
throw new IllegalArgumentException( "\"" + name + "\" not found in name secondary map" );
}
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.println( new_name + " " );
}
return new_name;
}
+ /**
+ * Selects which piece of node data a decoration run writes the mapped
+ * value into (see the switch on field in decorate).
+ */
public static enum FIELD {
- NODE_NAME, SEQUENCE_ANNOTATION_DESC, DOMAIN_STRUCTURE, TAXONOMY_CODE, TAXONOMY_SCIENTIFIC_NAME, SEQUENCE_NAME;
+ DOMAIN_STRUCTURE,
+ // Stored via the node sequence's setMolecularSequence.
+ MOL_SEQ,
+ NODE_NAME,
+ SEQUENCE_ANNOTATION_DESC,
+ SEQUENCE_NAME,
+ TAXONOMY_CODE,
+ TAXONOMY_SCIENTIFIC_NAME;
}
}