in progress
[jalview.git] / forester / java / src / org / forester / util / ForesterUtil.java
index 47e6aa7..eb1ea8d 100644 (file)
@@ -50,7 +50,6 @@ import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Date;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
@@ -67,6 +66,10 @@ import org.forester.phylogeny.PhylogenyNode;
 import org.forester.phylogeny.data.Distribution;
 import org.forester.phylogeny.data.Sequence;
 import org.forester.phylogeny.data.Taxonomy;
+import org.forester.protein.BasicProtein;
+import org.forester.protein.Domain;
+import org.forester.protein.Protein;
+import org.forester.surfacing.SurfacingUtil;
 
 public final class ForesterUtil {
 
@@ -87,10 +90,6 @@ public final class ForesterUtil {
     public static final String       NCBI_PROTEIN                     = "http://www.ncbi.nlm.nih.gov/protein/";
     public static final String       NCBI_NUCCORE                     = "http://www.ncbi.nlm.nih.gov/nuccore/";
     public final static String       UNIPROT_KB                       = "http://www.uniprot.org/uniprot/";
-    public final static Pattern      UNIPROT_KB_PATTERN_1             = Pattern
-                                                                              .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" );
-    public final static Pattern      UNIPROT_KB_PATTERN_2             = Pattern
-                                                                              .compile( "\\b(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)\\b" );
     public static final String       NCBI_GI                          = "http://www.ncbi.nlm.nih.gov/protein/gi:";
     static {
         final DecimalFormatSymbols dfs = new DecimalFormatSymbols();
@@ -105,128 +104,86 @@ public final class ForesterUtil {
     private ForesterUtil() {
     }
 
-    public static String extractRefSeqAccessorAccessor( final PhylogenyNode node ) {
-        String v = null;
-        if ( node.getNodeData().isHasSequence() ) {
-            final Sequence seq = node.getNodeData().getSequence();
-            if ( !isEmpty( seq.getSymbol() ) ) {
-                v = SequenceIdParser.parseRefSeqAccessor( seq.getSymbol() );
-            }
-            if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) {
-                v = SequenceIdParser.parseRefSeqAccessor( seq.getName() );
-            }
-            if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
-                    && !isEmpty( seq.getAccession().getValue() ) ) {
-                v = SequenceIdParser.parseRefSeqAccessor( seq.getAccession().getValue() );
+    public static int calculateOverlap( final Domain domain, final List<Boolean> covered_positions ) {
+        int overlap_count = 0;
+        for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) {
+            if ( ( i < covered_positions.size() ) && ( covered_positions.get( i ) == true ) ) {
+                ++overlap_count;
             }
         }
-        if ( isEmpty( v ) && !isEmpty( node.getName() ) ) {
-            v = SequenceIdParser.parseRefSeqAccessor( node.getName() );
-        }
-        return v;
+        return overlap_count;
     }
 
-    public static String extractGenbankAccessor( final PhylogenyNode node ) {
-        String v = null;
-        if ( node.getNodeData().isHasSequence() ) {
-            final Sequence seq = node.getNodeData().getSequence();
-            if ( !isEmpty( seq.getSymbol() ) ) {
-                v = SequenceIdParser.parseGenbankAccessor( seq.getSymbol() );
-            }
-            if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) {
-                v = SequenceIdParser.parseGenbankAccessor( seq.getName() );
-            }
-            if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
-                    && !isEmpty( seq.getAccession().getValue() ) ) {
-                v = SequenceIdParser.parseGenbankAccessor( seq.getAccession().getValue() );
-            }
-        }
-        if ( isEmpty( v ) && !isEmpty( node.getName() ) ) {
-            v = SequenceIdParser.parseGenbankAccessor( node.getName() );
-        }
-        return v;
-    }
-
-    public static String extractGInumber( final PhylogenyNode node ) {
-        String v = null;
-        if ( node.getNodeData().isHasSequence() ) {
-            final Sequence seq = node.getNodeData().getSequence();
-            if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) {
-                v = SequenceIdParser.parseGInumber( seq.getName() );
-            }
-            if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
-                    && !isEmpty( seq.getAccession().getValue() ) ) {
-                v = SequenceIdParser.parseGInumber( seq.getAccession().getValue() );
-            }
-        }
-        if ( isEmpty( v ) && !isEmpty( node.getName() ) ) {
-            v = SequenceIdParser.parseGInumber( node.getName() );
+    final public static void appendSeparatorIfNotEmpty( final StringBuffer sb, final char separator ) {
+        if ( sb.length() > 0 ) {
+            sb.append( separator );
         }
-        return v;
     }
 
-    public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) {
-        String upkb = null;
-        if ( node.getNodeData().isHasSequence() ) {
-            final Sequence seq = node.getNodeData().getSequence();
-            Matcher m;
-            if ( !isEmpty( seq.getSymbol() ) ) {
-                m = UNIPROT_KB_PATTERN_1.matcher( seq.getSymbol() );
-                if ( m.find() ) {
-                    upkb = m.group( 1 );
-                }
-                else {
-                    m = UNIPROT_KB_PATTERN_2.matcher( seq.getSymbol() );
-                    if ( m.find() ) {
-                        upkb = m.group();
-                    }
-                }
-            }
-            if ( isEmpty( upkb ) && !isEmpty( seq.getName() ) ) {
-                m = UNIPROT_KB_PATTERN_1.matcher( seq.getName() );
-                if ( m.find() ) {
-                    upkb = m.group( 1 );
+    /**
+     * 
+     * Example regarding engulfment: ------------0.1 ----------0.2 --0.3 =>
+     * domain with 0.3 is ignored
+     * 
+     * -----------0.1 ----------0.2 --0.3 => domain with 0.3 is ignored
+     * 
+     * 
+     * ------------0.1 ----------0.3 --0.2 => domains with 0.3 and 0.2 are _not_
+     * ignored
+     * 
+     * @param max_allowed_overlap
+     *            maximal allowed overlap (inclusive) to be still considered not
+     *            overlapping (a negative value allows any overlap)
+     * @param remove_engulfed_domains
+     *            to remove domains which are completely engulfed by coverage of
+     *            domains with better support
+     * @param protein
+     * @return a new protein holding only the domains that were kept
+     */
+    public static Protein removeOverlappingDomains( final int max_allowed_overlap,
+                                                    final boolean remove_engulfed_domains,
+                                                    final Protein protein ) {
+        final Protein pruned_protein = new BasicProtein( protein.getProteinId().getId(), protein.getSpecies()
+                .getSpeciesId(), protein.getLength() );
+        final List<Domain> sorted = SurfacingUtil.sortDomainsWithAscendingConfidenceValues( protein );
+        final List<Boolean> covered_positions = new ArrayList<Boolean>();
+        for( final Domain domain : sorted ) {
+            if ( ( ( max_allowed_overlap < 0 ) || ( ForesterUtil.calculateOverlap( domain, covered_positions ) <= max_allowed_overlap ) )
+                    && ( !remove_engulfed_domains || !isEngulfed( domain, covered_positions ) ) ) {
+                final int covered_positions_size = covered_positions.size();
+                for( int i = covered_positions_size; i < domain.getFrom(); ++i ) {
+                    covered_positions.add( false );
                 }
-                else {
-                    m = UNIPROT_KB_PATTERN_2.matcher( seq.getName() );
-                    if ( m.find() ) {
-                        upkb = m.group();
+                final int new_covered_positions_size = covered_positions.size();
+                for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) {
+                    if ( i < new_covered_positions_size ) {
+                        covered_positions.set( i, true );
                     }
-                }
-            }
-            if ( isEmpty( upkb ) && ( node.getNodeData().getSequence().getAccession() != null )
-                    && !isEmpty( seq.getAccession().getValue() ) ) {
-                m = UNIPROT_KB_PATTERN_1.matcher( seq.getAccession().getValue() );
-                if ( m.find() ) {
-                    upkb = m.group( 1 );
-                }
-                else {
-                    m = UNIPROT_KB_PATTERN_2.matcher( seq.getAccession().getValue() );
-                    if ( m.find() ) {
-                        upkb = m.group();
+                    else {
+                        covered_positions.add( true );
                     }
                 }
+                pruned_protein.addProteinDomain( domain );
             }
         }
-        if ( isEmpty( upkb ) && !isEmpty( node.getName() ) ) {
-            final Matcher m1 = UNIPROT_KB_PATTERN_1.matcher( node.getName() );
-            if ( m1.find() ) {
-                upkb = m1.group( 1 );
-            }
-            else {
-                final Matcher m2 = UNIPROT_KB_PATTERN_2.matcher( node.getName() );
-                if ( m2.find() ) {
-                    upkb = m2.group();
-                }
-            }
-        }
-        return upkb;
+        return pruned_protein;
     }
 
-    final public static void appendSeparatorIfNotEmpty( final StringBuffer sb, final char separator ) {
-        if ( sb.length() > 0 ) {
-            sb.append( separator );
+    /**
+     * Returns true if Domain domain falls in an uninterrupted stretch of
+     * covered positions.
+     * 
+     * @param domain
+     * @param covered_positions
+     * @return true if every position of the domain is already covered
+     */
+    public static boolean isEngulfed( final Domain domain, final List<Boolean> covered_positions ) {
+        for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) {
+            if ( ( i >= covered_positions.size() ) || ( covered_positions.get( i ) != true ) ) {
+                return false;
+            }
         }
+        return true;
     }
 
     /**
@@ -757,7 +714,7 @@ public final class ForesterUtil {
         return i;
     }
 
-    final public static SortedMap<Object, Integer> listToSortedCountsMap( final List list ) {
+    final public static SortedMap<Object, Integer> listToSortedCountsMap( final List<?> list ) {
         final SortedMap<Object, Integer> map = new TreeMap<Object, Integer>();
         for( final Object key : list ) {
             if ( !map.containsKey( key ) ) {
@@ -797,10 +754,9 @@ public final class ForesterUtil {
         }
     }
 
-    final public static StringBuffer mapToStringBuffer( final Map map, final String key_value_separator ) {
+    final public static StringBuffer mapToStringBuffer( final Map<Object, Object> map, final String key_value_separator ) {
         final StringBuffer sb = new StringBuffer();
-        for( final Iterator iter = map.keySet().iterator(); iter.hasNext(); ) {
-            final Object key = iter.next();
+        for( final Object key : map.keySet() ) {
             sb.append( key.toString() );
             sb.append( key_value_separator );
             sb.append( map.get( key ).toString() );
@@ -1247,4 +1203,161 @@ public final class ForesterUtil {
         System.err.println();
         System.exit( -1 );
     }
+
+    public final static Color obtainColorDependingOnTaxonomyGroup( final String tax_group ) { // color for a taxonomy group name; null if none applies
+        if ( !ForesterUtil.isEmpty( tax_group ) ) {
+            if ( tax_group.equals( "deuterostomia" ) ) {
+                return TaxonomyColors.DEUTEROSTOMIA_COLOR;
+            }
+            else if ( tax_group.equals( "protostomia" ) ) {
+                return TaxonomyColors.PROTOSTOMIA_COLOR;
+            }
+            else if ( tax_group.equals( "cnidaria" ) ) {
+                return TaxonomyColors.CNIDARIA_COLOR;
+            }
+            else if ( tax_group.equals( "placozoa" ) ) {
+                return TaxonomyColors.PLACOZOA_COLOR;
+            }
+            else if ( tax_group.equals( "ctenophora" ) ) {
+                return TaxonomyColors.CTENOPHORA_COLOR;
+            }
+            else if ( tax_group.equals( "porifera" ) ) {
+                return TaxonomyColors.PORIFERA_COLOR;
+            }
+            else if ( tax_group.equals( "choanoflagellida" ) ) {
+                return TaxonomyColors.CHOANOFLAGELLIDA;
+            }
+            else if ( tax_group.equals( "ichthyophonida & filasterea" ) ) {
+                return TaxonomyColors.ICHTHYOSPOREA_AND_FILASTEREA;
+            }
+            else if ( tax_group.equals( "dikarya" ) ) {
+                return TaxonomyColors.DIKARYA_COLOR;
+            }
+            else if ( tax_group.equalsIgnoreCase( "fungi" ) || tax_group.equalsIgnoreCase( "other fungi" ) ) {
+                return TaxonomyColors.OTHER_FUNGI_COLOR;
+            }
+            else if ( tax_group.equals( "nucleariidae and fonticula" ) || tax_group.equals( "nucleariidae and fonticula group" ) ) {
+                return TaxonomyColors.NUCLEARIIDAE_AND_FONTICULA_GROUP_COLOR; // "... group" is the spelling obtainNormalizedTaxonomyGroup emits
+            }
+            else if ( tax_group.equals( "amoebozoa" ) ) {
+                return TaxonomyColors.AMOEBOZOA_COLOR;
+            }
+            else if ( tax_group.equals( "embryophyta" ) ) {
+                return TaxonomyColors.EMBRYOPHYTA_COLOR;
+            }
+            else if ( tax_group.equals( "chlorophyta" ) ) {
+                return TaxonomyColors.CHLOROPHYTA_COLOR;
+            }
+            else if ( tax_group.equals( "rhodophyta" ) ) {
+                return TaxonomyColors.RHODOPHYTA_COLOR;
+            }
+            else if ( tax_group.equals( "hacrobia" ) ) {
+                return TaxonomyColors.HACROBIA_COLOR;
+            }
+            else if ( tax_group.equals( "glaucocystophyceae" ) ) {
+                return TaxonomyColors.GLAUCOPHYTA_COLOR;
+            }
+            else if ( tax_group.equals( "stramenopiles" ) ) {
+                return TaxonomyColors.STRAMENOPILES_COLOR;
+            }
+            else if ( tax_group.equals( "alveolata" ) ) {
+                return TaxonomyColors.ALVEOLATA_COLOR;
+            }
+            else if ( tax_group.equals( "rhizaria" ) ) {
+                return TaxonomyColors.RHIZARIA_COLOR;
+            }
+            else if ( tax_group.equals( "excavata" ) ) {
+                return TaxonomyColors.EXCAVATA_COLOR;
+            }
+            else if ( tax_group.equals( "apusozoa" ) ) {
+                return TaxonomyColors.APUSOZOA_COLOR;
+            }
+            else if ( tax_group.equals( "archaea" ) ) {
+                return TaxonomyColors.ARCHAEA_COLOR;
+            }
+            else if ( tax_group.equals( "bacteria" ) ) {
+                return TaxonomyColors.BACTERIA_COLOR;
+            }
+        }
+        return null; // isEmpty( tax_group ) was true, or the name matched no group above
+    }
+
+    public final static String obtainNormalizedTaxonomyGroup( final String tax ) { // canonical lower-case group name; null if tax is null or unrecognized
+        if ( tax != null ) { // guard: a null name used to raise a NullPointerException on the first comparison
+            if ( tax.equalsIgnoreCase( "deuterostomia" ) ) {
+                return "deuterostomia";
+            }
+            else if ( tax.equalsIgnoreCase( "protostomia" ) ) {
+                return "protostomia";
+            }
+            else if ( tax.equalsIgnoreCase( "cnidaria" ) ) {
+                return "cnidaria";
+            }
+            else if ( tax.toLowerCase().startsWith( "trichoplax" ) || tax.equalsIgnoreCase( "placozoa" ) ) {
+                return "placozoa";
+            }
+            else if ( tax.toLowerCase().startsWith( "mnemiopsis" ) || tax.equalsIgnoreCase( "ctenophora" ) ) {
+                return "ctenophora";
+            }
+            else if ( tax.toLowerCase().startsWith( "amphimedon" ) || tax.equalsIgnoreCase( "porifera" ) ) {
+                return "porifera";
+            }
+            else if ( tax.equalsIgnoreCase( "codonosigidae" ) || tax.equalsIgnoreCase( "choanoflagellida" ) ) {
+                return "choanoflagellida";
+            }
+            else if ( tax.toLowerCase().startsWith( "ichthyophonida & filasterea" ) || tax.toLowerCase().startsWith( "ichthyophonida and filasterea" )
+                    || tax.toLowerCase().startsWith( "ichthyosporea & filasterea" ) || tax.toLowerCase().startsWith( "ichthyosporea and filasterea" ) ) {
+                return "ichthyophonida & filasterea";
+            }
+            else if ( tax.equalsIgnoreCase( "dikarya" ) ) {
+                return "dikarya";
+            }
+            else if ( tax.equalsIgnoreCase( "other fungi" ) ) {
+                return "other fungi";
+            }
+            else if ( tax.toLowerCase().startsWith( "nucleariidae and fonticula" ) ) {
+                return "nucleariidae and fonticula group";
+            }
+            else if ( tax.equalsIgnoreCase( "amoebozoa" ) ) {
+                return "amoebozoa";
+            }
+            else if ( tax.equalsIgnoreCase( "embryophyta" ) ) {
+                return "embryophyta";
+            }
+            else if ( tax.equalsIgnoreCase( "chlorophyta" ) ) {
+                return "chlorophyta";
+            }
+            else if ( tax.equalsIgnoreCase( "rhodophyta" ) ) {
+                return "rhodophyta";
+            }
+            else if ( tax.toLowerCase().startsWith( "hacrobia" ) ) {
+                return "hacrobia";
+            }
+            else if ( tax.equalsIgnoreCase( "glaucocystophyceae" ) || tax.equalsIgnoreCase( "glaucophyta" ) ) {
+                return "glaucocystophyceae";
+            }
+            else if ( tax.equalsIgnoreCase( "stramenopiles" ) ) {
+                return "stramenopiles";
+            }
+            else if ( tax.equalsIgnoreCase( "alveolata" ) ) {
+                return "alveolata";
+            }
+            else if ( tax.equalsIgnoreCase( "rhizaria" ) ) {
+                return "rhizaria";
+            }
+            else if ( tax.equalsIgnoreCase( "excavata" ) ) {
+                return "excavata";
+            }
+            else if ( tax.equalsIgnoreCase( "apusozoa" ) ) {
+                return "apusozoa";
+            }
+            else if ( tax.equalsIgnoreCase( "archaea" ) ) {
+                return "archaea";
+            }
+            else if ( tax.equalsIgnoreCase( "bacteria" ) ) {
+                return "bacteria";
+            }
+        }
+        return null;
+    }
 }