X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Futil%2FForesterUtil.java;h=eb1ea8df1f65a535042d5b8d53315025fdae15ef;hb=a1114eb8610e592961a40e5c3d46d647c02b5108;hp=47e6aa74b4161813143732292d2d463a9779b8f8;hpb=663daba455e534e015bd56bae070e9248e3a4533;p=jalview.git diff --git a/forester/java/src/org/forester/util/ForesterUtil.java b/forester/java/src/org/forester/util/ForesterUtil.java index 47e6aa7..eb1ea8d 100644 --- a/forester/java/src/org/forester/util/ForesterUtil.java +++ b/forester/java/src/org/forester/util/ForesterUtil.java @@ -50,7 +50,6 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collection; import java.util.Date; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -67,6 +66,10 @@ import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Distribution; import org.forester.phylogeny.data.Sequence; import org.forester.phylogeny.data.Taxonomy; +import org.forester.protein.BasicProtein; +import org.forester.protein.Domain; +import org.forester.protein.Protein; +import org.forester.surfacing.SurfacingUtil; public final class ForesterUtil { @@ -87,10 +90,6 @@ public final class ForesterUtil { public static final String NCBI_PROTEIN = "http://www.ncbi.nlm.nih.gov/protein/"; public static final String NCBI_NUCCORE = "http://www.ncbi.nlm.nih.gov/nuccore/"; public final static String UNIPROT_KB = "http://www.uniprot.org/uniprot/"; - public final static Pattern UNIPROT_KB_PATTERN_1 = Pattern - .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" ); - public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern - .compile( "\\b(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)\\b" ); public static final String NCBI_GI = "http://www.ncbi.nlm.nih.gov/protein/gi:"; static { final DecimalFormatSymbols dfs = new DecimalFormatSymbols(); @@ -105,128 +104,86 @@ public final class ForesterUtil { private ForesterUtil() { } - public static String extractRefSeqAccessorAccessor( final PhylogenyNode node ) { - String v = null; - if ( node.getNodeData().isHasSequence() ) { - final Sequence seq = node.getNodeData().getSequence(); - if ( !isEmpty( seq.getSymbol() ) ) { - v = SequenceIdParser.parseRefSeqAccessor( seq.getSymbol() ); - } - if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) { - v = SequenceIdParser.parseRefSeqAccessor( seq.getName() ); - } - if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) - && !isEmpty( seq.getAccession().getValue() ) ) { - v = SequenceIdParser.parseRefSeqAccessor( seq.getAccession().getValue() ); + public static int calculateOverlap( final Domain domain, final List covered_positions ) { + int overlap_count = 0; + for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) { + if ( ( i < covered_positions.size() ) && ( covered_positions.get( i ) == true ) ) { + ++overlap_count; } } - if ( isEmpty( v ) && !isEmpty( node.getName() ) ) { - v = SequenceIdParser.parseRefSeqAccessor( node.getName() ); - } - return v; + return overlap_count; } - public static String extractGenbankAccessor( final PhylogenyNode node ) { - String v = null; - if ( node.getNodeData().isHasSequence() ) { - final Sequence seq = node.getNodeData().getSequence(); - if ( !isEmpty( seq.getSymbol() ) ) { - v = SequenceIdParser.parseGenbankAccessor( seq.getSymbol() ); - } - if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) { - v = SequenceIdParser.parseGenbankAccessor( seq.getName() ); - } - if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) - && !isEmpty( seq.getAccession().getValue() ) ) { - v = SequenceIdParser.parseGenbankAccessor( seq.getAccession().getValue() ); - } - } - if ( isEmpty( v ) && !isEmpty( node.getName() ) ) { - v = SequenceIdParser.parseGenbankAccessor( node.getName() ); - } - return v; - } - - public static String extractGInumber( final PhylogenyNode node ) { - String v = null; - if ( node.getNodeData().isHasSequence() ) { - final Sequence seq = node.getNodeData().getSequence(); - if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) { - v = SequenceIdParser.parseGInumber( seq.getName() ); - } - if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) - && !isEmpty( seq.getAccession().getValue() ) ) { - v = SequenceIdParser.parseGInumber( seq.getAccession().getValue() ); - } - } - if ( isEmpty( v ) && !isEmpty( node.getName() ) ) { - v = SequenceIdParser.parseGInumber( node.getName() ); + final public static void appendSeparatorIfNotEmpty( final StringBuffer sb, final char separator ) { + if ( sb.length() > 0 ) { + sb.append( separator ); } - return v; } - public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) { - String upkb = null; - if ( node.getNodeData().isHasSequence() ) { - final Sequence seq = node.getNodeData().getSequence(); - Matcher m; - if ( !isEmpty( seq.getSymbol() ) ) { - m = UNIPROT_KB_PATTERN_1.matcher( seq.getSymbol() ); - if ( m.find() ) { - upkb = m.group( 1 ); - } - else { - m = UNIPROT_KB_PATTERN_2.matcher( seq.getSymbol() ); - if ( m.find() ) { - upkb = m.group(); - } - } - } - if ( isEmpty( upkb ) && !isEmpty( seq.getName() ) ) { - m = UNIPROT_KB_PATTERN_1.matcher( seq.getName() ); - if ( m.find() ) { - upkb = m.group( 1 ); + /** + * + * Example regarding engulfment: ------------0.1 ----------0.2 --0.3 => + * domain with 0.3 is ignored + * + * -----------0.1 ----------0.2 --0.3 => domain with 0.3 is ignored + * + * + * ------------0.1 ----------0.3 --0.2 => domains with 0.3 and 0.2 are _not_ + * ignored + * + * @param max_allowed_overlap + * maximal allowed overlap (inclusive) to be still considered not + * overlapping (zero or negative value to allow any overlap) + * @param remove_engulfed_domains + * to remove domains which are completely engulfed by coverage of + * domains with better support + * @param protein + * @return + */ + public static Protein removeOverlappingDomains( final int max_allowed_overlap, + final boolean remove_engulfed_domains, + final Protein protein ) { + final Protein pruned_protein = new BasicProtein( protein.getProteinId().getId(), protein.getSpecies() + .getSpeciesId(), protein.getLength() ); + final List sorted = SurfacingUtil.sortDomainsWithAscendingConfidenceValues( protein ); + final List covered_positions = new ArrayList(); + for( final Domain domain : sorted ) { + if ( ( ( max_allowed_overlap < 0 ) || ( ForesterUtil.calculateOverlap( domain, covered_positions ) <= max_allowed_overlap ) ) + && ( !remove_engulfed_domains || !isEngulfed( domain, covered_positions ) ) ) { + final int covered_positions_size = covered_positions.size(); + for( int i = covered_positions_size; i < domain.getFrom(); ++i ) { + covered_positions.add( false ); } - else { - m = UNIPROT_KB_PATTERN_2.matcher( seq.getName() ); - if ( m.find() ) { - upkb = m.group(); + final int new_covered_positions_size = covered_positions.size(); + for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) { + if ( i < new_covered_positions_size ) { + covered_positions.set( i, true ); } - } - } - if ( isEmpty( upkb ) && ( node.getNodeData().getSequence().getAccession() != null ) - && !isEmpty( seq.getAccession().getValue() ) ) { - m = UNIPROT_KB_PATTERN_1.matcher( seq.getAccession().getValue() ); - if ( m.find() ) { - upkb = m.group( 1 ); - } - else { - m = UNIPROT_KB_PATTERN_2.matcher( seq.getAccession().getValue() ); - if ( m.find() ) { - upkb = m.group(); + else { + covered_positions.add( true ); } } + pruned_protein.addProteinDomain( domain ); } } - if ( isEmpty( upkb ) && !isEmpty( node.getName() ) ) { - final Matcher m1 = UNIPROT_KB_PATTERN_1.matcher( node.getName() ); - if ( m1.find() ) { - upkb = m1.group( 1 ); - } - else { - final Matcher m2 = UNIPROT_KB_PATTERN_2.matcher( node.getName() ); - if ( m2.find() ) { - upkb = m2.group(); - } - } - } - return upkb; + return pruned_protein; } - final public static void appendSeparatorIfNotEmpty( final StringBuffer sb, final char separator ) { - if ( sb.length() > 0 ) { - sb.append( separator ); + /** + * Returns true is Domain domain falls in an uninterrupted stretch of + * covered positions. + * + * @param domain + * @param covered_positions + * @return + */ + public static boolean isEngulfed( final Domain domain, final List covered_positions ) { + for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) { + if ( ( i >= covered_positions.size() ) || ( covered_positions.get( i ) != true ) ) { + return false; + } } + return true; } /** @@ -757,7 +714,7 @@ public final class ForesterUtil { return i; } - final public static SortedMap listToSortedCountsMap( final List list ) { + final public static SortedMap listToSortedCountsMap( final List list ) { final SortedMap map = new TreeMap(); for( final Object key : list ) { if ( !map.containsKey( key ) ) { @@ -797,10 +754,9 @@ public final class ForesterUtil { } } - final public static StringBuffer mapToStringBuffer( final Map map, final String key_value_separator ) { + final public static StringBuffer mapToStringBuffer( final Map map, final String key_value_separator ) { final StringBuffer sb = new StringBuffer(); - for( final Iterator iter = map.keySet().iterator(); iter.hasNext(); ) { - final Object key = iter.next(); + for( final Object key : map.keySet() ) { sb.append( key.toString() ); sb.append( key_value_separator ); sb.append( map.get( key ).toString() ); @@ -1247,4 +1203,161 @@ public final class ForesterUtil { System.err.println(); System.exit( -1 ); } + + public final static Color obtainColorDependingOnTaxonomyGroup( final String tax_group ) { + if ( !ForesterUtil.isEmpty( tax_group ) ) { + if ( tax_group.equals( "deuterostomia" ) ) { + return TaxonomyColors.DEUTEROSTOMIA_COLOR; + } + else if ( tax_group.equals( "protostomia" ) ) { + return TaxonomyColors.PROTOSTOMIA_COLOR; + } + else if ( tax_group.equals( "cnidaria" ) ) { + return TaxonomyColors.CNIDARIA_COLOR; + } + else if ( tax_group.equals( "placozoa" ) ) { + return TaxonomyColors.PLACOZOA_COLOR; + } + else if ( tax_group.equals( "ctenophora" ) ) { + return TaxonomyColors.CTENOPHORA_COLOR; + } + else if ( tax_group.equals( "porifera" ) ) { + return TaxonomyColors.PORIFERA_COLOR; + } + else if ( tax_group.equals( "choanoflagellida" ) ) { + return TaxonomyColors.CHOANOFLAGELLIDA; + } + else if ( tax_group.equals( "ichthyophonida & filasterea" ) ) { + return TaxonomyColors.ICHTHYOSPOREA_AND_FILASTEREA; + } + else if ( tax_group.equals( "dikarya" ) ) { + return TaxonomyColors.DIKARYA_COLOR; + } + else if ( tax_group.equalsIgnoreCase( "fungi" ) || tax_group.equalsIgnoreCase( "other fungi" ) ) { + return TaxonomyColors.OTHER_FUNGI_COLOR; + } + else if ( tax_group.equals( "nucleariidae and fonticula" ) ) { + return TaxonomyColors.NUCLEARIIDAE_AND_FONTICULA_GROUP_COLOR; + } + else if ( tax_group.equals( "amoebozoa" ) ) { + return TaxonomyColors.AMOEBOZOA_COLOR; + } + else if ( tax_group.equals( "embryophyta" ) ) { + return TaxonomyColors.EMBRYOPHYTA_COLOR; + } + else if ( tax_group.equals( "chlorophyta" ) ) { + return TaxonomyColors.CHLOROPHYTA_COLOR; + } + else if ( tax_group.equals( "rhodophyta" ) ) { + return TaxonomyColors.RHODOPHYTA_COLOR; + } + else if ( tax_group.equals( "hacrobia" ) ) { + return TaxonomyColors.HACROBIA_COLOR; + } + else if ( tax_group.equals( "glaucocystophyceae" ) ) { + return TaxonomyColors.GLAUCOPHYTA_COLOR; + } + else if ( tax_group.equals( "stramenopiles" ) ) { + return TaxonomyColors.STRAMENOPILES_COLOR; + } + else if ( tax_group.equals( "alveolata" ) ) { + return TaxonomyColors.ALVEOLATA_COLOR; + } + else if ( tax_group.equals( "rhizaria" ) ) { + return TaxonomyColors.RHIZARIA_COLOR; + } + else if ( tax_group.equals( "excavata" ) ) { + return TaxonomyColors.EXCAVATA_COLOR; + } + else if ( tax_group.equals( "apusozoa" ) ) { + return TaxonomyColors.APUSOZOA_COLOR; + } + else if ( tax_group.equals( "archaea" ) ) { + return TaxonomyColors.ARCHAEA_COLOR; + } + else if ( tax_group.equals( "bacteria" ) ) { + return TaxonomyColors.BACTERIA_COLOR; + } + } + return null; + } + + public final static String obtainNormalizedTaxonomyGroup( final String tax ) { + if ( tax.equalsIgnoreCase( "deuterostomia" ) ) { + return "deuterostomia"; + } + else if ( tax.equalsIgnoreCase( "protostomia" ) ) { + return "protostomia"; + } + else if ( tax.equalsIgnoreCase( "cnidaria" ) ) { + return "cnidaria"; + } + else if ( tax.toLowerCase().startsWith( "trichoplax" ) || tax.equalsIgnoreCase( "placozoa" ) ) { + return "placozoa"; + } + else if ( tax.toLowerCase().startsWith( "mnemiopsis" ) || tax.equalsIgnoreCase( "ctenophora" ) ) { + return "ctenophora"; + } + else if ( tax.toLowerCase().startsWith( "amphimedon" ) || tax.equalsIgnoreCase( "porifera" ) ) { + return "porifera"; + } + else if ( tax.equalsIgnoreCase( "codonosigidae" ) || tax.equalsIgnoreCase( "choanoflagellida" ) ) { + return "choanoflagellida"; + } + else if ( tax.toLowerCase().startsWith( "ichthyophonida & filasterea" ) + || tax.toLowerCase().startsWith( "ichthyophonida and filasterea" ) + || tax.toLowerCase().startsWith( "ichthyosporea & filasterea" ) + || tax.toLowerCase().startsWith( "ichthyosporea and filasterea" ) ) { + return "ichthyophonida & filasterea"; + } + else if ( tax.equalsIgnoreCase( "dikarya" ) ) { + return "dikarya"; + } + else if ( tax.equalsIgnoreCase( "other fungi" ) ) { + return "other fungi"; + } + else if ( tax.toLowerCase().startsWith( "nucleariidae and fonticula" ) ) { + return "nucleariidae and fonticula group"; + } + else if ( tax.equalsIgnoreCase( "amoebozoa" ) ) { + return "amoebozoa"; + } + else if ( tax.equalsIgnoreCase( "embryophyta" ) ) { + return "embryophyta"; + } + else if ( tax.equalsIgnoreCase( "chlorophyta" ) ) { + return "chlorophyta"; + } + else if ( tax.equalsIgnoreCase( "rhodophyta" ) ) { + return "rhodophyta"; + } + else if ( tax.toLowerCase().startsWith( "hacrobia" ) ) { + return "hacrobia"; + } + else if ( tax.equalsIgnoreCase( "glaucocystophyceae" ) || tax.equalsIgnoreCase( "glaucophyta" ) ) { + return "glaucocystophyceae"; + } + else if ( tax.equalsIgnoreCase( "stramenopiles" ) ) { + return "stramenopiles"; + } + else if ( tax.equalsIgnoreCase( "alveolata" ) ) { + return "alveolata"; + } + else if ( tax.equalsIgnoreCase( "rhizaria" ) ) { + return "rhizaria"; + } + else if ( tax.equalsIgnoreCase( "excavata" ) ) { + return "excavata"; + } + else if ( tax.equalsIgnoreCase( "apusozoa" ) ) { + return "apusozoa"; + } + else if ( tax.equalsIgnoreCase( "archaea" ) ) { + return "archaea"; + } + else if ( tax.equalsIgnoreCase( "bacteria" ) ) { + return "bacteria"; + } + return null; + } }