From 9d627b6d40d2f38258ffe2cdebd82a2e8c1b9ea6 Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Thu, 9 Aug 2012 12:45:47 +0000 Subject: [PATCH] in progress --- forester/java/src/org/forester/application/aa.java | 16 +++- .../java/src/org/forester/application/aaa.java | 101 ++++++++++++++++++++ 2 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 forester/java/src/org/forester/application/aaa.java diff --git a/forester/java/src/org/forester/application/aa.java b/forester/java/src/org/forester/application/aa.java index 1ae5dfb..899120c 100644 --- a/forester/java/src/org/forester/application/aa.java +++ b/forester/java/src/org/forester/application/aa.java @@ -3,6 +3,7 @@ package org.forester.application; import java.io.FileInputStream; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -10,6 +11,7 @@ import java.util.TreeSet; import org.forester.io.parsers.FastaParser; import org.forester.msa.Msa; +import org.forester.sequence.BasicSequence; import org.forester.sequence.Sequence; import org.forester.util.ForesterUtil; @@ -47,7 +49,8 @@ public class aa { if ( found > 0 ) { for( final Sequence found_seq : found_seqs ) { if ( found_seq.getLength() >= 85 ) { - all_found_seqs.add( found_seq ); + + all_found_seqs.add( BasicSequence.createAaSequence( id, found_seq.getMolecularSequenceAsString() ) ); } } if ( found > 1 ) { @@ -63,10 +66,19 @@ public class aa { System.exit( -1 ); } } + String fasta_ary[] = new String[ all_found_seqs.size() ]; int i = 0; for( Sequence sequence : all_found_seqs ) { - System.out.println( i++ + ": " + sequence ); + fasta_ary[ i ] = ">" + sequence.getIdentifier() + "\n" + sequence.getMolecularSequenceAsString(); + System.out.println( sequence ); + i++; } + Arrays.sort( fasta_ary ); + + for( int j = 0; j < fasta_ary.length; ++j ) { + System.out.println( fasta_ary[ j ] ); + } + System.out.println( "DONE." ); } catch ( final Exception e ) { diff --git a/forester/java/src/org/forester/application/aaa.java b/forester/java/src/org/forester/application/aaa.java new file mode 100644 index 0000000..b42ba7b --- /dev/null +++ b/forester/java/src/org/forester/application/aaa.java @@ -0,0 +1,101 @@ + +package org.forester.application; + +import java.io.FileInputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.forester.io.parsers.FastaParser; +import org.forester.sequence.Sequence; +import org.forester.util.ForesterUtil; + +public class aaa { + + public final static Pattern GN_PATTERN = Pattern.compile( "GN=(\\S+)\\s" ); //use w+ instead of S+ for more stringent setting. + public final static Pattern RANGE_PATTERN = Pattern.compile( "\\[(\\d+-\\d+)\\]" ); //use w+ instead of S+ for more stringent setting. + + public static void main( final String args[] ) { + try { + System.out.println( "STARTING..." ); + final List orig = FastaParser + .parse( new FileInputStream( "C:\\Users\\zma\\Desktop\\RRMa_domains_ext_20_2.fasta" ) ); + final List new_seqs = new ArrayList(); + for( final Sequence seq : orig ) { + final Matcher matcher = GN_PATTERN.matcher( seq.getIdentifier() ); + String gn = ""; + if ( matcher.find() ) { + gn = matcher.group( 1 ); + } + else { + System.out.println( "ERROR: no gene for: " + seq.getIdentifier() ); + System.exit( -1 ); + } + new_seqs.add( ">" + gn + "|" + seq.getIdentifier() + "\n" + seq.getMolecularSequenceAsString() ); + } + final Set gn_ra_set = new HashSet(); + final Set mol_seq_set = new HashSet(); + Collections.sort( new_seqs ); + int unique_counter = 0; + int duplicate_counter_gn_ra = 0; + int duplicate_counter_mol_seq = 0; + String prev_gn = "____"; + for( final String seq : new_seqs ) { + final Matcher matcher_ra = RANGE_PATTERN.matcher( seq ); + final Matcher matcher_gn = GN_PATTERN.matcher( seq ); + String range = ""; + if ( matcher_ra.find() ) { + range = matcher_ra.group( 1 ); + } + else { + System.out.println( "ERROR: no range for: " + seq ); + System.exit( -1 ); + } + matcher_gn.find(); + final String gn = matcher_gn.group( 1 ); + final String gn_ra = gn + "_" + range; + if ( !gn_ra_set.contains( gn_ra ) ) { + gn_ra_set.add( gn_ra ); + final String mol_seq = seq.split( "\n" )[ 1 ]; + if ( !mol_seq_set.contains( mol_seq ) ) { + mol_seq_set.add( mol_seq ); + if ( prev_gn.equals( gn ) ) { + int count = same_protein_seqs.size(); + if ( count == 1 ) { + System.out.println( seq ); + } + else { + int c = 1; + for( final String s : same_protein_seqs ) { + System.out.println( new StringBuffer( s ).insert( s.indexOf( "|" ), + "__" + c + "_OF_" + count ) + .toString() ); + c++; + } + } + } + prev_gn = gn; + System.out.println( seq ); + unique_counter++; + } + else { + duplicate_counter_mol_seq++; + } + } + else { + duplicate_counter_gn_ra++; + } + } + System.out.println( "unique : " + unique_counter ); + System.out.println( "duplicate because gn and range same: " + duplicate_counter_gn_ra ); + System.out.println( "duplicate because mol seq same : " + duplicate_counter_mol_seq ); + } + catch ( final Exception e ) { + e.printStackTrace(); + } + } +} -- 1.7.10.2