From adebcf501c27ffaf9ce6ffdbcc6efc43f7cb771f Mon Sep 17 00:00:00 2001
From: "cmzmasek@gmail.com"
Date: Fri, 27 Sep 2013 22:54:39 +0000
Subject: [PATCH] moved

---
 .../src/org/forester/applications/aa.java          |  117 ++++++++++++
 .../src/org/forester/applications/aaa.java         |  201 ++++++++++++++++++++
 2 files changed, 318 insertions(+)
 create mode 100644 forester_applications/src/org/forester/applications/aa.java
 create mode 100644 forester_applications/src/org/forester/applications/aaa.java

diff --git a/forester_applications/src/org/forester/applications/aa.java b/forester_applications/src/org/forester/applications/aa.java
new file mode 100644
index 0000000..2a64278
--- /dev/null
+++ b/forester_applications/src/org/forester/applications/aa.java
@@ -0,0 +1,117 @@
+//
+
+package org.forester.applications;
+
+import java.io.FileInputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.forester.io.parsers.FastaParser;
+import org.forester.msa.Msa;
+import org.forester.sequence.BasicSequence;
+import org.forester.sequence.Sequence;
+import org.forester.util.ForesterUtil;
+
+/**
+ * One-off helper: for every entry of an MSA file, looks up the sequences of
+ * a second FASTA file whose identifier contains the entry's id prefix and its
+ * "[range]" tag, and prints the matches of at least MIN_LENGTH residues,
+ * sorted, as FASTA records on standard output.
+ */
+public class aa {
+
+    // Matched sequences shorter than this are not reported.
+    private final static int MIN_LENGTH = 85;
+
+    /**
+     * Runs the lookup on two hard-coded input files. Any failure is reported
+     * on the console; a malformed or unmatched MSA identifier terminates the
+     * JVM with exit status -1.
+     *
+     * @param args
+     *            ignored
+     */
+    public static void main( final String args[] ) {
+        try {
+            System.out.println( "STARTING..." );
+            final List<Sequence> orig;
+            final FileInputStream orig_in = new FileInputStream( "C:\\Users\\zma\\Desktop\\RRMa_domains_ext_20.fasta" );
+            try {
+                orig = FastaParser.parse( orig_in );
+            }
+            finally {
+                orig_in.close();
+            }
+            final Msa msa;
+            final FileInputStream msa_in = new FileInputStream( "C:\\Users\\zma\\Desktop\\test3_sorted.fasta" );
+            try {
+                msa = FastaParser.parseMsa( msa_in );
+            }
+            finally {
+                msa_in.close();
+            }
+            final Set<Sequence> all_found_seqs = new HashSet<Sequence>();
+            for( int i = 0; i < msa.getNumberOfSequences(); ++i ) {
+                final String id = msa.getIdentifier( i );
+                // Missing delimiters yield "" so that the checks below report
+                // them, instead of substring() throwing.
+                final int underscore = id.indexOf( "_" );
+                final int open = id.indexOf( "[" );
+                final int close = id.indexOf( "]" );
+                final String id_ = underscore < 0 ? "" : id.substring( 0, underscore );
+                final String range = ( ( open < 0 ) || ( close <= open ) ) ? "" : id.substring( open + 1, close );
+                if ( ForesterUtil.isEmpty( id_ ) ) {
+                    System.out.println( "ERROR: id is empty for: " + id );
+                    System.exit( -1 );
+                }
+                if ( ForesterUtil.isEmpty( range ) ) {
+                    System.out.println( "ERROR: range is empty for: " + id );
+                    System.exit( -1 );
+                }
+                final List<Sequence> found_seqs = new ArrayList<Sequence>();
+                for( final Sequence orig_seq : orig ) {
+                    final String orig_seq_id = orig_seq.getIdentifier();
+                    if ( ( orig_seq_id.indexOf( id_ ) >= 0 ) && ( orig_seq_id.indexOf( "[" + range + "]" ) >= 0 ) ) {
+                        found_seqs.add( orig_seq );
+                    }
+                }
+                if ( found_seqs.isEmpty() ) {
+                    System.out.println( "ERROR: not found: " + id );
+                    System.exit( -1 );
+                }
+                for( final Sequence found_seq : found_seqs ) {
+                    if ( found_seq.getLength() >= MIN_LENGTH ) {
+                        all_found_seqs.add( BasicSequence.createAaSequence( id, found_seq
+                                .getMolecularSequenceAsString() ) );
+                    }
+                }
+                if ( found_seqs.size() > 1 ) {
+                    // Ambiguous match: list every candidate.
+                    System.out.println( i + ": " + id + "=>" + id_ + " " + range );
+                    System.out.println( " found: " + found_seqs.size() );
+                    for( final Sequence found_seq : found_seqs ) {
+                        System.out.println( found_seq.toString() );
+                    }
+                }
+            }
+            final String fasta_ary[] = new String[ all_found_seqs.size() ];
+            int i = 0;
+            for( final Sequence sequence : all_found_seqs ) {
+                fasta_ary[ i ] = ">" + sequence.getIdentifier() + "\n" + sequence.getMolecularSequenceAsString();
+                System.out.println( sequence );
+                i++;
+            }
+            Arrays.sort( fasta_ary );
+            for( final String element : fasta_ary ) {
+                System.out.println( element );
+            }
+            System.out.println( "DONE." );
+        }
+        catch ( final Exception e ) {
+            e.printStackTrace();
+        }
+    }
+}
diff --git a/forester_applications/src/org/forester/applications/aaa.java b/forester_applications/src/org/forester/applications/aaa.java
new file mode 100644
index 0000000..d6722a9
--- /dev/null
+++ b/forester_applications/src/org/forester/applications/aaa.java
@@ -0,0 +1,201 @@
+
+package org.forester.applications;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.forester.io.parsers.FastaParser;
+import org.forester.sequence.Sequence;
+import org.forester.util.EasyWriter;
+import org.forester.util.ForesterUtil;
+
+/**
+ * One-off helper: reads a FASTA file, sets aside sequences shorter than
+ * MIN_LENGTH, removes records with a repeated gene-name/range pair or a
+ * repeated molecular sequence, tags the remaining records that share a gene
+ * name, and writes the records plus a removal report to the "aaa_out" writer.
+ */
+public class aaa {
+
+    // Gene name: the non-whitespace run after "GN=" (use w+ instead of S+ for more stringent setting).
+    public final static Pattern GN_PATTERN    = Pattern.compile( "GN=(\\S+)\\s" );
+    // Residue range in square brackets, e.g. "[12-98]".
+    public final static Pattern RANGE_PATTERN = Pattern.compile( "\\[(\\d+-\\d+)\\]" );
+    // Sequences shorter than this are set aside as "too short".
+    public final static int     MIN_LENGTH    = 85;
+
+    /**
+     * Runs the filter on a hard-coded input file. Failures are reported on
+     * the console; an identifier without gene name or range terminates the
+     * JVM with exit status -1.
+     *
+     * @param args
+     *            ignored
+     */
+    public static void main( final String args[] ) {
+        try {
+            final EasyWriter out = ( EasyWriter ) ForesterUtil.createEasyWriter( "aaa_out" );
+            try {
+                System.out.println( "STARTING..." );
+                final List<Sequence> too_short = new ArrayList<Sequence>();
+                final List<Sequence> orig;
+                final FileInputStream in = new FileInputStream( "C:\\Users\\zma\\Desktop\\RRMa_domains_ext_20_2.fasta" );
+                try {
+                    orig = FastaParser.parse( in );
+                }
+                finally {
+                    in.close();
+                }
+                final int initial_number = orig.size();
+                final List<String> new_seqs = new ArrayList<String>();
+                for( final Sequence seq : orig ) {
+                    if ( seq.getLength() < MIN_LENGTH ) {
+                        too_short.add( seq );
+                        continue;
+                    }
+                    final Matcher matcher = GN_PATTERN.matcher( seq.getIdentifier() );
+                    String gn = "";
+                    if ( matcher.find() ) {
+                        gn = matcher.group( 1 );
+                    }
+                    else {
+                        System.out.println( "ERROR: no gene for: " + seq.getIdentifier() );
+                        System.exit( -1 );
+                    }
+                    new_seqs.add( ">" + gn + "|" + seq.getIdentifier() + "\n" + seq.getMolecularSequenceAsString() );
+                }
+                final Set<String> gn_ra_set = new HashSet<String>();
+                final Set<String> mol_seq_set = new HashSet<String>();
+                Collections.sort( new_seqs );
+                int unique_counter = 0;
+                final List<String> duplicate_gn_ra = new ArrayList<String>();
+                final List<String> duplicate_mol_seq = new ArrayList<String>();
+                final List<String> new_seqs_unique = new ArrayList<String>();
+                for( final String seq : new_seqs ) {
+                    final Matcher matcher_ra = RANGE_PATTERN.matcher( seq );
+                    String range = "";
+                    if ( matcher_ra.find() ) {
+                        range = matcher_ra.group( 1 );
+                    }
+                    else {
+                        System.out.println( "ERROR: no range for: " + seq );
+                        System.exit( -1 );
+                    }
+                    final String gn_ra = geneName( seq ) + "_" + range;
+                    // Set.add() returns false when the element is already present.
+                    if ( gn_ra_set.add( gn_ra ) ) {
+                        final String mol_seq = seq.split( "\n" )[ 1 ];
+                        if ( mol_seq_set.add( mol_seq ) ) {
+                            new_seqs_unique.add( seq );
+                            unique_counter++;
+                        }
+                        else {
+                            duplicate_mol_seq.add( seq );
+                        }
+                    }
+                    else {
+                        duplicate_gn_ra.add( seq );
+                    }
+                }
+                String prev_gn = "___";
+                boolean is_first = true;
+                List<String> seqs_from_same_protein = new ArrayList<String>();
+                for( final String seq : new_seqs_unique ) {
+                    final String gn = geneName( seq );
+                    if ( !prev_gn.equals( gn ) && !is_first ) {
+                        doit( seqs_from_same_protein, out );
+                        seqs_from_same_protein = new ArrayList<String>();
+                    }
+                    prev_gn = gn;
+                    is_first = false;
+                    seqs_from_same_protein.add( seq );
+                }
+                doit( seqs_from_same_protein, out );
+                out.println( "" );
+                out.println( "" );
+                out.println( "Removed because same GN and region:" );
+                for( final String s : duplicate_gn_ra ) {
+                    out.println( s );
+                }
+                out.println( "" );
+                out.println( "" );
+                out.println( "Removed because identical mol sequence:" );
+                for( final String s : duplicate_mol_seq ) {
+                    out.println( s );
+                }
+                out.println( "" );
+                out.println( "" );
+                out.println( "Removed because too short:" );
+                for( final Sequence s : too_short ) {
+                    out.println( s.toString() );
+                }
+                out.println( "" );
+                out.println( "" );
+                out.println( "initial:" + initial_number );
+                out.println( "ignored because shorter than " + MIN_LENGTH + "aa: " + too_short.size() );
+                out.println( "unique : " + unique_counter );
+                out.println( "unique : " + new_seqs_unique.size() );
+                out.println( "duplicate because gn and range same: " + duplicate_gn_ra.size() );
+                out.println( "duplicate because mol seq same : " + duplicate_mol_seq.size() );
+                out.flush();
+            }
+            finally {
+                out.close();
+            }
+            System.out.println( "DONE " );
+        }
+        catch ( final Exception e ) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Returns the gene name captured by GN_PATTERN in a record.
+     *
+     * @param record
+     *            text to search
+     * @return the first capture group of the first GN_PATTERN match
+     * @throws IllegalStateException
+     *             if the record contains no match
+     */
+    private static String geneName( final String record ) {
+        final Matcher matcher_gn = GN_PATTERN.matcher( record );
+        if ( !matcher_gn.find() ) {
+            throw new IllegalStateException( "no gene name in: " + record );
+        }
+        return matcher_gn.group( 1 );
+    }
+
+    /**
+     * Writes the records of one gene-name group. A single record is written
+     * unchanged; with several records, the tag "__n_OF_count" (n counting
+     * from 1) is inserted in front of the first "|" of each record.
+     *
+     * @param same_protein_seqs
+     *            records sharing one gene name
+     * @param out
+     *            destination writer
+     * @throws IOException
+     *             if writing to out fails
+     */
+    private static void doit( final List<String> same_protein_seqs, final EasyWriter out ) throws IOException {
+        final int count = same_protein_seqs.size();
+        if ( count == 1 ) {
+            out.println( same_protein_seqs.get( 0 ) );
+        }
+        else {
+            int c = 1;
+            for( final String s : same_protein_seqs ) {
+                out.println( new StringBuilder( s ).insert( s.indexOf( "|" ), "__" + c + "_OF_" + count ).toString() );
+                c++;
+            }
+        }
+    }
+}
-- 
1.7.10.2