From 8ab567b0fe466b5206e2ffd7c02d101a12252419 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Fri, 27 Sep 2013 23:09:29 +0000 Subject: [PATCH] moved --- forester/java/src/org/forester/application/aa.java | 86 ----------- .../java/src/org/forester/application/aaa.java | 151 -------------------- 2 files changed, 237 deletions(-) delete mode 100644 forester/java/src/org/forester/application/aa.java delete mode 100644 forester/java/src/org/forester/application/aaa.java diff --git a/forester/java/src/org/forester/application/aa.java b/forester/java/src/org/forester/application/aa.java deleted file mode 100644 index b835e52..0000000 --- a/forester/java/src/org/forester/application/aa.java +++ /dev/null @@ -1,86 +0,0 @@ -// - -package org.forester.application; - -import java.io.FileInputStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import org.forester.io.parsers.FastaParser; -import org.forester.msa.Msa; -import org.forester.sequence.BasicSequence; -import org.forester.sequence.Sequence; -import org.forester.util.ForesterUtil; - -public class aa { - - public static void main( final String args[] ) { - try { - System.out.println( "STARTING..." ); - final List orig = FastaParser - .parse( new FileInputStream( "C:\\Users\\zma\\Desktop\\RRMa_domains_ext_20.fasta" ) ); - final Msa msa = FastaParser.parseMsa( new FileInputStream( "C:\\Users\\zma\\Desktop\\test3_sorted.fasta" ) ); - final Set all_found_seqs = new HashSet(); - for( int i = 0; i < msa.getNumberOfSequences(); ++i ) { - final String id = msa.getIdentifier( i ); - final String id_ = id.substring( 0, id.indexOf( "_" ) ); - final String range = id.substring( id.indexOf( "[" ) + 1, id.indexOf( "]" ) ); - //System.out.println( i + ": " + id + "=>" + id_ + " " + range ); - if ( ForesterUtil.isEmpty( id_ ) ) { - System.out.println( "ERROR: id is empty for: " + id ); - System.exit( -1 ); - } - if ( ForesterUtil.isEmpty( range ) ) { - System.out.println( "ERROR: range is empty for: " + id ); - System.exit( -1 ); - } - int found = 0; - final List found_seqs = new ArrayList(); - for( final Sequence orig_seq : orig ) { - final String orig_seq_id = orig_seq.getIdentifier(); - if ( ( orig_seq_id.indexOf( id_ ) >= 0 ) && ( orig_seq_id.indexOf( "[" + range + "]" ) >= 0 ) ) { - found++; - found_seqs.add( orig_seq ); - } - } - if ( found > 0 ) { - for( final Sequence found_seq : found_seqs ) { - if ( found_seq.getLength() >= 85 ) { - all_found_seqs.add( BasicSequence.createAaSequence( id, found_seq - .getMolecularSequenceAsString() ) ); - } - } - if ( found > 1 ) { - System.out.println( i + ": " + id + "=>" + id_ + " " + range ); - System.out.println( " found: " + found ); - for( final Sequence found_seq : found_seqs ) { - System.out.println( found_seq.toString() ); - } - } - } - else { - System.out.println( "ERROR: not found: " + id ); - System.exit( -1 ); - } - } - final String fasta_ary[] = new String[ all_found_seqs.size() ]; - int i = 0; - for( final Sequence sequence : all_found_seqs ) { - fasta_ary[ i ] = ">" + sequence.getIdentifier() + "\n" + sequence.getMolecularSequenceAsString(); - System.out.println( sequence ); - i++; - } - Arrays.sort( fasta_ary ); - for( final String element : fasta_ary ) { - System.out.println( element ); - } - System.out.println( "DONE." ); - } - catch ( final Exception e ) { - e.printStackTrace(); - } - } -} diff --git a/forester/java/src/org/forester/application/aaa.java b/forester/java/src/org/forester/application/aaa.java deleted file mode 100644 index ac643c4..0000000 --- a/forester/java/src/org/forester/application/aaa.java +++ /dev/null @@ -1,151 +0,0 @@ - -package org.forester.application; - -import java.io.FileInputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.forester.io.parsers.FastaParser; -import org.forester.sequence.Sequence; -import org.forester.util.EasyWriter; -import org.forester.util.ForesterUtil; - -public class aaa { - - public final static Pattern GN_PATTERN = Pattern.compile( "GN=(\\S+)\\s" ); //use w+ instead of S+ for more stringent setting. - public final static Pattern RANGE_PATTERN = Pattern.compile( "\\[(\\d+-\\d+)\\]" ); //use w+ instead of S+ for more stringent setting. - public final static int MIN_LENGTH = 85; - - public static void main( final String args[] ) { - try { - final EasyWriter out = ( EasyWriter ) ForesterUtil.createEasyWriter( "aaa_out" ); - System.out.println( "STARTING..." ); - final List too_short = new ArrayList(); - final List orig = FastaParser - .parse( new FileInputStream( "C:\\Users\\zma\\Desktop\\RRMa_domains_ext_20_2.fasta" ) ); - final int initial_number = orig.size(); - final List new_seqs = new ArrayList(); - for( final Sequence seq : orig ) { - if ( seq.getLength() < MIN_LENGTH ) { - too_short.add( seq ); - continue; - } - final Matcher matcher = GN_PATTERN.matcher( seq.getIdentifier() ); - String gn = ""; - if ( matcher.find() ) { - gn = matcher.group( 1 ); - } - else { - System.out.println( "ERROR: no gene for: " + seq.getIdentifier() ); - System.exit( -1 ); - } - new_seqs.add( ">" + gn + "|" + seq.getIdentifier() + "\n" + seq.getMolecularSequenceAsString() ); - } - final Set gn_ra_set = new HashSet(); - final Set mol_seq_set = new HashSet(); - Collections.sort( new_seqs ); - int unique_counter = 0; - final List duplicate_gn_ra = new ArrayList(); - final List duplicate_mol_seq = new ArrayList(); - final List new_seqs_unique = new ArrayList(); - for( final String seq : new_seqs ) { - final Matcher matcher_ra = RANGE_PATTERN.matcher( seq ); - final Matcher matcher_gn = GN_PATTERN.matcher( seq ); - String range = ""; - if ( matcher_ra.find() ) { - range = matcher_ra.group( 1 ); - } - else { - System.out.println( "ERROR: no range for: " + seq ); - System.exit( -1 ); - } - matcher_gn.find(); - final String gn = matcher_gn.group( 1 ); - final String gn_ra = gn + "_" + range; - if ( !gn_ra_set.contains( gn_ra ) ) { - gn_ra_set.add( gn_ra ); - final String mol_seq = seq.split( "\n" )[ 1 ]; - if ( !mol_seq_set.contains( mol_seq ) ) { - mol_seq_set.add( mol_seq ); - new_seqs_unique.add( seq ); - unique_counter++; - } - else { - duplicate_mol_seq.add( seq ); - } - } - else { - duplicate_gn_ra.add( seq ); - } - } - String prev_gn = "___"; - boolean is_first = true; - List seqs_from_same_protein = new ArrayList(); - for( final String seq : new_seqs_unique ) { - final Matcher matcher_gn = GN_PATTERN.matcher( seq ); - matcher_gn.find(); - final String gn = matcher_gn.group( 1 ); - if ( !prev_gn.equals( gn ) && !is_first ) { - doit( seqs_from_same_protein, out ); - seqs_from_same_protein = new ArrayList(); - } - prev_gn = gn; - is_first = false; - seqs_from_same_protein.add( seq ); - } - doit( seqs_from_same_protein, out ); - out.println( "" ); - out.println( "" ); - out.println( "Removed because same GN and region:" ); - for( final String s : duplicate_gn_ra ) { - out.println( s ); - } - out.println( "" ); - out.println( "" ); - out.println( "Removed because identical mol sequence:" ); - for( final String s : duplicate_mol_seq ) { - out.println( s ); - } - out.println( "" ); - out.println( "" ); - out.println( "Removed because too short:" ); - for( final Sequence s : too_short ) { - out.println( s.toString() ); - } - out.println( "" ); - out.println( "" ); - out.println( "initial:" + initial_number ); - out.println( "ignored because shorter than " + MIN_LENGTH + "aa: " + too_short.size() ); - out.println( "unique : " + unique_counter ); - out.println( "unique : " + new_seqs_unique.size() ); - out.println( "duplicate because gn and range same: " + duplicate_gn_ra.size() ); - out.println( "duplicate because mol seq same : " + duplicate_mol_seq.size() ); - out.flush(); - out.close(); - System.out.println( "DONE " ); - } - catch ( final Exception e ) { - e.printStackTrace(); - } - } - - private static void doit( final List same_protein_seqs, final EasyWriter out ) throws IOException { - final int count = same_protein_seqs.size(); - if ( count == 1 ) { - out.println( same_protein_seqs.get( 0 ) ); - } - else { - int c = 1; - for( final String s : same_protein_seqs ) { - out.println( new StringBuffer( s ).insert( s.indexOf( "|" ), "__" + c + "_OF_" + count ).toString() ); - c++; - } - } - } -} -- 1.7.10.2