2 package org.forester.application;
4 import java.io.FileInputStream;
5 import java.io.IOException;
6 import java.util.ArrayList;
7 import java.util.Collections;
8 import java.util.HashSet;
11 import java.util.regex.Matcher;
12 import java.util.regex.Pattern;
14 import org.forester.io.parsers.FastaParser;
15 import org.forester.sequence.Sequence;
16 import org.forester.util.EasyWriter;
17 import org.forester.util.ForesterUtil;
// Matches the literal "GN=" followed by a run of non-whitespace characters (capture group 1)
// and one trailing whitespace character; main uses group 1 as the gene name.
21 public final static Pattern GN_PATTERN = Pattern.compile( "GN=(\\S+)\\s" ); //use w+ instead of S+ for more stringent setting.
// Matches a "<digits>-<digits>" range enclosed in square brackets; group 1 is the range without the brackets.
22 public final static Pattern RANGE_PATTERN = Pattern.compile( "\\[(\\d+-\\d+)\\]" ); // NOTE(review): the former "w+ instead of S+" remark was copied from GN_PATTERN; this pattern contains only \d+.
// Length threshold: main tests seq.getLength() < MIN_LENGTH and reports such sequences as "too short".
23 public final static int MIN_LENGTH = 85;
/**
 * Reads sequences from a hard-coded FASTA file, tags each one with the gene name found in its
 * identifier, drops duplicates (same gene name + range, or identical molecular sequence), emits the
 * remaining entries grouped by gene name through doit(), and finally writes the removed entries and
 * summary counts to the EasyWriter created for "aaa_out".
 *
 * NOTE(review): several lines of this method (braces, else branches, some declarations) are not
 * visible in this listing; comments below describe only the statements that are visible.
 *
 * @param args not referenced by any visible statement
 */
25 public static void main( final String args[] ) {
// Report writer; presumably backed by a file named "aaa_out" - confirm in ForesterUtil.createEasyWriter.
27 final EasyWriter out = ( EasyWriter ) ForesterUtil.createEasyWriter( "aaa_out" );
28 System.out.println( "STARTING..." );
// Collects the sequences reported under "Removed because too short:" at the end.
29 final List<Sequence> too_short = new ArrayList<Sequence>();
// NOTE(review): the input path is hard-coded and the FileInputStream is not closed by any visible statement.
30 final List<Sequence> orig = FastaParser
31 .parse( new FileInputStream( "C:\\Users\\zma\\Desktop\\RRMa_domains_ext_20_2.fasta" ) );
32 final int initial_number = orig.size();
// Text entries of the form ">" + gene name + "|" + identifier + "\n" + molecular sequence.
33 final List<String> new_seqs = new ArrayList<String>();
// Pass 1: length filter and gene-name extraction.
34 for( final Sequence seq : orig ) {
// Body of this branch is not visible; presumably it adds seq to too_short and skips it - confirm.
35 if ( seq.getLength() < MIN_LENGTH ) {
39 final Matcher matcher = GN_PATTERN.matcher( seq.getIdentifier() );
// The declaration of gn is not visible in this listing.
41 if ( matcher.find() ) {
42 gn = matcher.group( 1 );
// Presumably the else branch of the find() test above - confirm.
45 System.out.println( "ERROR: no gene for: " + seq.getIdentifier() );
// The gene name is placed first so that the sort below orders entries by gene name.
48 new_seqs.add( ">" + gn + "|" + seq.getIdentifier() + "\n" + seq.getMolecularSequenceAsString() );
// Keys (gene name + "_" + range) and molecular sequences already accepted in pass 2.
50 final Set<String> gn_ra_set = new HashSet<String>();
51 final Set<String> mol_seq_set = new HashSet<String>();
// Lexicographic sort of the whole entry strings; each starts with ">" + gene name.
52 Collections.sort( new_seqs );
// NOTE(review): no visible statement increments unique_counter; it is printed below next to new_seqs_unique.size().
53 int unique_counter = 0;
54 final List<String> duplicate_gn_ra = new ArrayList<String>();
55 final List<String> duplicate_mol_seq = new ArrayList<String>();
56 final List<String> new_seqs_unique = new ArrayList<String>();
// Pass 2: de-duplication.
57 for( final String seq : new_seqs ) {
58 final Matcher matcher_ra = RANGE_PATTERN.matcher( seq );
59 final Matcher matcher_gn = GN_PATTERN.matcher( seq );
// The declaration of range is not visible in this listing.
61 if ( matcher_ra.find() ) {
62 range = matcher_ra.group( 1 );
// Presumably the else branch of the range test above - confirm.
65 System.out.println( "ERROR: no range for: " + seq );
// NOTE(review): group( 1 ) needs a prior successful find() on matcher_gn; that call is not visible here.
69 final String gn = matcher_gn.group( 1 );
70 final String gn_ra = gn + "_" + range;
71 if ( !gn_ra_set.contains( gn_ra ) ) {
72 gn_ra_set.add( gn_ra );
// Index 1 of the "\n" split is the molecular sequence appended when the entry was built.
73 final String mol_seq = seq.split( "\n" )[ 1 ];
74 if ( !mol_seq_set.contains( mol_seq ) ) {
75 mol_seq_set.add( mol_seq );
76 new_seqs_unique.add( seq );
// Presumably the else branch of the molecular-sequence test - confirm.
80 duplicate_mol_seq.add( seq );
// Presumably the else branch of the gene-name/range test - confirm.
84 duplicate_gn_ra.add( seq );
// Pass 3: hand runs of entries that share a gene name to doit().
// NOTE(review): the statements that update prev_gn and is_first are not visible in this listing.
87 String prev_gn = "___";
88 boolean is_first = true;
89 List<String> seqs_from_same_protein = new ArrayList<String>();
90 for( final String seq : new_seqs_unique ) {
91 final Matcher matcher_gn = GN_PATTERN.matcher( seq );
// NOTE(review): as above, the find() call preceding group( 1 ) is not visible.
93 final String gn = matcher_gn.group( 1 );
// Gene name changed and this is not the first entry: emit the finished group and start a new one.
94 if ( !prev_gn.equals( gn ) && !is_first ) {
95 doit( seqs_from_same_protein, out );
96 seqs_from_same_protein = new ArrayList<String>();
100 seqs_from_same_protein.add( seq );
// Emits the last collected group; presumably placed after the loop closes - confirm.
// NOTE(review): doit() calls get( 0 ); whether an empty list is guarded against is not visible.
102 doit( seqs_from_same_protein, out );
// Report sections listing the removed entries; the loop bodies of the first two are not visible.
105 out.println( "Removed because same GN and region:" );
106 for( final String s : duplicate_gn_ra ) {
111 out.println( "Removed because identical mol sequence:" );
112 for( final String s : duplicate_mol_seq ) {
117 out.println( "Removed because too short:" );
118 for( final Sequence s : too_short ) {
119 out.println( s.toString() );
// Summary counts.
123 out.println( "initial:" + initial_number );
124 out.println( "ignored because shorter than " + MIN_LENGTH + "aa: " + too_short.size() );
// NOTE(review): the next two lines share the same "unique : " label but print different values.
125 out.println( "unique : " + unique_counter );
126 out.println( "unique : " + new_seqs_unique.size() );
127 out.println( "duplicate because gn and range same: " + duplicate_gn_ra.size() );
128 out.println( "duplicate because mol seq same : " + duplicate_mol_seq.size() );
// NOTE(review): no visible statement flushes or closes out.
131 System.out.println( "DONE " );
// Catches every Exception; the handler body and the matching try are not visible in this listing.
133 catch ( final Exception e ) {
/**
 * Writes one group of entries to out. Visible statements either print the first entry unchanged or
 * print every entry with a "__<c>_OF_<count>" tag inserted in front of its first "|".
 *
 * NOTE(review): the condition choosing between the two forms and the declaration/update of c are
 * not visible in this listing; presumably the plain form is used when count is 1 - confirm.
 *
 * @param same_protein_seqs entries to write; get( 0 ) is called on it, so an empty list would throw
 *        unless a guard exists on a line not shown here
 * @param out writer receiving the lines
 * @throws IOException declared by the signature; presumably raised by out.println - confirm
 */
138 private static void doit( final List<String> same_protein_seqs, final EasyWriter out ) throws IOException {
// Total number of entries in the group; becomes the "<count>" part of the tag.
139 final int count = same_protein_seqs.size();
141 out.println( same_protein_seqs.get( 0 ) );
145 for( final String s : same_protein_seqs ) {
// The tag goes immediately before the first "|"; indexOf returns -1 when s has no "|", which makes insert throw.
146 out.println( new StringBuffer( s ).insert( s.indexOf( "|" ), "__" + c + "_OF_" + count ).toString() );