2 package org.forester.application;
4 import java.io.FileInputStream;
5 import java.util.ArrayList;
6 import java.util.Collections;
7 import java.util.HashSet;
10 import java.util.regex.Matcher;
11 import java.util.regex.Pattern;
13 import org.forester.io.parsers.FastaParser;
14 import org.forester.sequence.Sequence;
// Captures (group 1) the run of non-whitespace characters that follows "GN=".
// The match requires one whitespace character after the captured value.
18 public final static Pattern GN_PATTERN = Pattern.compile( "GN=(\\S+)\\s" ); //use w+ instead of S+ for more stringent setting.
// Captures (group 1) a "<digits>-<digits>" range enclosed in square brackets.
19 public final static Pattern RANGE_PATTERN = Pattern.compile( "\\[(\\d+-\\d+)\\]" ); // group 1 excludes the brackets, e.g. "12-345" for "[12-345]".
// Entry point: parses a FASTA file from a hard-coded path, re-labels every
// sequence with the value captured by GN_PATTERN, drops entries judged to be
// duplicates, and prints the remaining entries grouped by that value via doit().
// The command-line arguments are not read in the visible code.
// NOTE(review): several statements of this method (local declarations, else
// branches, the opening "try", closing braces) are not visible in this excerpt;
// comments below are hedged wherever they depend on such lines.
21 public static void main( final String args[] ) {
23 System.out.println( "STARTING..." );
// NOTE(review): the input path is hard-coded, and the FileInputStream is not
// held in a variable, so this method cannot close it itself. Whether
// FastaParser.parse closes the stream is not visible here -- confirm.
24 final List<Sequence> orig = FastaParser
25 .parse( new FileInputStream( "C:\\Users\\zma\\Desktop\\RRMa_domains_ext_20_2.fasta" ) );
// Step 1: build one two-line string per sequence, of the form
// ">" + gn + "|" + identifier, a newline, then the molecular sequence.
26 final List<String> new_seqs = new ArrayList<String>();
27 for( final Sequence seq : orig ) {
28 final Matcher matcher = GN_PATTERN.matcher( seq.getIdentifier() );
// "gn" is assigned here; its declaration is not visible in this excerpt.
30 if ( matcher.find() ) {
31 gn = matcher.group( 1 );
// Presumably the no-match branch -- the enclosing "else" is not visible.
34 System.out.println( "ERROR: no gene for: " + seq.getIdentifier() );
37 new_seqs.add( ">" + gn + "|" + seq.getIdentifier() + "\n" + seq.getMolecularSequenceAsString() );
// Step 2: remove duplicates. gn_ra_set holds the gn + "_" + range keys seen so
// far; mol_seq_set holds the molecular sequences seen so far.
39 final Set<String> gn_ra_set = new HashSet<String>();
40 final Set<String> mol_seq_set = new HashSet<String>();
// Every entry starts with ">" + gn + "|", so after sorting, entries that share
// the same gn are adjacent; step 3 depends on that.
41 Collections.sort( new_seqs );
// NOTE(review): no increment of unique_counter is visible in this excerpt,
// although its value is printed at the end -- confirm it is updated.
42 int unique_counter = 0;
43 int duplicate_counter_gn_ra = 0;
44 int duplicate_counter_mol_seq = 0;
45 final List<String> new_seqs_unique = new ArrayList<String>();
46 for( final String seq : new_seqs ) {
47 final Matcher matcher_ra = RANGE_PATTERN.matcher( seq );
48 final Matcher matcher_gn = GN_PATTERN.matcher( seq );
// "range" is assigned here; its declaration is not visible in this excerpt.
50 if ( matcher_ra.find() ) {
51 range = matcher_ra.group( 1 );
// Presumably the no-match branch -- the enclosing "else" is not visible.
54 System.out.println( "ERROR: no range for: " + seq );
// NOTE(review): Matcher.group() requires a preceding successful match; the
// matcher_gn.find() call that must come before this line is not visible here.
58 final String gn = matcher_gn.group( 1 );
59 final String gn_ra = gn + "_" + range;
// An entry is kept only if its gn/range key is new AND its molecular sequence
// is new. The key is recorded even when the entry is then rejected because its
// molecular sequence was already seen.
60 if ( !gn_ra_set.contains( gn_ra ) ) {
61 gn_ra_set.add( gn_ra );
// Index 1 is the text after the first "\n" added in step 1.
// assumes the identifier itself contains no "\n" -- TODO confirm
62 final String mol_seq = seq.split( "\n" )[ 1 ];
63 if ( !mol_seq_set.contains( mol_seq ) ) {
64 mol_seq_set.add( mol_seq );
65 new_seqs_unique.add( seq );
// Presumably the else branch of the molecular-sequence check (not visible).
69 duplicate_counter_mol_seq++;
// Presumably the else branch of the gn/range check (not visible).
73 duplicate_counter_gn_ra++;
// Step 3: walk the kept entries and hand each run of entries with the same gn
// to doit().
// NOTE(review): the statements that update prev_gn and is_first are not
// visible in this excerpt.
76 String prev_gn = "___";
77 boolean is_first = true;
78 List<String> same_protein_seqs = new ArrayList<String>();
79 for( final String seq : new_seqs_unique ) {
81 final Matcher matcher_gn = GN_PATTERN.matcher( seq );
// NOTE(review): as above, the find() call that must precede group() is not
// visible in this excerpt.
83 final String gn = matcher_gn.group( 1 );
// The gn changed relative to the previous entry: emit the finished group and
// start a new one.
84 if ( !prev_gn.equals( gn ) && !is_first ) {
85 doit( same_protein_seqs );
86 same_protein_seqs = new ArrayList<String>();
90 same_protein_seqs.add( seq );
// Emit the entries collected since the last call made inside the loop.
92 doit( same_protein_seqs );
93 System.out.println( "unique : " + unique_counter );
94 System.out.println( "duplicate because gn and range same: " + duplicate_counter_gn_ra );
95 System.out.println( "duplicate because mol seq same : " + duplicate_counter_mol_seq );
// The matching "try" and the body of this handler are not visible in this excerpt.
97 catch ( final Exception e ) {
// Prints the entries of one group to standard output. In the loop below each
// entry is printed with "__" + c + "_OF_" + count inserted immediately before
// its first "|", where count is the size of the group.
// NOTE(review): the condition guarding the first println, the declaration and
// update of "c", and the closing braces are not visible in this excerpt.
102 private static void doit( List<String> same_protein_seqs ) {
103 final int count = same_protein_seqs.size();
// Prints the first entry unchanged -- presumably only when the group has a
// single entry (the guarding condition is not visible).
105 System.out.println( same_protein_seqs.get( 0 ) );
109 for( final String s : same_protein_seqs ) {
// indexOf returns -1 when s contains no "|", which StringBuffer.insert rejects
// with an exception; the entries built in main always contain "|".
110 System.out.println( new StringBuffer( s ).insert( s.indexOf( "|" ),
111 "__" + c + "_OF_" + count ).toString() );