+ // Walk new_seqs_unique in iteration order, collect runs of consecutive entries
+ // whose GN value (capture group 1 of GN_PATTERN) is equal, and pass each run
+ // to doit() together with the output writer.
+ // NOTE(review): this only groups correctly if entries with the same GN are
+ // adjacent in new_seqs_unique -- confirm the collection is ordered that way.
+ String prev_gn = "___";
+ boolean is_first = true;
+ List<String> seqs_from_same_protein = new ArrayList<String>();
+ for( final String seq : new_seqs_unique ) {
+     final Matcher matcher_gn = GN_PATTERN.matcher( seq );
+     // group( 1 ) is only legal after a successful find(); check it explicitly so
+     // that a sequence without a GN match is reported with its content instead of
+     // the bare "No match found" raised by Matcher.group().
+     if ( !matcher_gn.find() ) {
+         throw new IllegalStateException( "no match for GN pattern in: " + seq );
+     }
+     final String gn = matcher_gn.group( 1 );
+     // A change of GN closes the current run; is_first keeps the very first
+     // entry from flushing an empty run.
+     if ( !prev_gn.equals( gn ) && !is_first ) {
+         doit( seqs_from_same_protein, out );
+         seqs_from_same_protein = new ArrayList<String>();
+     }
+     prev_gn = gn;
+     is_first = false;
+     seqs_from_same_protein.add( seq );
+ }
+ // Flush the last run (the loop only flushes when the GN changes).
+ doit( seqs_from_same_protein, out );
+ // Report section: list every removed entry, grouped by the reason for removal.
+ out.println( "" );
+ out.println( "" );
+ out.println( "Removed because same GN and region:" );
+ for( final String s : duplicate_gn_ra ) {
+     out.println( s );
+ }
+ out.println( "" );
+ out.println( "" );
+ out.println( "Removed because identical mol sequence:" );
+ for( final String s : duplicate_mol_seq ) {
+     out.println( s );
+ }
+ out.println( "" );
+ out.println( "" );
+ out.println( "Removed because too short:" );
+ for( final Sequence s : too_short ) {
+     out.println( s.toString() );
+ }
+ // Summary counters.
+ out.println( "" );
+ out.println( "" );
+ out.println( "initial:" + initial_number );
+ out.println( "ignored because shorter than " + MIN_LENGTH + "aa: " + too_short.size() );
+ // NOTE(review): the next two lines share the same label; the first prints
+ // unique_counter, the second new_seqs_unique.size() -- presumably a
+ // cross-check that both agree; confirm before relabelling.
+ out.println( "unique : " + unique_counter );
+ out.println( "unique : " + new_seqs_unique.size() );
+ out.println( "duplicate because gn and range same: " + duplicate_gn_ra.size() );
+ out.println( "duplicate because mol seq same : " + duplicate_mol_seq.size() );
+ out.flush();
+ out.close();
+ System.out.println( "DONE " );