3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 // Contact: phylosoft @ gmail . com
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27 package org.forester.io.parsers;
29 import java.io.BufferedReader;
31 import java.io.FileReader;
32 import java.io.IOException;
33 import java.util.ArrayList;
34 import java.util.Date;
35 import java.util.HashSet;
36 import java.util.List;
39 import java.util.SortedSet;
40 import java.util.TreeMap;
41 import java.util.TreeSet;
43 import org.forester.protein.BasicDomain;
44 import org.forester.protein.BasicProtein;
45 import org.forester.protein.Domain;
46 import org.forester.protein.Protein;
47 import org.forester.surfacing.SurfacingUtil;
48 import org.forester.util.ForesterUtil;
50 public final class HmmPfamOutputParser {
52 private static final String RETRO = "RETRO";
53 private static final String PHAGE = "PHAGE";
54 private static final String VIR = "VIR";
55 private static final String TRANSPOS = "TRANSPOS";
56 private static final String RV = "RV";
57 private static final String GAG = "GAG_";
58 private static final String HCV = "HCV_"; // New. Added on Jun 11, after 1st submission.
59 private static final String HERPES = "Herpes_"; // New. Added on Jun 11, after 1st submission.
60 private static final int E_VALUE_MAXIMUM_DEFAULT = -1;
61 private static final ReturnType RETURN_TYPE_DEFAULT = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN;
62 private static final boolean IGNORE_DUFS_DEFAULT = false;
63 private static final int MAX_ALLOWED_OVERLAP_DEFAULT = -1;
64 private final Set<String> _filter;
65 private final FilterType _filter_type;
66 private final File _input_file;
67 private final String _species;
68 private final String _model_type;
69 private double _e_value_maximum;
70 private Map<String, String> _individual_domain_score_cutoffs;
71 private boolean _ignore_dufs;
72 private boolean _ignore_virus_like_ids;
73 private boolean _allow_non_unique_query;
74 private boolean _verbose;
75 private int _max_allowed_overlap;
76 private boolean _ignore_engulfed_domains;
77 private ReturnType _return_type;
78 private int _proteins_encountered;
79 private int _proteins_ignored_due_to_filter;
80 private int _proteins_stored;
81 private int _domains_encountered;
82 private int _domains_ignored_due_to_duf;
83 private int _domains_ignored_due_to_overlap;
84 private int _domains_ignored_due_to_e_value;
85 private int _domains_ignored_due_to_individual_score_cutoff;
86 private int _domains_stored;
87 private SortedSet<String> _domains_stored_set;
89 private int _domains_ignored_due_to_negative_domain_filter;
90 private Map<String, Integer> _domains_ignored_due_to_negative_domain_filter_counts_map;
91 private int _domains_ignored_due_to_virus_like_id;
92 private Map<String, Integer> _domains_ignored_due_to_virus_like_id_counts_map;
/**
 * Creates a parser which applies no protein or domain filter
 * (filter type {@link FilterType#NONE}, no filter set).
 *
 * @param input_file the hmmpfam output file to parse
 * @param species the species passed to every protein created by {@link #parse()}
 * @param model_type the model type
 */
public HmmPfamOutputParser( final File input_file, final String species, final String model_type ) {
    this( input_file, species, model_type, null, FilterType.NONE );
}
/**
 * Creates a parser which applies the given filter.
 *
 * @param input_file the hmmpfam output file to parse
 * @param species the species passed to every protein created by {@link #parse()}
 * @param model_type the model type
 * @param filter the domain ids the filter is based on (stored as is, not copied)
 * @param filter_type how <code>filter</code> is to be applied
 */
public HmmPfamOutputParser( final File input_file,
                            final String species,
                            final String model_type,
                            final Set<String> filter,
                            final FilterType filter_type ) {
    _filter = filter;
    _filter_type = filter_type;
    _input_file = input_file;
    _species = species;
    _model_type = model_type;
    // Settings and counters start out at their defaults.
    init();
}
116 private void actuallyAddProtein( final List<Protein> proteins, final Protein current_protein ) {
117 final List<Domain> l = current_protein.getProteinDomains();
118 for( final Domain d : l ) {
119 getDomainsStoredSet().add( d.getDomainId() );
121 proteins.add( current_protein );
125 private void addProtein( final List<Protein> proteins, final Protein current_protein ) {
126 if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
127 final Set<String> domain_ids_in_protein = new HashSet<String>();
128 for( final Domain d : current_protein.getProteinDomains() ) {
129 domain_ids_in_protein.add( d.getDomainId() );
131 domain_ids_in_protein.retainAll( getFilter() );
132 if ( getFilterType() == FilterType.POSITIVE_PROTEIN ) {
133 if ( domain_ids_in_protein.size() > 0 ) {
134 actuallyAddProtein( proteins, current_protein );
137 ++_proteins_ignored_due_to_filter;
141 if ( domain_ids_in_protein.size() < 1 ) {
142 actuallyAddProtein( proteins, current_protein );
145 ++_proteins_ignored_due_to_filter;
150 actuallyAddProtein( proteins, current_protein );
154 public int getDomainsEncountered() {
155 return _domains_encountered;
158 public int getDomainsIgnoredDueToDuf() {
159 return _domains_ignored_due_to_duf;
162 public int getDomainsIgnoredDueToEval() {
163 return _domains_ignored_due_to_e_value;
166 public int getDomainsIgnoredDueToIndividualScoreCutoff() {
167 return _domains_ignored_due_to_individual_score_cutoff;
170 public int getDomainsIgnoredDueToNegativeDomainFilter() {
171 return _domains_ignored_due_to_negative_domain_filter;
174 public Map<String, Integer> getDomainsIgnoredDueToNegativeDomainFilterCountsMap() {
175 return _domains_ignored_due_to_negative_domain_filter_counts_map;
178 public int getDomainsIgnoredDueToOverlap() {
179 return _domains_ignored_due_to_overlap;
182 public Map<String, Integer> getDomainsIgnoredDueToVirusLikeIdCountsMap() {
183 return _domains_ignored_due_to_virus_like_id_counts_map;
186 public int getDomainsIgnoredDueToVirusLikeIds() {
187 return _domains_ignored_due_to_virus_like_id;
190 public int getDomainsStored() {
191 return _domains_stored;
194 public SortedSet<String> getDomainsStoredSet() {
195 return _domains_stored_set;
198 private double getEValueMaximum() {
199 return _e_value_maximum;
202 private Set<String> getFilter() {
206 private FilterType getFilterType() {
210 private Map<String, String> getIndividualDomainScoreCutoffs() {
211 return _individual_domain_score_cutoffs;
214 private File getInputFile() {
218 private int getMaxAllowedOverlap() {
219 return _max_allowed_overlap;
222 private String getModelType() {
226 public int getProteinsEncountered() {
227 return _proteins_encountered;
230 public int getProteinsIgnoredDueToFilter() {
231 return _proteins_ignored_due_to_filter;
234 public int getProteinsStored() {
235 return _proteins_stored;
238 private ReturnType getReturnType() {
242 private String getSpecies() {
246 public long getTime() {
250 private void init() {
251 _e_value_maximum = HmmPfamOutputParser.E_VALUE_MAXIMUM_DEFAULT;
252 setIgnoreDufs( HmmPfamOutputParser.IGNORE_DUFS_DEFAULT );
253 setReturnType( HmmPfamOutputParser.RETURN_TYPE_DEFAULT );
254 _max_allowed_overlap = HmmPfamOutputParser.MAX_ALLOWED_OVERLAP_DEFAULT;
255 setIndividualDomainScoreCutoffs( null );
256 setIgnoreEngulfedDomains( false );
257 setIgnoreVirusLikeIds( false );
258 setAllowNonUniqueQuery( false );
263 private void intitCounts() {
264 setDomainsStoredSet( new TreeSet<String>() );
265 setDomainsEncountered( 0 );
266 setProteinsEncountered( 0 );
267 setProteinsIgnoredDueToFilter( 0 );
268 setDomainsIgnoredDueToNegativeFilter( 0 );
269 setDomainsIgnoredDueToDuf( 0 );
270 setDomainsIgnoredDueToEval( 0 );
271 setDomainsIgnoredDueToIndividualScoreCutoff( 0 );
272 setDomainsIgnoredDueToVirusLikeId( 0 );
273 setDomainsIgnoredDueToOverlap( 0 );
274 setDomainsStored( 0 );
275 setProteinsStored( 0 );
277 setDomainsIgnoredDueToVirusLikeIdCountsMap( new TreeMap<String, Integer>() );
278 setDomainsIgnoredDueToNegativeDomainFilterCountsMap( new TreeMap<String, Integer>() );
281 private boolean isAllowNonUniqueQuery() {
282 return _allow_non_unique_query;
285 private boolean isIgnoreDufs() {
289 private boolean isIgnoreEngulfedDomains() {
290 return _ignore_engulfed_domains;
293 private boolean isIgnoreVirusLikeIds() {
294 return _ignore_virus_like_ids;
297 private boolean isVerbose() {
301 public List<Protein> parse() throws IOException {
303 final Set<String> queries = new HashSet<String>();
304 final String error = ForesterUtil.isReadableFile( getInputFile() );
305 if ( !ForesterUtil.isEmpty( error ) ) {
306 throw new IOException( error );
308 final BufferedReader br = new BufferedReader( new FileReader( getInputFile() ) );
310 final List<Protein> proteins = new ArrayList<Protein>();
311 Protein current_protein = null;
313 boolean saw_double_slash = true;
314 boolean can_parse_domains = false;
315 boolean saw_parsed_for_domains = false;
316 boolean saw_query_sequence = false;
317 boolean was_not_unique = false;
318 final long start_time = new Date().getTime();
319 while ( ( line = br.readLine() ) != null ) {
321 if ( line.length() < 1 ) {
324 else if ( line.startsWith( "Query sequence:" ) ) {
325 ++_proteins_encountered;
326 if ( !saw_double_slash ) {
327 throw new IOException( "unexpected format [line " + line_number + "] in ["
328 + getInputFile().getCanonicalPath() + "]" );
330 saw_double_slash = false;
331 saw_query_sequence = true;
332 was_not_unique = false;
333 final String query = line.substring( 16 ).trim();
334 if ( ForesterUtil.isEmpty( query ) ) {
335 throw new IOException( "query sequence cannot be empty [line " + line_number + "] in ["
336 + getInputFile().getCanonicalPath() + "]" );
338 if ( queries.contains( query ) ) {
339 if ( !isAllowNonUniqueQuery() ) {
340 throw new IOException( "query \"" + query + "\" is not unique [line " + line_number + "] in ["
341 + getInputFile().getCanonicalPath() + "]" );
343 else if ( isVerbose() ) {
344 ForesterUtil.printWarningMessage( getClass().getName(), "query \"" + query
345 + "\" is not unique [line " + line_number + "] in ["
346 + getInputFile().getCanonicalPath() + "]" );
350 queries.add( query );
352 if ( current_protein != null ) {
353 throw new IOException( "unexpected format [line " + line_number + "] in ["
354 + getInputFile().getCanonicalPath() + "]" );
356 if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) {
357 current_protein = new BasicProtein( query, getSpecies(), 0 );
360 throw new IllegalArgumentException( "unknown return type" );
363 else if ( line.startsWith( "Accession:" ) ) {
364 if ( !saw_query_sequence || ( current_protein == null ) ) {
365 throw new IOException( "unexpected format [line " + line_number + "] in ["
366 + getInputFile().getCanonicalPath() + "]" );
368 ( ( BasicProtein ) current_protein ).setAccession( line.substring( 11 ).trim() );
370 else if ( line.startsWith( "Description:" ) ) {
371 if ( !saw_query_sequence || ( current_protein == null ) ) {
372 throw new IOException( "unexpected format [line " + line_number + "] in ["
373 + getInputFile().getCanonicalPath() + "]" );
375 if ( was_not_unique ) {
376 if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) {
377 current_protein = new BasicProtein( current_protein.getProteinId() + " "
378 + line.substring( 13 ).trim(), getSpecies(), 0 );
382 ( ( BasicProtein ) current_protein ).setDescription( line.substring( 13 ).trim() );
385 else if ( line.startsWith( "Parsed for domains:" ) ) {
386 if ( !saw_query_sequence ) {
387 throw new IOException( "unexpected format [line " + line_number + "] in ["
388 + getInputFile().getCanonicalPath() + "]" );
390 saw_query_sequence = false;
391 saw_parsed_for_domains = true;
393 else if ( saw_parsed_for_domains && line.startsWith( "--------" ) ) {
394 can_parse_domains = true;
395 saw_parsed_for_domains = false;
397 else if ( line.startsWith( "Alignments of top-scoring domains:" ) ) {
398 if ( !can_parse_domains ) {
399 throw new IOException( "unexpected format [line " + line_number + "] in ["
400 + getInputFile().getCanonicalPath() + "]" );
402 can_parse_domains = false;
404 else if ( line.startsWith( "//" ) ) {
405 can_parse_domains = false;
406 saw_double_slash = true;
407 if ( current_protein.getProteinDomains().size() > 0 ) {
408 if ( ( getMaxAllowedOverlap() != HmmPfamOutputParser.MAX_ALLOWED_OVERLAP_DEFAULT )
409 || isIgnoreEngulfedDomains() ) {
410 final int domains_count = current_protein.getNumberOfProteinDomains();
411 current_protein = SurfacingUtil.removeOverlappingDomains( getMaxAllowedOverlap(),
412 isIgnoreEngulfedDomains(),
414 final int domains_removed = domains_count - current_protein.getNumberOfProteinDomains();
415 _domains_stored -= domains_removed;
416 _domains_ignored_due_to_overlap += domains_removed;
418 addProtein( proteins, current_protein );
420 current_protein = null;
422 else if ( can_parse_domains && ( line.indexOf( "[no hits above thresholds]" ) == -1 ) ) {
423 final String[] s = line.split( "\\s+" );
424 if ( s.length != 10 ) {
425 throw new IOException( "unexpected format in hmmpfam output: \"" + line + "\" [line "
426 + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
428 final String id = s[ 0 ];
429 final String domain_count_str = s[ 1 ];
430 final String from_str = s[ 2 ];
431 final String to_str = s[ 3 ];
432 final String query_match_str = s[ 4 ];
433 final String hmm_match_str = s[ 7 ];
434 final String score_str = s[ 8 ];
435 final String e_value_str = s[ 9 ];
440 boolean is_complete_hmm_match = false;
441 boolean is_complete_query_match = false;
443 from = Integer.valueOf( from_str ).intValue();
445 catch ( final NumberFormatException e ) {
446 throw new IOException( "could not parse seq-f from \"" + line + "\" [line " + line_number
447 + "] in [" + getInputFile().getCanonicalPath() + "]" );
450 to = Integer.valueOf( to_str ).intValue();
452 catch ( final NumberFormatException e ) {
453 throw new IOException( "could not parse seq-t from \"" + line + "\" [line " + line_number
454 + "] in [" + getInputFile().getCanonicalPath() + "]" );
457 score = Double.valueOf( score_str ).doubleValue();
459 catch ( final NumberFormatException e ) {
460 throw new IOException( "could not parse score from \"" + line + "\" [line " + line_number
461 + "] in [" + getInputFile().getCanonicalPath() + "]" );
464 e_value = Double.valueOf( e_value_str ).doubleValue();
466 catch ( final NumberFormatException e ) {
467 throw new IOException( "could not parse E-value from \"" + line + "\" [line " + line_number
468 + "] in [" + getInputFile().getCanonicalPath() + "]" );
470 if ( hmm_match_str.equals( "[]" ) ) {
471 is_complete_hmm_match = true;
473 else if ( !( hmm_match_str.equals( ".]" ) || hmm_match_str.equals( "[." ) || hmm_match_str
474 .equals( ".." ) ) ) {
475 throw new IOException( "unexpected format in hmmpfam output: \"" + line + "\" [line "
476 + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
478 if ( query_match_str.equals( ".." ) ) {
479 is_complete_query_match = true;
481 else if ( !( query_match_str.equals( ".]" ) || query_match_str.equals( "[." ) || query_match_str
482 .equals( "[]" ) ) ) {
483 throw new IOException( "unexpected format in hmmpfam output: \"" + line + "\" [line "
484 + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
486 final String[] ct = domain_count_str.split( "/" );
487 if ( ct.length != 2 ) {
488 throw new IOException( "unexpected format in hmmpfam output: \"" + line + "\" [line "
489 + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
491 final String number_str = ct[ 0 ];
492 final String total_str = ct[ 1 ];
496 number = Integer.valueOf( ( number_str ) ).intValue();
498 catch ( final NumberFormatException e ) {
499 throw new IOException( "could not parse domain number from \"" + line + "\" [line " + line_number
500 + "] in [" + getInputFile().getCanonicalPath() + "]" );
503 total = Integer.valueOf( ( total_str ) ).intValue();
505 catch ( final NumberFormatException e ) {
506 throw new IOException( "could not parse domain count from \"" + line + "\" [line " + line_number
507 + "] in [" + getInputFile().getCanonicalPath() + "]" );
509 ++_domains_encountered;
510 boolean failed_cutoff = false;
511 if ( getIndividualDomainScoreCutoffs() != null ) {
512 if ( getIndividualDomainScoreCutoffs().containsKey( id ) ) {
513 final double cutoff = Double.parseDouble( getIndividualDomainScoreCutoffs().get( id ) );
514 if ( score < cutoff ) {
515 failed_cutoff = true;
519 throw new IOException( "could not find a score cutoff value for domain id \"" + id
520 + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
523 final String uc_id = id.toUpperCase();
524 if ( failed_cutoff ) {
525 ++_domains_ignored_due_to_individual_score_cutoff;
527 else if ( ( getEValueMaximum() != HmmPfamOutputParser.E_VALUE_MAXIMUM_DEFAULT )
528 && ( e_value > getEValueMaximum() ) ) {
529 ++_domains_ignored_due_to_e_value;
531 else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) {
532 ++_domains_ignored_due_to_duf;
534 else if ( isIgnoreVirusLikeIds()
535 && ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO )
536 || uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG )
537 || uc_id.startsWith( HCV ) || uc_id.startsWith( HERPES ) ) ) {
538 ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToVirusLikeIdCountsMap(), id );
539 ++_domains_ignored_due_to_virus_like_id;
541 else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) && getFilter().contains( id ) ) {
542 ++_domains_ignored_due_to_negative_domain_filter;
543 ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToNegativeDomainFilterCountsMap(), id );
546 final BasicDomain pd = new BasicDomain( id,
553 current_protein.addProteinDomain( pd );
557 } // while ( ( line = br.readLine() ) != null )
558 setTime( new Date().getTime() - start_time );
559 if ( !saw_double_slash ) {
560 throw new IOException( "file ends unexpectedly [line " + line_number + "]" );
565 public void setAllowNonUniqueQuery( final boolean allow_non_unique_query ) {
566 _allow_non_unique_query = allow_non_unique_query;
569 private void setDomainsEncountered( final int domains_encountered ) {
570 _domains_encountered = domains_encountered;
573 private void setDomainsIgnoredDueToDuf( final int domains_ignored_due_to_duf ) {
574 _domains_ignored_due_to_duf = domains_ignored_due_to_duf;
577 public void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) {
578 _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value;
581 public void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
582 _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff;
585 private void setDomainsIgnoredDueToNegativeDomainFilterCountsMap( final Map<String, Integer> domains_ignored_due_to_negative_domain_filter_counts_map ) {
586 _domains_ignored_due_to_negative_domain_filter_counts_map = domains_ignored_due_to_negative_domain_filter_counts_map;
589 private void setDomainsIgnoredDueToNegativeFilter( final int domains_ignored_due_to_negative_domain_filter ) {
590 _domains_ignored_due_to_negative_domain_filter = domains_ignored_due_to_negative_domain_filter;
593 private void setDomainsIgnoredDueToOverlap( final int domains_ignored_due_to_overlap ) {
594 _domains_ignored_due_to_overlap = domains_ignored_due_to_overlap;
597 private void setDomainsIgnoredDueToVirusLikeId( final int i ) {
598 _domains_ignored_due_to_virus_like_id = i;
601 private void setDomainsIgnoredDueToVirusLikeIdCountsMap( final Map<String, Integer> domains_ignored_due_to_virus_like_id_counts_map ) {
602 _domains_ignored_due_to_virus_like_id_counts_map = domains_ignored_due_to_virus_like_id_counts_map;
605 private void setDomainsStored( final int domains_stored ) {
606 _domains_stored = domains_stored;
609 private void setDomainsStoredSet( final SortedSet<String> _storeddomains_stored ) {
610 _domains_stored_set = _storeddomains_stored;
613 public void setEValueMaximum( final double e_value_maximum ) {
614 if ( e_value_maximum < 0.0 ) {
615 throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" );
617 _e_value_maximum = e_value_maximum;
620 public void setIgnoreDufs( final boolean ignore_dufs ) {
621 _ignore_dufs = ignore_dufs;
625 * To ignore domains which are completely engulfed by domains (individual
626 * ones or stretches of overlapping ones) with better support values.
629 * @param ignored_engulfed_domains
631 public void setIgnoreEngulfedDomains( final boolean ignore_engulfed_domains ) {
632 _ignore_engulfed_domains = ignore_engulfed_domains;
635 public void setIgnoreVirusLikeIds( final boolean ignore_virus_like_ids ) {
636 _ignore_virus_like_ids = ignore_virus_like_ids;
640 * Sets the individual domain score cutoff values (for example, gathering
641 * thresholds from Pfam). Domain ids are the keys, cutoffs the values.
643 * @param individual_domain_score_cutoffs
645 public void setIndividualDomainScoreCutoffs( final Map<String, String> individual_domain_score_cutoffs ) {
646 _individual_domain_score_cutoffs = individual_domain_score_cutoffs;
649 public void setMaxAllowedOverlap( final int max_allowed_overlap ) {
650 if ( max_allowed_overlap < 0 ) {
651 throw new IllegalArgumentException( "Attempt to set max allowed overlap to less than zero." );
653 _max_allowed_overlap = max_allowed_overlap;
656 private void setProteinsEncountered( final int proteins_encountered ) {
657 _proteins_encountered = proteins_encountered;
660 private void setProteinsIgnoredDueToFilter( final int proteins_ignored_due_to_filter ) {
661 _proteins_ignored_due_to_filter = proteins_ignored_due_to_filter;
664 private void setProteinsStored( final int proteins_stored ) {
665 _proteins_stored = proteins_stored;
668 public void setReturnType( final ReturnType return_type ) {
669 _return_type = return_type;
672 private void setTime( final long time ) {
676 public void setVerbose( final boolean verbose ) {
/**
 * The ways in which the set of filter domain ids can be applied.
 */
public enum FilterType {
    /** No filtering. */
    NONE,
    /** Only proteins with at least one domain from the filter are stored. */
    POSITIVE_PROTEIN,
    /** Only proteins without any domain from the filter are stored. */
    NEGATIVE_PROTEIN,
    /** Domains from the filter are dropped. */
    NEGATIVE_DOMAIN
}
/**
 * The kinds of protein objects the parser can create.
 */
public enum ReturnType {
    /** Each protein holds its domains as an unordered collection. */
    UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN
}