new mafft v 6.857 with extensions
[jabaws.git] / binaries / src / mafft / core / mltaln9.c
index e69c350..5e736f2 100644 (file)
@@ -2,6 +2,7 @@
 
 #define DEBUG 0
 
+#if 0
 int seqlen( char *seq )
 {
        int val = 0;
@@ -9,6 +10,26 @@ int seqlen( char *seq )
                if( *seq++ != '-' ) val++;
        return( val );
 }
+#else
+int seqlen( char *seq )
+{
+       int val = 0;
+       if( *newgapstr == '-' )
+       {
+               while( *seq )
+                       if( *seq++ != '-' ) val++;
+       }
+       else
+       {
+               while( *seq )
+               {
+                       if( *seq != '-' && *seq != *newgapstr ) val++;
+                       seq++;
+               }
+       }
+       return( val );
+}
+#endif
 
 int intlen( int *num )
 {
@@ -31,12 +52,13 @@ char seqcheck( char **seq )
 
                                fprintf( stderr, "========================================================================= \n" );
                                fprintf( stderr, "========================================================================= \n" );
-                               fprintf( stderr, "========================================================================= \n" );
                                fprintf( stderr, "=== \n" );
                                fprintf( stderr, "=== Alphabet '%c' is unknown.\n", (*seq)[i] );
                                fprintf( stderr, "=== Please check site %d in sequence %d.\n", i+1, (int)(seq-seqbk+1) );
                                fprintf( stderr, "=== \n" );
-                               fprintf( stderr, "========================================================================= \n" );
+                               fprintf( stderr, "=== To make an alignment having unusual characters (U, @, #, etc), try\n" );
+                               fprintf( stderr, "=== %% mafft --anysymbol input > output\n" );
+                               fprintf( stderr, "=== \n" );
                                fprintf( stderr, "========================================================================= \n" );
                                fprintf( stderr, "========================================================================= \n" );
                                return( (int)(*seq)[i] );
@@ -740,6 +762,43 @@ static void setnearest( int nseq, Bchain *acpt, float **eff, float *mindisfrompt
        }
 }
 
+static void setnearest_double_fullmtx( int nseq, Bchain *acpt, double **eff, double *mindisfrompt, int *nearestpt, int pos )
+{
+       int j;
+       double tmpfloat;
+       double **effptpt;
+       Bchain *acptj;
+
+       *mindisfrompt = 999.9;
+       *nearestpt = -1;
+
+//     if( (acpt+pos)->next ) effpt = eff[pos]+(acpt+pos)->next->pos-pos;
+
+//     for( j=pos+1; j<nseq; j++ )
+       for( acptj=(acpt+pos)->next; acptj!=NULL; acptj=acptj->next )
+       {
+               j = acptj->pos;
+//             if( (tmpfloat=*effpt++) < *mindisfrompt )
+               if( (tmpfloat=eff[pos][j]) < *mindisfrompt )
+               {
+                       *mindisfrompt = tmpfloat;
+                       *nearestpt = j;
+               }
+       }
+       effptpt = eff;
+//     for( j=0; j<pos; j++ )
+       for( acptj=acpt; (acptj&&acptj->pos!=pos); acptj=acptj->next )
+       {
+               j = acptj->pos;
+//             if( (tmpfloat=(*effptpt++)[pos-j]) < *mindisfrompt )
+               if( (tmpfloat=eff[j][pos]) < *mindisfrompt )
+               {
+                       *mindisfrompt = tmpfloat;
+                       *nearestpt = j;
+               }
+       }
+}
+
 
 
 static void loadtreeoneline( int *ar, float *len, FILE *fp )
@@ -767,7 +826,7 @@ static void loadtreeoneline( int *ar, float *len, FILE *fp )
 //     fprintf( stderr, "len[0] = %f, len[1] = %f\n", len[0], len[1] );
 }
 
-void loadtree( int nseq, int ***topol, float **len, char **name, int *nlen )
+void loadtree( int nseq, int ***topol, float **len, char **name, int *nlen, Treedep *dep )
 {
        int i, j, k, miniim, maxiim, minijm, maxijm;
        int *intpt, *intpt2;
@@ -804,7 +863,7 @@ void loadtree( int nseq, int ***topol, float **len, char **name, int *nlen )
                mindisfrom = AllocateFloatVec( njob );
                nearest = AllocateIntVec( njob );
                treetmp = AllocateCharVec( njob*50 );
-               nametmp = AllocateCharVec( 30 );
+               nametmp = AllocateCharVec( 31 );
                tree = AllocateCharMtx( njob, njob*50 );
        }
 
@@ -878,6 +937,7 @@ void loadtree( int nseq, int ***topol, float **len, char **name, int *nlen )
 #endif
 
                prevnode = hist[im];
+               if( dep ) dep[k].child0 = prevnode;
                nmemim = nmemar[im];
 
 //             fprintf( stderr, "prevnode = %d, nmemim = %d\n", prevnode, nmemim );
@@ -912,6 +972,7 @@ void loadtree( int nseq, int ***topol, float **len, char **name, int *nlen )
 
                nmemjm = nmemar[jm];
                prevnode = hist[jm];
+               if( dep ) dep[k].child1 = prevnode;
 
 //             fprintf( stderr, "prevnode = %d, nmemjm = %d\n", prevnode, nmemjm );
 
@@ -1331,7 +1392,7 @@ void loadtop( int nseq, float **eff, int ***topol, float **len ) // computes bra
 }
 
 
-void fixed_musclesupg_float_realloc_nobk_halfmtx_treeout( int nseq, float **eff, int ***topol, float **len, char **name, int *nlen )
+void fixed_musclesupg_float_realloc_nobk_halfmtx_treeout( int nseq, float **eff, int ***topol, float **len, char **name, int *nlen, Treedep *dep )
 {
        int i, j, k, miniim, maxiim, minijm, maxijm;
        int *intpt, *intpt2;
@@ -1352,7 +1413,7 @@ void fixed_musclesupg_float_realloc_nobk_halfmtx_treeout( int nseq, float **eff,
        float *mindisfrom = NULL; // by D.Mathog, a guess
        static char **tree;
        static char *treetmp;
-       static char *nametmp;
+       static char *nametmp, *nameptr, *tmpptr;
        FILE *fp;
        float (*clusterfuncpt[1])(float,float);
 
@@ -1379,25 +1440,34 @@ void fixed_musclesupg_float_realloc_nobk_halfmtx_treeout( int nseq, float **eff,
                nmemar = AllocateIntVec( njob );
                mindisfrom = AllocateFloatVec( njob );
                nearest = AllocateIntVec( njob );
-               treetmp = AllocateCharVec( njob*50 );
-               nametmp = AllocateCharVec( 30 );
-               tree = AllocateCharMtx( njob, njob*50 );
+               treetmp = AllocateCharVec( njob*150 );
+               nametmp = AllocateCharVec( 130 );
+               tree = AllocateCharMtx( njob, njob*150 );
        }
 
        
     for( i=0; i<nseq; i++ )
        {
-               for( j=0; j<30; j++ ) nametmp[j] = 0;
-               for( j=0; j<30; j++ ) 
+               for( j=0; j<130; j++ ) nametmp[j] = 0;
+               for( j=0; j<130; j++ ) 
                {
-                       if( isalnum( name[i][j] ) )
+                       if( name[i][j] == 0 )
+                               break;
+                       else if( isalnum( name[i][j] ) )
                                nametmp[j] = name[i][j];
                        else
                                nametmp[j] = '_';
                }
-               nametmp[30] = 0;
+               nametmp[129] = 0;
 //             sprintf( tree[i], "%d_l=%d_%.20s", i+1, nlen[i], nametmp+1 );
-               sprintf( tree[i], "%d_%.20s", i+1, nametmp+1 );
+               if( outnumber )
+                       nameptr = strstr( nametmp, "_numo_e" ) + 8;
+               else
+                       nameptr = nametmp + 1;
+
+               if( (tmpptr=strstr( nameptr, "_oripos__" )) ) nameptr = tmpptr + 9; // = -> _ no tame
+
+               sprintf( tree[i], "%d_%.60s", i+1, nameptr );
        }
        for( i=0; i<nseq; i++ )
        {
@@ -1440,6 +1510,7 @@ void fixed_musclesupg_float_realloc_nobk_halfmtx_treeout( int nseq, float **eff,
 
 
                prevnode = hist[im];
+               if( dep ) dep[k].child0 = prevnode;
                nmemim = nmemar[im];
                intpt = topol[k][0] = (int *)realloc( topol[k][0], ( nmemim + 1 ) * sizeof( int ) );
                if( prevnode == -1 )
@@ -1469,6 +1540,7 @@ void fixed_musclesupg_float_realloc_nobk_halfmtx_treeout( int nseq, float **eff,
                }
 
                prevnode = hist[jm];
+               if( dep ) dep[k].child1 = prevnode;
                nmemjm = nmemar[jm];
                intpt = topol[k][1] = (int *)realloc( topol[k][1], ( nmemjm + 1 ) * sizeof( int ) );
                if( !intpt )
@@ -1568,7 +1640,6 @@ void fixed_musclesupg_float_realloc_nobk_halfmtx_treeout( int nseq, float **eff,
                sprintf( treetmp, "(%s:%7.5f,%s:%7.5f)", tree[im], len[k][0], tree[jm], len[k][1] );
                strcpy( tree[im], treetmp );
 
-//             fprintf( stderr, "im,jm=%d,%d\n", im, jm );
                acjmprev = ac[jm].prev; 
                acjmnext = ac[jm].next; 
                acjmprev->next = acjmnext;
@@ -1615,8 +1686,298 @@ void fixed_musclesupg_float_realloc_nobk_halfmtx_treeout( int nseq, float **eff,
 
 }
 
+//void veryfastsupg_double_outtree( int nseq, double **eff, int ***topol, double **len, char **name )
+void fixed_musclesupg_double_treeout( int nseq, double **eff, int ***topol, double **len, char **name )
+{
+       int i, j, k, miniim, maxiim, minijm, maxijm;
+       int *intpt, *intpt2;
+       double tmpfloat;
+       double eff1, eff0;
+       static double *tmptmplen = NULL;
+       static int *hist = NULL;
+       static Bchain *ac = NULL;
+       int im = -1, jm = -1;
+       Bchain *acjmnext, *acjmprev;
+       int prevnode;
+       Bchain *acpti;
+       int *pt1, *pt2, *pt11, *pt22;
+       static int *nmemar;
+       int nmemim, nmemjm;
+       double minscore;
+       int *nearest = NULL; // by D.Mathog, a guess
+       double *mindisfrom = NULL; // by D.Mathog, a guess
+       static char **tree;
+       static char *treetmp;
+       static char *nametmp, *nameptr, *tmpptr;
+       FILE *fp;
+       double (*clusterfuncpt[1])(double,double);
+
 
-void fixed_musclesupg_float_realloc_nobk_halfmtx( int nseq, float **eff, int ***topol, float **len )
+       sueff1_double = 1 - SUEFF;
+       sueff05_double = SUEFF * 0.5;
+       if ( treemethod == 'X' )
+               clusterfuncpt[0] = cluster_mix_double;
+       else if ( treemethod == 'E' )
+               clusterfuncpt[0] = cluster_average_double;
+       else if ( treemethod == 'q' )
+               clusterfuncpt[0] = cluster_minimum_double;
+       else
+       {
+               fprintf( stderr, "Unknown treemethod, %c\n", treemethod );
+               exit( 1 );
+       }
+
+       if( !hist )
+       {
+               hist = AllocateIntVec( njob );
+               tmptmplen = AllocateDoubleVec( njob );
+               ac = (Bchain *)malloc( njob * sizeof( Bchain ) );
+               nmemar = AllocateIntVec( njob );
+               mindisfrom = AllocateDoubleVec( njob );
+               nearest = AllocateIntVec( njob );
+               treetmp = AllocateCharVec( njob*150 );
+               nametmp = AllocateCharVec( 91 );
+               tree = AllocateCharMtx( njob, njob*150 );
+       }
+
+       
+    for( i=0; i<nseq; i++ )
+       {
+               for( j=0; j<90; j++ ) nametmp[j] = 0;
+               for( j=0; j<90; j++ ) 
+               {
+                       if( name[i][j] == 0 )
+                               break;
+                       else if( isalnum( name[i][j] ) )
+                               nametmp[j] = name[i][j];
+                       else
+                               nametmp[j] = '_';
+               }
+               nametmp[90] = 0;
+//             sprintf( tree[i], "%d_%.60s", i+1, nametmp+1 );
+               if( outnumber )
+                       nameptr = strstr( nametmp, "_numo_e" ) + 8;
+               else
+                       nameptr = nametmp + 1;
+
+               if( (tmpptr=strstr( nameptr, "_oripos__" )) ) nameptr = tmpptr + 9; // = -> _ no tame
+
+               sprintf( tree[i], "%d_%.60s", i+1, nameptr );
+       }
+       for( i=0; i<nseq; i++ )
+       {
+               ac[i].next = ac+i+1;
+               ac[i].prev = ac+i-1;
+               ac[i].pos = i;
+       }
+       ac[nseq-1].next = NULL;
+
+       for( i=0; i<nseq; i++ ) setnearest_double_fullmtx( nseq, ac, eff, mindisfrom+i, nearest+i, i ); // muscle
+
+       for( i=0; i<nseq; i++ ) tmptmplen[i] = 0.0;
+       for( i=0; i<nseq; i++ ) 
+       {
+               hist[i] = -1;
+               nmemar[i] = 1;
+       }
+
+       fprintf( stderr, "\n" );
+       for( k=0; k<nseq-1; k++ )
+       {
+               if( k % 10 == 0 ) fprintf( stderr, "\r% 5d / %d", k, nseq );
+
+               minscore = 999.9;
+               for( acpti=ac; acpti->next!=NULL; acpti=acpti->next ) 
+               {
+                       i = acpti->pos;
+//                     fprintf( stderr, "k=%d i=%d\n", k, i );
+                       if( mindisfrom[i] < minscore ) // muscle
+                       {
+                               im = i;
+                               minscore = mindisfrom[i];
+                       }
+               }
+               jm = nearest[im];
+               if( jm < im ) 
+               {
+                       j=jm; jm=im; im=j;
+               }
+
+
+               prevnode = hist[im];
+               nmemim = nmemar[im];
+//             intpt = topol[k][0] = (int *)realloc( topol[k][0], ( nmemim + 1 ) * sizeof( int ) );
+               intpt = topol[k][0];
+               if( prevnode == -1 )
+               {
+                       *intpt++ = im;
+                       *intpt = -1;
+               }
+               else
+               {
+                       pt1 = topol[prevnode][0];
+                       pt2 = topol[prevnode][1];
+                       if( *pt1 > *pt2 )
+                       {
+                               pt11 = pt2;
+                               pt22 = pt1;
+                       }
+                       else
+                       {
+                               pt11 = pt1;
+                               pt22 = pt2;
+                       }
+                       for( intpt2=pt11; *intpt2!=-1; )
+                               *intpt++ = *intpt2++;
+                       for( intpt2=pt22; *intpt2!=-1; )
+                               *intpt++ = *intpt2++;
+                       *intpt = -1;
+               }
+
+               prevnode = hist[jm];
+               nmemjm = nmemar[jm];
+//             intpt = topol[k][1] = (int *)realloc( topol[k][1], ( nmemjm + 1 ) * sizeof( int ) );
+               intpt = topol[k][1];
+               if( prevnode == -1 )
+               {
+                       *intpt++ = jm;
+                       *intpt = -1;
+               }
+               else
+               {
+                       pt1 = topol[prevnode][0];
+                       pt2 = topol[prevnode][1];
+                       if( *pt1 > *pt2 )
+                       {
+                               pt11 = pt2;
+                               pt22 = pt1;
+                       }
+                       else
+                       {
+                               pt11 = pt1;
+                               pt22 = pt2;
+                       }
+                       for( intpt2=pt11; *intpt2!=-1; )
+                               *intpt++ = *intpt2++;
+                       for( intpt2=pt22; *intpt2!=-1; )
+                               *intpt++ = *intpt2++;
+                       *intpt = -1;
+               }
+
+               minscore *= 0.5;
+
+               len[k][0] = ( minscore - tmptmplen[im] );
+               len[k][1] = ( minscore - tmptmplen[jm] );
+
+
+               tmptmplen[im] = minscore;
+
+               hist[im] = k;
+               nmemar[im] = nmemim + nmemjm;
+
+               mindisfrom[im] = 999.9;
+               for( acpti=ac; acpti!=NULL; acpti=acpti->next )
+        {
+                       i = acpti->pos;
+            if( i != im && i != jm )
+            {
+                               if( i < im )
+                               {
+                                        miniim = i;
+                                        maxiim = im;
+                                        minijm = i;
+                                        maxijm = jm;
+                               }
+                               else if( i < jm )
+                               {
+                                        miniim = im;
+                                        maxiim = i;
+                                        minijm = i;
+                                        maxijm = jm;
+                               }
+                               else
+                               {
+                                        miniim = im;
+                                        maxiim = i;
+                                        minijm = jm;
+                                        maxijm = i;
+                               }
+                               eff0 = eff[miniim][maxiim];
+                               eff1 = eff[minijm][maxijm];
+#if 0
+                               tmpfloat = eff[miniim][maxiim] =
+                               MIN( eff0, eff1 ) * sueff1 + ( eff0 + eff1 ) * sueff05; 
+#else
+                tmpfloat = eff[miniim][maxiim] =
+                               (clusterfuncpt[0])( eff0, eff1 );
+#endif
+                               if( tmpfloat < mindisfrom[i]  )
+                               {
+                                       mindisfrom[i] = tmpfloat;
+                                       nearest[i] = im;
+                               }
+                               if( tmpfloat < mindisfrom[im]  )
+                               {
+                                       mindisfrom[im] = tmpfloat;
+                                       nearest[im] = i;
+                               }
+                               if( nearest[i] == jm )
+                               {
+                                       nearest[i] = im;
+                               }
+            }
+        }
+
+               sprintf( treetmp, "(%s:%7.5f,%s:%7.5f)", tree[im], len[k][0], tree[jm], len[k][1] );
+               strcpy( tree[im], treetmp );
+
+               acjmprev = ac[jm].prev; 
+               acjmnext = ac[jm].next; 
+               acjmprev->next = acjmnext;
+               if( acjmnext != NULL )
+                       acjmnext->prev = acjmprev;
+//             free( (void *)eff[jm] ); eff[jm] = NULL;
+
+#if 1 // muscle seems to miss this.
+               for( acpti=ac; acpti!=NULL; acpti=acpti->next )
+               {
+                       i = acpti->pos;
+                       if( nearest[i] == im ) 
+                       {
+//                             fprintf( stderr, "calling setnearest\n" );
+                               setnearest_double_fullmtx( nseq, ac, eff, mindisfrom+i, nearest+i, i );
+                       }
+               }
+#endif
+
+
+#if 0
+        fprintf( stdout, "vSTEP-%03d:\n", k+1 );
+               fprintf( stdout, "len0 = %f\n", len[k][0] );
+        for( i=0; topol[k][0][i]>-1; i++ ) fprintf( stdout, " %03d", topol[k][0][i]+1 );
+        fprintf( stdout, "\n" );
+               fprintf( stdout, "len1 = %f\n", len[k][1] );
+        for( i=0; topol[k][1][i]>-1; i++ ) fprintf( stdout, " %03d", topol[k][1][i]+1 );
+        fprintf( stdout, "\n" );
+#endif
+    }
+       fp = fopen( "infile.tree", "w" );
+               fprintf( fp, "%s\n", treetmp );
+       fclose( fp );
+
+       FreeCharMtx( tree );
+       free( treetmp );
+       free( nametmp );
+       free( (void *)tmptmplen ); tmptmplen = NULL;
+       free( hist ); hist = NULL;
+       free( (char *)ac ); ac = NULL;
+       free( (void *)nmemar ); nmemar = NULL;
+       free( mindisfrom );
+       free( nearest );
+}
+
+
+void fixed_musclesupg_float_realloc_nobk_halfmtx( int nseq, float **eff, int ***topol, float **len, Treedep *dep )
 {
        int i, j, k, miniim, maxiim, minijm, maxijm;
        int *intpt, *intpt2;
@@ -1706,6 +2067,7 @@ void fixed_musclesupg_float_realloc_nobk_halfmtx( int nseq, float **eff, int ***
 
 
                prevnode = hist[im];
+               if( dep ) dep[k].child0 = prevnode;
                nmemim = nmemar[im];
                intpt = topol[k][0] = (int *)realloc( topol[k][0], ( nmemim + 1 ) * sizeof( int ) );
                if( prevnode == -1 )
@@ -1735,6 +2097,7 @@ void fixed_musclesupg_float_realloc_nobk_halfmtx( int nseq, float **eff, int ***
                }
 
                prevnode = hist[jm];
+               if( dep ) dep[k].child1 = prevnode;
                nmemjm = nmemar[jm];
                intpt = topol[k][1] = (int *)realloc( topol[k][1], ( nmemjm + 1 ) * sizeof( int ) );
                if( !intpt )
@@ -2498,7 +2861,7 @@ void veryfastsupg_double( int nseq, double **eff, int ***topol, double **len )
 }
 #endif
 
-void veryfastsupg_double_outtree( int nseq, double **eff, int ***topol, double **len, char name[M][B] )
+void veryfastsupg_double_outtree( int nseq, double **eff, int ***topol, double **len, char **name )
 {
     int i, j, k, miniim, maxiim, minijm, maxijm;
        int *intpt, *intpt2;
@@ -2539,7 +2902,7 @@ void veryfastsupg_double_outtree( int nseq, double **eff, int ***topol, double *
                hist = AllocateIntVec( njob );
                tmptmplen = (double *)malloc( njob * sizeof( double ) );
                ac = (Achain *)malloc( njob * sizeof( Achain ) );
-               nametmp = AllocateCharVec( 30 );
+               nametmp = AllocateCharVec( 31 );
        }
 
 //     for( i=0; i<nseq; i++ ) sprintf( tree[i], "%d", i+1 );
@@ -5782,7 +6145,7 @@ void gapireru( char *res, char *ori, char *gt )
        {
                if( g == '-' )
                {
-                       *res++ = '-';
+                       *res++ = *newgapstr;
                }
                else
                {
@@ -6899,3 +7262,20 @@ float naivepairscore( int n1, int n2, char **seq1, char **seq2, double *eff1, do
        return( val );
 //     exit( 1 );
 }
+
+double plainscore( int nseq, char **s )
+{
+       int i, j, ilim;
+       double v = 0.0;
+       
+       ilim = nseq-1;
+       for( i=0; i<ilim; i++ ) for( j=i+1; j<nseq; j++ )
+       {
+               v += (double)naivepairscore11( s[i], s[j], penalty );
+       }
+
+       fprintf( stderr, "penalty = %d\n", penalty );
+
+       return( v );
+}
+