/*******************************************************
                        PFTOOLS
 *******************************************************
  Sep 26, 2011 xali1.c
 *******************************************************
 (C) 2011 Swiss Institute of Bioinformatics
     Thierry Schuepbach (thierry.schuepbach@isb-sib.ch)
 *******************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <stdbool.h>
#include <mmintrin.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#include <smmintrin.h>
#include "profile.h"

/* Evaluates to the larger of a and b.
 * Every argument and the whole expansion are parenthesised so the macro keeps its
 * meaning when used inside a larger expression (e.g. 10 * MAX(x, y)).
 * NOTE: the selected argument is evaluated twice; do not pass expressions with side effects. */
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
/*
 * Sign-extends the four low signed 16-bit lanes of X into four signed 32-bit lanes.
 *
 * Without NEWCONV the conversion is done with SSE2 only: a per-lane "0 > X" compare
 * yields 0xFFFF for negative lanes and 0x0000 otherwise, and interleaving that mask
 * above each data word produces the sign-extended 32-bit value.
 * With NEWCONV the SSE 4.1 intrinsic _mm_cvtepi16_epi32 is used directly.
 */
#ifndef NEWCONV
static __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _MM_CVTEPI16_EPI32(__m128i X) {
      // 0xFFFF in every 16-bit lane holding a negative value, 0 elsewhere
      const __m128i Sign = _mm_cmpgt_epi16(_mm_setzero_si128(), X);
      // Interleave sign with data to produce a 128 bit (4 x DWORD)
      return _mm_unpacklo_epi16(X, Sign);
}
#else
static __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _MM_CVTEPI16_EPI32(__m128i X) {
   return _mm_cvtepi16_epi32(X);
}
#endif

int xali1(const struct Profile * const restrict prf, const unsigned char * const restrict Sequence,
          int * const WORK, const size_t BSEQ, const size_t LSEQ, const int CutOff, const _Bool LOPT)
/*
 * Scores Sequence[BSEQ .. LSEQ-1] against the profile prf and returns the best score
 * found (score only, no alignment path is recorded).
 *
 * For every residue the routine walks profile positions 0..prf->Length and updates three
 * states, each as the maximum over its incoming transitions:
 *   M, I : kept per profile position in WORK (one row read, one row written, swapped per residue)
 *   D    : carried along the profile in the scalar KOPD
 * The running best exit score is kept in lScore, which starts at NLOW.
 * The last residue, Sequence[LSEQ-1], is handled separately using the LastSequenceProtein table.
 *
 *   prf      profile supplying Length and the Match / Insertion score tables
 *   Sequence residues, each used directly as a column index into a score row
 *   WORK     scratch buffer for the two M/I rows (see WARNING)
 *   BSEQ     index of the first residue processed by the main loop
 *   LSEQ     one past the index of the last residue
 *   CutOff   threshold for the early exit
 *   LOPT     when false, return as soon as lScore exceeds CutOff after a residue;
 *            when true, always process the whole sequence
 *
 * Returns lScore. An empty sequence (LSEQ == 0) returns NLOW without touching WORK.
 *
 * WARNING: WORK holds two rows of 2*(prf->Length+1) ints, and the second row is rounded up to
 *          the next 64-byte boundary (cache line). WORK should therefore be
 *          4 times the (profile size + 1) + 63 elements.
 *
 * NOTE: the IOPx_R / IOPx_W accessor macros defined below stay defined after this function.
 */
{
  int KOPD, lScore = (int) NLOW;

  // Nothing to score; this also keeps LSEQ-1 below from wrapping around.
  if (LSEQ == 0) return lScore;

#ifndef _SSE_4_1_
  // Scalar layout: the M row and the I row are two consecutive arrays of prf->Length+1 ints.
  const int * restrict IOPM_R;
  int * restrict IOPM_W = WORK;
  const int * restrict IOPI_R;
  int * restrict IOPI_W = &WORK[(prf->Length+1)];

#define IOPI_R(i) IOPI_R[i]
#define IOPI_W(i) IOPI_W[i]
#define IOPM_R(i) IOPM_R[i]
#define IOPM_W(i) IOPM_W[i]
#else
  // SSE 4.1 layout: M and I are interleaved so that both are written by one 64-bit store.
  typedef struct { int M; int I; } sIOP;
  const sIOP * restrict IOP_R;
  sIOP * restrict IOP_W = (sIOP*) WORK;

#define IOPI_R(i) IOP_R[i].I
#define IOPI_W(i) IOP_W[i].I
#define IOPM_R(i) IOP_R[i].M
#define IOPM_W(i) IOP_W[i].M
#endif

  register const short int (* const restrict Transitions)[INSERTION_TRANSITIONS_SIZE] = \
  (short int (*const restrict)[INSERTION_TRANSITIONS_SIZE]) prf->Scores.Insertion.Transitions;
  const short int * const restrict Match = prf->Scores.Match.Alphabet;
  const short int * const restrict Insertion = prf->Scores.Insertion.Alphabet;
  const size_t AlignStep = prf->Scores.Match.AlignStep;

  /* Initial row: seed M/I/D for every profile position before any residue is consumed.
   *
   * NOTE: The following part could be replaced and performed only once for a profile as it
   *       is profile dependent. Nevertheless it does a good job loading Match and Transition
   *       matrices into the cache hierarchy.
   */
  {
    register const short int * restrict lMatch = &Match[_D];
#ifndef _SSE_4_1_
    register const short int (* const restrict FirstSequenceProtein)[FIRST_SIZE] = prf->Scores.Insertion.FirstSequenceProtein;
    IOPM_W[0] = (int) FirstSequenceProtein[0][_YM];
    IOPI_W[0] = (int) FirstSequenceProtein[0][_YI];
    KOPD      = (int) FirstSequenceProtein[0][_YD];

    for (size_t iprf=1; iprf<=prf->Length; ++iprf) {
      register const int KD = KOPD + (int) *lMatch;
      lMatch += AlignStep;

      IOPM_W[iprf] = MAX( KD + (int) Transitions[iprf][_DM], (int) FirstSequenceProtein[iprf][_YM] );
      IOPI_W[iprf] = MAX( KD + (int) Transitions[iprf][_DI], (int) FirstSequenceProtein[iprf][_YI] );
      KOPD         = MAX( KD + (int) Transitions[iprf][_DD], (int) FirstSequenceProtein[iprf][_YD] );
    }
#else
    register const short int (* restrict FirstSequenceProtein)[FIRST_SIZE] = prf->Scores.Insertion.FirstSequenceProtein;
    IOP_W[0].M = (int) FirstSequenceProtein[0][_YM];
    IOP_W[0].I = (int) FirstSequenceProtein[0][_YI];
    KOPD       = (int) FirstSequenceProtein[0][_YD];
    FirstSequenceProtein++;
    register const short int (* restrict pTransitions)[INSERTION_TRANSITIONS_SIZE] = &Transitions[1];
    register sIOP * restrict pIOP = &IOP_W[1];

    // One pass per profile position 1..prf->Length; no pass at all for an empty profile,
    // exactly like the scalar loop above.
    for (size_t Remaining = prf->Length; Remaining != 0; --Remaining) {
      register const int KD = KOPD + (int) *lMatch;
      lMatch += AlignStep;

      // Broadcast KD to all four lanes
      const __m128i vKD = _mm_set1_epi32(KD);

      // Load the four 16-bit transitions starting at _DM, widen to 32 bit, add KD
      __m128i vTransitions = _mm_loadl_epi64((const __m128i*) &pTransitions[0][_DM]);
      vTransitions = _MM_CVTEPI16_EPI32(vTransitions);
      vTransitions = _mm_add_epi32(vTransitions, vKD);
      pTransitions++;

      // Load the four 16-bit entry scores starting at _YM and widen to 32 bit
      __m128i vFirst = _mm_loadl_epi64((const __m128i*) &FirstSequenceProtein[0][_YM]);
      vFirst = _MM_CVTEPI16_EPI32(vFirst);
      FirstSequenceProtein++;

      // Lane-wise maximum ( this is SSE 4.1 )
      const __m128i vMax = _mm_max_epi32(vTransitions, vFirst);

      // Lanes 0 and 1 are stored as the {M, I} pair
      _mm_storel_pi( (__m64*) pIOP, (__m128) vMax);
      pIOP++;

      // Lane 2 is the deletion state carried to the next position ( this is SSE 4.1 )
      KOPD = _mm_extract_epi32(vMax, 2);
    }
#endif
  }
#ifndef _PACKED_TRANSITIONS_
  register const short int (* const restrict IntermediateSequenceProtein)[INTERMEDIATE_SIZE] = prf->Scores.Insertion.IntermediateSequenceProtein;
#endif

  // The row just filled becomes the read row; the write row is the second half of WORK,
  // rounded up to a 64-byte boundary.
#ifndef _SSE_4_1_
  IOPM_R = IOPM_W;
  IOPI_R = IOPI_W;
  IOPM_W = (int *) (((uintptr_t) &WORK[2*(prf->Length+1)] + 63) & ~(uintptr_t) 63);
  IOPI_W = &IOPM_W[prf->Length+1];
#else
  IOP_R = IOP_W;
  IOP_W = (sIOP*) (((uintptr_t) &WORK[2*(prf->Length+1)] + 63) & ~(uintptr_t) 63);
#endif

  // Main loop: every residue except the last one.
  for (size_t iseq=BSEQ; iseq < LSEQ-1; ++iseq) {
    register const int j1 = (int) Sequence[iseq];
    int KOPM = IOPM_R(0);
    register const short int * restrict lInsertion = Insertion;

    // Profile position 0: only the insertion state and the X (entry) transitions feed in.
    {
      register const int KI = IOPI_R(0) + lInsertion[j1];
#ifndef _SSE_4_1_
      IOPM_W(0) = MAX( KI + (int) Transitions[0][_IM] , (int) Transitions[0][_XM] );
      IOPI_W(0) = MAX( KI + (int) Transitions[0][_II] , (int) Transitions[0][_XI] );
      KOPD      = MAX( KI + (int) Transitions[0][_ID] , (int) Transitions[0][_XD] );
  #ifndef _PACKED_TRANSITIONS_
      lScore = MAX( KI + (int) IntermediateSequenceProtein[0][_IX] , lScore );
  #else
      lScore = MAX( KI + (int) Transitions[0][_IX] , lScore );
  #endif
#else
  #ifndef _PACKED_TRANSITIONS_
      lScore  = MAX( KI + (int) IntermediateSequenceProtein[0][_IX] , lScore );
  #endif
      // Broadcast KI, load the four 16-bit transitions starting at _IM, widen and add
      const __m128i vKI = _mm_set1_epi32(KI);
      __m128i vTransitionsI = _mm_loadl_epi64((const __m128i*) &Transitions[0][_IM]);
      vTransitionsI = _MM_CVTEPI16_EPI32(vTransitionsI);
      vTransitionsI = _mm_add_epi32(vTransitionsI, vKI);

      // Load the four 16-bit transitions starting at _XM and widen
      __m128i vTransitionsX = _mm_loadl_epi64((const __m128i*) &Transitions[0][_XM]);
      vTransitionsX = _MM_CVTEPI16_EPI32(vTransitionsX);

  #ifdef _PACKED_TRANSITIONS_
      // Lane 3 competes against the running score
      vTransitionsX = _mm_insert_epi32(vTransitionsX, lScore, 3);
  #endif

      // Lane-wise maximum ( this is SSE 4.1 )
      const __m128i vMax = _mm_max_epi32(vTransitionsI, vTransitionsX);

      // Lanes 0 and 1 are stored as the {M, I} pair
      _mm_storel_pi( (__m64*) IOP_W, (__m128) vMax);

      // Lane 2 is the deletion state at position 0. It must be refreshed for every residue,
      // as the scalar path does; otherwise the value left over from the previous residue's
      // last profile position would be carried into this row.
      KOPD = _mm_extract_epi32(vMax, 2);

  #ifdef _PACKED_TRANSITIONS_
      // Lane 3 is the updated running score
      lScore = _mm_extract_epi32(vMax, 3);
  #endif
#endif
    }

    lInsertion += AlignStep;
    register const short int * restrict lMatch = Match;

#ifndef _SSE_4_1_
    for (size_t iprf=1; iprf<=prf->Length; ++iprf ) {
      // Scores of arriving from M (previous row, previous position), I (previous row, same
      // position) and D (same row, previous position)
      const int KM = KOPM         + lMatch[j1];
      const int KI = IOPI_R(iprf) + lInsertion[j1];
      const int KD = KOPD         + lMatch[_D];

      lMatch     += AlignStep;
      lInsertion += AlignStep;

      // Remember this position's M for the next position before it is needed again
      KOPM = IOPM_R(iprf);

      const int tIOPM1 = MAX( KM + (int) Transitions[iprf][_MM] ,      (int) Transitions[iprf][_XM] );
      const int tIOPM2 = MAX( KI + (int) Transitions[iprf][_IM] , KD + (int) Transitions[iprf][_DM] );
      IOPM_W(iprf)       = MAX( tIOPM1, tIOPM2);

      const int tIOPI1 = MAX( KM + (int) Transitions[iprf][_MI] ,      (int) Transitions[iprf][_XI] );
      const int tIOPI2 = MAX( KI + (int) Transitions[iprf][_II] , KD + (int) Transitions[iprf][_DI] );
      IOPI_W(iprf)       = MAX( tIOPI1, tIOPI2);

      const int tIOPD1 = MAX( KM + (int) Transitions[iprf][_MD] ,      (int) Transitions[iprf][_XD] );
      const int tIOPD2 = MAX( KI + (int) Transitions[iprf][_ID] , KD + (int) Transitions[iprf][_DD] );
      KOPD             = MAX( tIOPD1, tIOPD2);
  #ifndef _PACKED_TRANSITIONS_
      const int tIOPT1 = MAX( KM + (int) IntermediateSequenceProtein[iprf][_MX], KI + (int) IntermediateSequenceProtein[iprf][_IX] );
      const int tIOPT2 = MAX( lScore                                           , KD + (int) IntermediateSequenceProtein[iprf][_DX] );
  #else
      const int tIOPT1 = MAX( KM + (int) Transitions[iprf][_MX], KI + (int) Transitions[iprf][_IX] );
      const int tIOPT2 = MAX( lScore                           , KD + (int) Transitions[iprf][_DX] );
  #endif
      lScore           = MAX( tIOPT1, tIOPT2);
    }

    // Swap Read and Write pointers
    int * ptr = IOPM_W;
    IOPM_W = (int*) IOPM_R;
    IOPM_R = ptr;
    ptr    = IOPI_W;
    IOPI_W = (int*) IOPI_R;
    IOPI_R = ptr;
#else
    for (size_t iprf=1; iprf<=prf->Length; ++iprf ) {
      const int KM = KOPM         + (int) lMatch[j1];
      const int KI = IOPI_R(iprf) + (int) lInsertion[j1];
      const int KD = KOPD         + (int) lMatch[_D];

      lMatch     += AlignStep;
      lInsertion += AlignStep;

      // Remember this position's M for the next position before it is needed again
      KOPM = IOPM_R(iprf);

      // From M: broadcast KM, load four 16-bit transitions starting at _MM, widen and add
      const __m128i vKM = _mm_set1_epi32(KM);
      __m128i vTransitionsM = _mm_loadl_epi64((const __m128i*) &Transitions[iprf][_MM]);
      vTransitionsM = _MM_CVTEPI16_EPI32(vTransitionsM);
      vTransitionsM = _mm_add_epi32(vTransitionsM, vKM);

      // From I: same with KI and the transitions starting at _IM
      const __m128i vKI = _mm_set1_epi32(KI);
      __m128i vTransitionsI = _mm_loadl_epi64((const __m128i*) &Transitions[iprf][_IM]);
      vTransitionsI = _MM_CVTEPI16_EPI32(vTransitionsI);
      vTransitionsI = _mm_add_epi32(vTransitionsI, vKI);

      // From D: same with KD and the transitions starting at _DM
      const __m128i vKD = _mm_set1_epi32(KD);
      __m128i vTransitionsD = _mm_loadl_epi64((const __m128i*) &Transitions[iprf][_DM]);
      vTransitionsD = _MM_CVTEPI16_EPI32(vTransitionsD);
      vTransitionsD = _mm_add_epi32(vTransitionsD, vKD);

      // From X: transitions starting at _XM, nothing added
      __m128i vTransitionsX = _mm_loadl_epi64((const __m128i*) &Transitions[iprf][_XM]);
      vTransitionsX = _MM_CVTEPI16_EPI32(vTransitionsX);

  #ifndef _PACKED_TRANSITIONS_
      // Exit scores come from a separate table and are reduced in scalar code
      const int tIOPT1 = MAX( KM + (int) IntermediateSequenceProtein[iprf][_MX], KI + (int) IntermediateSequenceProtein[iprf][_IX] );
      const int tIOPT2 = MAX( lScore                                           , KD + (int) IntermediateSequenceProtein[iprf][_DX] );
  #else
      // Lane 3 competes against the running score
      vTransitionsX = _mm_insert_epi32(vTransitionsX, lScore, 3);
  #endif

      // Lane-wise maximum over the four sources ( this is SSE 4.1 )
      const __m128i vMaxMI = _mm_max_epi32(vTransitionsM, vTransitionsI);
      const __m128i vMaxDX = _mm_max_epi32(vTransitionsD, vTransitionsX);
      const __m128i vMax   = _mm_max_epi32(vMaxMI, vMaxDX);

      // Lanes 0 and 1 are stored as the {M, I} pair
      _mm_storel_pi( (__m64*) &IOP_W[iprf], (__m128) vMax);

      // Lane 2 is the deletion state carried to the next position ( this is SSE 4.1 )
      KOPD = _mm_extract_epi32(vMax, 2);

  #ifndef _PACKED_TRANSITIONS_
      lScore = MAX( tIOPT1, tIOPT2);
  #else
      lScore = _mm_extract_epi32(vMax, 3);
  #endif
    }

    // Swap Read and Write pointers
    sIOP * ptr = IOP_W;
    IOP_W = (sIOP*) IOP_R;
    IOP_R = ptr;
#endif

    // Early exit once the threshold is passed, unless the caller asked for the full scan
    if ( ! LOPT && lScore > CutOff) return lScore;
  }

  // Last residue: only the D recurrence and the exit scores are evaluated, no M/I row is written.
  {
    register const short int * restrict lInsertion = Insertion;
    const int j1 = (int) Sequence[LSEQ-1];
    int KOPM     = IOPM_R(0);
    int KI       = IOPI_R(0) + lInsertion[j1];

    KOPD   = MAX( KI + (int) Transitions[0][_ID],      (int) Transitions[0][_XD] );
    register const short int (* const restrict LastSequenceProtein)[LAST_SIZE] = prf->Scores.Insertion.LastSequenceProtein;
    lScore = MAX( lScore                        , KI + (int) LastSequenceProtein[0][_IY] );

    register const short int * restrict lMatch = Match;
    lInsertion += AlignStep;

    for (size_t iprf=1; iprf<=prf->Length; ++iprf) {
      const int KM = KOPM         + lMatch[j1];
      KI           = IOPI_R(iprf) + lInsertion[j1];
      const int KD = KOPD         + lMatch[_D];

      lMatch     += AlignStep;
      lInsertion += AlignStep;

      KOPM = IOPM_R(iprf);

      const int tIOPD1 = MAX( KM + (int) Transitions[iprf][_MD],      (int) Transitions[iprf][_XD] );
      const int tIOPD2 = MAX( KI + (int) Transitions[iprf][_ID], KD + (int) Transitions[iprf][_DD] );
      KOPD             = MAX( tIOPD1, tIOPD2);

      const int tIOPT1 = MAX( KM + (int) LastSequenceProtein[iprf][_MY], KI + (int) LastSequenceProtein[iprf][_IY] );
      const int tIOPT2 = MAX( lScore                                   , KD + (int) LastSequenceProtein[iprf][_DY] );
      lScore           = MAX( tIOPT1, tIOPT2);
    }
  }
  return lScore;
}

#undef MAX
