/*
 * 
 * G6CHIP_SIMLIB.C
 * Created on 2000/3/11
 * Based on g6chip.c
 *
 * full simulator of a single chip with external memory unit
 *
 * Copyright Jun Makino 1997
 *
 * Version 1.0 98/02/05
 * Version 1.1 98/04/26
 *   routines to generate test vectors added
 *
 * Debug note:
     pop->pacc_sum_flag = accflags[6];
      corrected to 
     pop->pacc_sum_flag |= accflags[6];
     in force_pipeline_step
     1998/8/29

 *
 *

 Note: Guide to setting xunit, tunit, xscale etc...
       xunit, tunit: the location of the binary point
                     for the fixed point format, counted
		     from LSB. 2^(63-xunit) gives the max value
		     for position. For systems with the standard
		     unit, xunit=tunit=54 should work fine
       fscale: -fscale+xunit*2-512 is used as the argument
               for ldexp. If the force is around 2^k, the argument
	       for ldexp should be k-(48-56), which means
	       fscale = (48-56)-512+xunit*2-k ??

       jscale: similarly,
	       jscale = (48-56)-512+xunit*3-k ??
       pscale: similarly,
	       pscale = (48-56)-512+xunit-k ??
       
 */

#include <stdlib.h>

#include "grape6sim.h"

#define G6CHIP_BODY

#include "g6chip_simlib.h"

#include "../g6hib/grape6util.h"


#define NMAX 100000
static int njp;
static int nip;

static struct jparticle jpmem[NMAX];
static struct predicted_particle predmem[NMAX];
static struct iparticle ipmem[NMAX];


static int dump_mode = 0;







/*
 * simg6_getnpipe_ : return the compile-time constant NPIPEPERCHIP.
 * (The trailing underscore is presumably for Fortran linkage -- confirm.)
 */
int simg6_getnpipe_()
{
  const int pipes_per_chip = NPIPEPERCHIP;

  return pipes_per_chip;
}

/*
 * set_tunit : store newtunit in the variable tunit, which other routines
 * in this file read when scaling velocities and jerk.
 */
void set_tunit(int newtunit)
{
  tunit = newtunit;
}

/*
 * simg6_set_tunit_ : pass-by-reference wrapper around set_tunit().
 *   newtunit : pointer to the new tunit value (must not be NULL).
 */
void simg6_set_tunit_(int * newtunit)
{
  const int value = *newtunit;

  set_tunit(value);
}


/*
 * simg6_set_debug_level_ : pass-by-reference wrapper around
 * set_debug_level().
 *   i : pointer to the new debug level (must not be NULL).
 */
void  simg6_set_debug_level_(int * i)
{
  const int level = *i;

  set_debug_level(level);
}

/*
 * set_xunit : set the binary-point location used for positions and
 * refresh the derived scale factors.
 *
 *   newxunit : new value for xunit.
 *
 * On return:
 *   xunit     = newxunit
 *   xscaleinv = 1 / 2^xunit
 *   xscale2   = (2^xunit)^2
 *
 * xscale2 is used as a temporary holding 2^xunit before it is squared,
 * so the statement order below matters.
 */
void set_xunit(int newxunit)
{
  /* newxunit is an int, so the conversion specifier has to be %d */
  dprintf(3,"(set_xunit) arg = %d\n", newxunit);
  xunit = newxunit;
  xscale2 = (ULONG_ONE)<<((int)xunit);
  xscaleinv = 1.0/xscale2;
  xscale2 *= xscale2;
}
/*
 * simg6_set_xunit_ : pass-by-reference wrapper around set_xunit().
 *   pxunit : pointer to the new xunit value (must not be NULL).
 */
void simg6_set_xunit_(int * pxunit)
{
  const int value = *pxunit;

  set_xunit(value);
}


/*
 * adjust_scaling_parameters : add the unit-dependent offsets to the three
 * scale fields of an i-particle, in place.
 *
 *   fscale   += 2*xunit         - 512
 *   jscale   += 2*xunit + tunit - 512
 *   phiscale +=   xunit         - 512
 */
void  adjust_scaling_parameters(struct iparticle * inreg)
{
  inreg->fscale   = inreg->fscale   + (xunit*2 - 512);
  inreg->jscale   = inreg->jscale   + (xunit*2+tunit - 512);
  inreg->phiscale = inreg->phiscale + (xunit - 512);
}
  




/*
 * set_i_particle_data_on_emulator : convert one i-particle from double
 * precision to the simulator's internal formats and store it in
 * simgp->ipmem[address].
 *
 *   x, v     : position and velocity (3 components each)
 *   eps2, h2 : multiplied by xscale2 before conversion
 *   rscale   : stored as the converted value of xscaleinv/rscale
 *   index, fscale, jscale, phiscale : copied unchanged
 *
 * No range check is made on address.
 */
void set_i_particle_data_on_emulator(SIMGRAPE6_CLUSTER_PTR simgp,
				     int address,
				     double x[3], /* position */
				     double v[3], /* velocity */
				     double eps2,
				     double h2,
				     double rscale,
				     int index,
				     int fscale,
				     int  jscale,
				     int  phiscale)
{
    struct iparticle * target;
    int axis = 0;

    if (simgp->simulator_verbose_level > 0){
	fprintf(stderr,"set_ip_on_sim: address = %d\n", address);
    }

    target = simgp->ipmem + address;

    /* position goes to fixed point, velocity (rescaled by 2^(xunit-tunit))
       to the pipeline floating-point format */
    while (axis < 3){
	target->xi[axis] = CONVERT_DOUBLE_TO_GRAPE_INT_POS(x[axis],xunit);
	target->vi[axis] = convert_double_to_grape_float(ldexp(v[axis],(int)(xunit-tunit)),
							 INTERACTION_F_LEN_U);
	axis++;
    }

    target->eps2 = convert_double_to_grape_float(eps2*xscale2, INTERACTION_F_LEN_U);
    target->h2 = convert_double_to_grape_float(h2*xscale2, INTERACTION_F_LEN_U);
    target->rscale = convert_double_to_grape_float(xscaleinv/rscale,CUTOFF_MANTISSA_LEN);

    /* plain copies */
    target->index = index;
    target->fscale = fscale;
    target->jscale = jscale;
    target->phiscale = phiscale;
#ifdef INTERNAL_OUT
    fprintf(stderr,"xi = %lx %lx %lx %lx\n", (ULONG) (&target->xi[0]),target->xi[0],target->xi[1],target->xi[2]);
#endif
}


static int acc_point  = 50;

static int jerk_point = 26;




/*
 * set_j_particle_on_emulator : store one j-particle in simg6p->jmem[address].
 *
 *   address : slot in jmem; valid range is 0 .. NMAX-1 (jmem is allocated
 *             with NMAX elements in initialize_simulator)
 *   index   : particle index, copied unchanged
 *   mass    : converted to the pipeline floating-point format
 *   tjlsb, dtjmsb : copied unchanged
 *   ix      : 3 x 5 array of words, copied unchanged
 *
 * Returns 0 on success, 1 if address is out of range (nothing is written).
 */
ULONG set_j_particle_on_emulator(SIMGRAPE6_CLUSTER_PTR simg6p,
				 int address,
				 int index,
				 double mass,
				 ULONG tjlsb,
				 ULONG dtjmsb,
				 ULONG ix[3][5])
{
    int i,j;
    struct jparticle *jp;

    /* validate before forming the element pointer; NMAX itself is one
       past the last valid slot */
    if (address < 0 || address >= NMAX) return 1;

    jp = &(simg6p->jmem[address]);
    jp->index = index;
    jp->mass = convert_double_to_grape_float(mass, INTERACTION_F_LEN_U);
    jp->tjlsb = tjlsb;
    jp->dtjmsb = dtjmsb;
    for(i = 0;i<3;i++){
	for(j=0;j<5;j++){
	    jp->ix[i][j] = ix[i][j];
	}
    }
    return 0;
}

static ULONG iti;


/*
 * sim_set_ti : record the time word that get_predictor() later passes
 * to predict().
 */
void sim_set_ti(ULONG iti_arg)
{
    iti = iti_arg;
}


/*
 * get_predictor : run predict() for the j-particle stored at
 * simg6p->jmem[address], using the time word set by sim_set_ti().
 *
 *   ixp  : out, 3 words written through a ULONG* view by predict()
 *          called with second argument ULONG_ZERO
 *   ivp  : out, 3 words written by predict() called with second
 *          argument ULONG_ONE
 *   mass : out, copy of the stored mass word
 *
 * (Presumably ULONG_ZERO/ULONG_ONE select position/velocity prediction;
 * confirm against predict().)
 */
void get_predictor(SIMGRAPE6_CLUSTER_PTR simg6p,
		   int address,
		   LONG ixp[3],
		   ULONG ivp[3],
		   ULONG *mass)
{
    struct jparticle * src = &(simg6p->jmem[address]);
    int axis;

    *mass = src->mass;
      dprintf(4,"predictor for address = %d\n", address);

    for (axis = 0; axis < 3; axis++){
	/* the stored words are handed over highest index first */
	predict(((ULONG*)ixp) + axis, ULONG_ZERO, iti,
		src->tjlsb, src->dtjmsb,
		src->ix[axis][4], src->ix[axis][3], src->ix[axis][2],
		src->ix[axis][1], src->ix[axis][0]);
	predict(&ivp[axis], ULONG_ONE, iti,
		src->tjlsb, src->dtjmsb,
		src->ix[axis][4], src->ix[axis][3], src->ix[axis][2],
		src->ix[axis][1], src->ix[axis][0]);
    }
}


/*
 * predict_in_chip : fill simg6p->pmem[0 .. njp-1] from the j-particle
 * memory: copy the index and run get_predictor() for each slot.
 *
 * convert_predicted_result() is also called for every component; its
 * outputs are only printed when INTERNAL_OUT is defined.
 */
void predict_in_chip(SIMGRAPE6_CLUSTER_PTR simg6p, int njp )
{
  int slot;

  for (slot = 0; slot < njp; slot++){
    struct predicted_particle * pp = simg6p->pmem + slot;
    int axis;

    pp->index = simg6p->jmem[slot].index;
    get_predictor(simg6p, slot, pp->xj, pp->vj, &(pp->mass));
#ifdef INTERNAL_OUT
    fprintf(stderr,"(predict_in_chip), i, index, x %d %d %lx %lx %lx\n",
	    slot, pp->index,pp->xj[0], pp->xj[1], pp->xj[2]);
#endif
    axis = 0;
    while (axis < 3){
	double xp, vp;
	convert_predicted_result(&xp, &vp, pp->xj[axis], pp->vj[axis],
				 xunit, tunit);
#ifdef INTERNAL_OUT
	fprintf(stderr,"(predict_in_chip), i,k,  x, v %d %d %le %le\n",
		slot, axis, xp, vp);
#endif
	axis++;
    }
  }
}

/*
 * force_pipeline_step : evaluate the interaction of one predicted
 * j-particle with the i-particle held in one pipeline.
 *
 *   predmem : array of predicted j-particles; element j is used
 *   j       : index into predmem
 *   chip    : chip whose inreg[ipipe] supplies the i-particle and whose
 *             outreg[ipipe] receives the results
 *   ipipe   : pipeline index within the chip
 *   nbflag  : out; receives the neighbour flag reported by force()
 *   clear   : forwarded unchanged to force()
 *
 * After force() returns, the nearest-neighbour registers (rnnb, innb) are
 * updated when the j and i indices differ, and the accumulator flags
 * returned in accflags[] are OR-ed into the output register set.
 */
void force_pipeline_step(struct predicted_particle * predmem,
			 int j,
			 struct gchip *chip,
			 int ipipe,
			 int * nbflag,
			 int clear)
{
    struct predicted_particle * pjp;
    struct iparticle * pip;
    struct pipe_output_register_set * pop;
    ULONG r2, unbflag;
    ULONG flags;
    ULONG accflags[7];
    int k;
    pjp = predmem+j;
    pip = &(chip->inreg[ipipe]);
    pop = &(chip->outreg[ipipe]);
    
    dprintf(1,"force_pipeline_step, %d %d\n", j, ipipe);
    
    /*    if(pjp->index != pip->index){*/
    
#ifdef INTERNAL_OUT    
    fprintf(stderr,"xj = %lx %lx %lx %lx\n", (ULONG) (&pjp->xj[0]),pjp->xj[0],pjp->xj[1],pjp->xj[2]);
    fprintf(stderr,"xi = %lx %lx %lx %lx\n", (ULONG) (&pip->xi[0]),pip->xi[0],pip->xi[1],pip->xi[2]);
#endif
    /* NOTE: the return value stored in flags is not read again in this
       function; the outputs used below are unbflag, r2 and accflags[] */
    flags = force(pjp->xj, pjp->vj, pjp->mass,
		  pip->xi, pip->vi, pip->eps2, pip->h2,
		  pip->rscale, pip->fscale, pip->jscale, pip->phiscale,
		  pjp->index,pip->index,
		  clear,
		  pop->acc,pop->jerk, &(pop->phi), &unbflag, &r2,accflags);
    /* ULONG flag is narrowed to the caller's int */
    *nbflag = unbflag;
#ifdef INTERNAL_OUT
    fprintf(stderr,"acc = %lx %lx %lx\n", pop->acc[0],pop->acc[1],pop->acc[2]);
#endif    
    /* set flags... */
    
    /* first, update nearest particle register */
    /* skipped when the j-particle has the same index as the i-particle;
       presumably compare_grape_floats() is true when rnnb > r2 -- confirm */
    if(pjp->index != pip->index){
	if (compare_grape_floats(pop->rnnb, r2, INTERACTION_F_LEN_U)){
	    pop->rnnb = r2;
	    pop->innb = pjp->index;
	}
    }
#ifdef INTERNAL_OUT
    fprintf(stderr,"rnnb, innb = %lx %lx \n", pop->rnnb, pop->innb);
#endif
    /* set overflow flags etc... */
    /* accflags[0..2] go into facc_sum_flag and accflags[3..5] into
       jacc_sum_flag, component k shifted left by 3*k bits */
    for(k=0;k<3;k++){
	if(k == 0){
	    pop->facc_sum_flag |= accflags[k];
	    pop->jacc_sum_flag |= accflags[k+3];
	}else{
	    pop->facc_sum_flag |= accflags[k]<<(k*3);
	    pop->jacc_sum_flag |= accflags[k+3]<<(k*3);
	}
    }
    pop->pacc_sum_flag |= accflags[6];
    /* the above used to be simple = until 1998/8/29 !!! */
    /* unshifted copy of all seven flag words, also accumulated with OR */
    for(k=0;k<7;k++) pop->sum_raw_flags[k] |= accflags[k];
    dprintf(1,"force_pipeline_step flags = %lx %lx %lx %lx %lx %lx %lx\n",
	    pop->sum_raw_flags[0],
	    pop->sum_raw_flags[1],
	    pop->sum_raw_flags[2],
	    pop->sum_raw_flags[3],
	    pop->sum_raw_flags[4],
	    pop->sum_raw_flags[5],
	    pop->sum_raw_flags[6]);
#ifdef PRINT_TEST_PATTERN    
    print_forcepipe_test_pattern(pjp->xj, pjp->vj, pjp->mass,
				 pip->xi, pip->vi, pip->eps2, pip->h2,
				 pip->rscale, pip->fscale, pip->jscale, pip->phiscale,
				 pjp->index, pip->index,
				 clear,
				 pop->acc, pop->jerk,pop->phi,
				 unbflag, pop->rnnb, pop->innb, pop->sum_raw_flags);
#endif    
    
}

/*
 * simg6_check_overflow_ : OR together the FADD_OVERFLOW_MASK bits of the
 * three 3-bit fields (shifts 0, 3 and 6) of *aflag and *jflag, and of the
 * lowest field of *pflag.
 *
 * Returns the combined bits; zero means none of the masked bits was set.
 */
int simg6_check_overflow_(int * aflag, int * jflag, int * pflag)
{
    int combined = 0;
    int axis;

    for (axis = 0; axis < 3; axis++){
	const int shift = axis*3;

	combined |= (((*aflag)>>shift) & FADD_OVERFLOW_MASK);
	combined |= (((*jflag)>>shift) & FADD_OVERFLOW_MASK);
    }
    combined |= (*pflag & FADD_OVERFLOW_MASK);
    return combined;
}

/*
 * set_g6chip_nip : set the file-scope count of active pipelines that
 * force_chip_pipeline_step() loops over.
 */
void set_g6chip_nip(int n)
{
    nip = n;
}
/*
 * force_chip_pipeline_step : apply one predicted j-particle to every
 * active pipeline of a chip and record neighbour hits.
 *
 *   predmem : array of predicted j-particles
 *   jindex  : element of predmem to use
 *   chip    : chip state (input/output registers, neighbour memories)
 *   clear   : forwarded to force_pipeline_step()
 *
 * Pipelines 0 .. nip-1 are evaluated.  The per-pipeline neighbour flags
 * are then handled in three groups of 16: if any flag in a group is set,
 * a 16-bit word (bit b = lowest bit of the flag of pipeline group*16+b)
 * and the j-particle index are appended to chip->nbmem[group].  When that
 * memory already holds NNBMAX entries, its overflown field is set instead.
 */
void force_chip_pipeline_step(struct predicted_particle *predmem,
			      int jindex,
			      struct gchip *chip,
			      int clear)
{
    int nbflags[NPIPEPERCHIP];
    int pipe, group;

    for (pipe = 0; pipe < NPIPEPERCHIP; pipe++){
	nbflags[pipe] = 0;
    }
    for (pipe = 0; pipe < nip; pipe++){
	force_pipeline_step(predmem, jindex, chip, pipe, &nbflags[pipe], clear);
    }

    for (group = 0; group < 3; group++){
	struct neighbour_memory * nmp = &(chip->nbmem[group]);
	const int base = group*16;
	int anynb = 0;
	int bit;
	int inb;

	/* stop scanning at the first flagged pipeline of the group */
	for (bit = 0; bit < 16 && !anynb; bit++){
	    if (nbflags[base + bit] != 0){
		anynb = 1;
	    }
	}
	if (!anynb){
	    continue;
	}

	inb = nmp->nnb;
	if (inb < NNBMAX){
	    ULONG flagword = ULONG_ZERO;

	    /* build the word from the highest pipeline down so that the
	       first pipeline of the group ends up in bit 0 */
	    for (bit = 15; bit >= 0; bit--){
		flagword <<= 1;
		flagword |= (nbflags[base + bit] & ULONG_ONE);
	    }
	    nmp->nbflags[inb] = flagword;
	    nmp->index[inb] = (predmem+jindex)->index;
	    nmp->nnb = inb + 1;
	}else{
	    nmp->overflown = ULONG_ONE;
	}
    }
}

/*
 * reset_nbmem : mark a neighbour memory as empty: zero entries stored
 * and overflow indicator cleared.
 */
void reset_nbmem(struct neighbour_memory * nbmem)
{
    nbmem->overflown = 0;
    nbmem->nnb = 0;
}
/*
 * reset_outregs : put one pipeline's output register set into its
 * start-of-run state.
 *
 * Accumulators (acc, jerk, phi), innb and all flag words are zeroed;
 * rnnb is set to the value built by compose_float() below (presumably
 * the largest representable distance, so any real neighbour replaces
 * it -- confirm against compose_float()).
 */
void reset_outregs(struct pipe_output_register_set * outreg)
{
    int n;

    /* flag words */
    for (n = 0; n < 7; n++){
	outreg->sum_raw_flags[n] = ULONG_ZERO;
    }
    outreg->pacc_sum_flag = ULONG_ZERO;
    outreg->jacc_sum_flag = ULONG_ZERO;
    outreg->facc_sum_flag = ULONG_ZERO;

    /* nearest-neighbour registers */
    outreg->innb = ULONG_ZERO;
    outreg->rnnb = compose_float(INTERACTION_F_LEN_U, (ULONG) 0x3ff, ULONG_ZERO,
				 ULONG_ZERO, (ULONG) 0xffffff);

    /* accumulators */
    outreg->phi = LONG_ZERO;
    for (n = 0; n < 3; n++){
	outreg->jerk[n] = LONG_ZERO;
	outreg->acc[n] = LONG_ZERO;
    }
}



/*
 * run_chip : driver routine for simulator main body
 *
 * Resets all output registers and neighbour memories of the chip, then
 * feeds it the predicted particles jstart, jstart-jstep, ... while the
 * index stays non-negative.  Only the first particle is processed with
 * clear = 1.
 *
 *   ni is accepted but not used here; the number of active pipelines
 *   comes from set_g6chip_nip().
 */
void run_chip(struct predicted_particle * predmem,
	      int jstart, /* note that count is reverse order */
	      int jstep,
	      int ni,
	      struct gchip *chip )
{
    int pipe, unit, j;
    int first_pass = 1;

    for (pipe = 0; pipe < NPIPEPERCHIP; pipe++){
	reset_outregs(chip->outreg + pipe);
    }
    for (unit = 0; unit < NNBUNITS; unit++){
	reset_nbmem(chip->nbmem + unit);
    }

    j = jstart;
    while (j >= 0){
	force_chip_pipeline_step(predmem, j, chip, first_pass);
	first_pass = 0;
	j -= jstep;
    }
}

/*
 * reduce_force : add addval into *accumulator and report sign-bit
 * overflow with respect to an accumulator that is outbits wide.
 *
 *   accumulator : in/out running sum
 *   addval      : value to add
 *   outbits     : accumulator bit length; bit (outbits-1) is treated as
 *                 the sign bit (must be >= 1 and no larger than the
 *                 width of ULONG)
 *   clear       : when non-zero, *accumulator is zeroed before the add
 *
 * Return value : error code
 *   0: okay
 *   2: acc overflow (both operands had the same sign bit and the sum has
 *      the opposite one)
 *
 * The sum is formed in unsigned arithmetic so that the wrap-around this
 * routine is meant to detect is well defined; a signed overflow would be
 * undefined behaviour.  This assumes ULONG is the unsigned counterpart
 * of LONG with the same width.
 */
int reduce_force(LONG * accumulator, /* ACCUMULATOR*/
		 LONG   addval,
		 ULONG outbits,/* accumulator bit length */
		 int clear)
{
    const ULONG sgnmask = (ULONG_ONE) <<(outbits-1);
    ULONG before, sum;
    int acc_negative, add_negative, sum_negative;

    if (clear) *accumulator = 0;

    before = (ULONG)*accumulator;
    sum = before + (ULONG)addval;
    *accumulator = (LONG)sum;

    acc_negative = (before & sgnmask) != 0;
    add_negative = (((ULONG)addval) & sgnmask) != 0;
    sum_negative = (sum & sgnmask) != 0;

    if ((add_negative == acc_negative) && (sum_negative != acc_negative)){
	return 2;
    }
    return 0;
}


/*
 * reduce_results : combine the output registers of pipeline ipipe over
 * nchips chips into one result set.
 *
 *   pp     : out; receives the summed acc/jerk/phi, the selected
 *            rnnb/innb pair and the merged flag words
 *   gp     : array of nchips chips
 *   nchips : number of chips to combine
 *   ipipe  : pipeline index read from every chip
 *
 * acc and phi are summed with LONGBITS-wide and jerk with J_ACC_LEN-wide
 * overflow detection (see reduce_force).  The first chip initialises the
 * result (clear = 1); later chips are accumulated into it.
 */
void reduce_results(struct pipe_output_register_set *pp,
		    struct gchip * gp,
		    int nchips,
		    int ipipe)
{
    int clear = 1;
    ULONG reduce_raw_flags[7];
    int i, k;
    for(i=0;i<7;i++)reduce_raw_flags[i] = 0;
    for(i = 0;i<nchips; i++){
	ULONG rnnb_masked;
	/* overflow codes from the reduction itself accumulate in
	   reduce_raw_flags[]: 0..2 acc, 3..5 jerk, 6 phi */
	for(k = 0;k<3;k++){
	    reduce_raw_flags[k] |= reduce_force(pp->acc+k,
						gp[i].outreg[ipipe].acc[k],
						LONGBITS, clear);
	    reduce_raw_flags[k+3] |= reduce_force(pp->jerk+k,
						  gp[i].outreg[ipipe].jerk[k],
						  J_ACC_LEN, clear);
	}
	reduce_raw_flags[6] |= reduce_force(&(pp->phi),
					    gp[i].outreg[ipipe].phi,
					    LONGBITS, clear);

	/* the low 4 bits of rnnb are dropped before comparing */
	rnnb_masked = gp[i].outreg[ipipe].rnnb & 0xfffffffffffffff0L;
	if (clear){
	    /* first chip: take its neighbour data and flags as they are */
	    pp->rnnb = rnnb_masked;
	    pp->innb = gp[i].outreg[ipipe].innb;
	    for(k=0;k<7;k++)pp->sum_raw_flags[k] = gp[i].outreg[ipipe].sum_raw_flags[k];
	    
	}else{
	    /* later chips replace the neighbour pair when the comparison
	       is false (presumably: this chip's distance is not larger
	       than the current one -- confirm against
	       compare_grape_floats) */
	    if (!compare_grape_floats( rnnb_masked,pp->rnnb, INTERACTION_F_LEN_U)){
		pp->rnnb = rnnb_masked;
		pp->innb = gp[i].outreg[ipipe].innb;
	    }
	    for(k=0;k<7;k++)pp->sum_raw_flags[k] |= (gp[i].outreg[ipipe].sum_raw_flags[k]
						     | reduce_raw_flags[k]);
	}
	clear = 0;
    }
    /* cut the lower 4 bits of rnnb */
    pp->rnnb = (pp->rnnb)>>4;
    
    /* set overflow flags etc... */
    /* pack the raw flag words: component k occupies bits 3k.. of
       facc_sum_flag (from words 0..2) and jacc_sum_flag (from words 3..5) */
    for(k=0;k<3;k++){
	if(k == 0){
	    pp->facc_sum_flag = pp->sum_raw_flags[k];
	    pp->jacc_sum_flag = pp->sum_raw_flags[k+3];
	}else{
	    pp->facc_sum_flag |= pp->sum_raw_flags[k]<<(k*3);
	    pp->jacc_sum_flag |= pp->sum_raw_flags[k+3]<<(k*3);
	}
    }
    pp->pacc_sum_flag = pp->sum_raw_flags[6];
    /* combined word: facc in the low bits, jacc from bit 9, pacc from
       bit 18 */
    pp->flags =  ((pp->pacc_sum_flag)<<18)|
	((pp->jacc_sum_flag)<<9)|
	((pp->facc_sum_flag));
    
}


		
/*
 * run_simulated_cluster : run one force calculation on the simulated
 * cluster.
 *
 *   nj     : number of j-particles
 *   ni     : number of i-particles (pipelines in use)
 *   nchips : number of chips
 *
 * The j-particles are dealt to the chips round-robin: chip c starts at
 * index jstart+c and steps down by nchips (see run_chip).  nj is rounded
 * up to a multiple of nchips, and that many slots are predicted.
 * Afterwards the per-chip outputs are reduced into
 * simg6p->reduced_result[0 .. ni-1].
 */
void run_simulated_cluster(SIMGRAPE6_CLUSTER_PTR simg6p,
			   int nj,
			   int ni,
			   int nchips)
{
    int chip, pipe;
    int jpercluster = (nj+nchips-1)/nchips;
    int jstart = (jpercluster-1)*nchips;

    predict_in_chip(simg6p, jstart+nchips);
    set_g6chip_nip(ni);

    /* every chip gets the same set of i-particles */
    for (chip = 0; chip < nchips; chip++){
	for (pipe = 0; pipe < ni; pipe++){
	    simg6p->gc[chip].inreg[pipe] = simg6p->ipmem[pipe];
	}
    }

    for (chip = 0; chip < nchips; chip++){
#ifdef INTERNAL_OUT
	fprintf(stderr,"run_cluster %d %d %d %d\n", jstart, nchips, ni, chip);
#endif
	run_chip(simg6p->pmem, jstart+chip, nchips, ni, (simg6p->gc) + chip);
    }

    for (pipe = 0; pipe < ni; pipe++){
	reduce_results(simg6p->reduced_result+pipe, simg6p->gc, nchips, pipe);
    }
}


/*
 * initialize_simulator : allocate the simulator's j-particle and
 * predicted-particle memories (NMAX zero-initialised elements each) and
 * set the default modes.
 *
 * On success: use_simulator = 1, simulator_verbose_level = 0,
 * reset_cutoff() has been called, and 0 is returned.
 *
 * On allocation failure: a message is written to stderr, whichever
 * buffer was obtained is released, both pointers are left NULL, and -1
 * is returned.  The other fields are not touched in that case.
 */
int initialize_simulator(SIMGRAPE6_CLUSTER_PTR simg6p)
{
    simg6p->jmem = (struct jparticle *) calloc(NMAX, sizeof(struct jparticle));
    simg6p->pmem = (struct predicted_particle *)
	calloc(NMAX, sizeof(struct predicted_particle));
    if ((simg6p->jmem == NULL)||(simg6p->pmem == NULL)){
	fprintf(stderr,"(initialize_simulator) internal error: malloc failed\n");
	/* one of the two may have succeeded; free(NULL) is a no-op */
	free(simg6p->jmem);
	free(simg6p->pmem);
	simg6p->jmem = NULL;
	simg6p->pmem = NULL;
	return -1;
    }
    simg6p->use_simulator = 1;
    simg6p->simulator_verbose_level = 0;
    reset_cutoff();
    return 0;
}

/*
 * set_simulator_mode : store mode in the cluster's use_simulator field
 * (initialize_simulator sets it to 1).
 */
void set_simulator_mode(SIMGRAPE6_CLUSTER_PTR simg6p, int mode)
{
    simg6p->use_simulator = mode;
}
    
/*
 * set_simulator_verbose_level : store level in the cluster's
 * simulator_verbose_level field; values above 0 enable the trace in
 * set_i_particle_data_on_emulator().
 */
void set_simulator_verbose_level(SIMGRAPE6_CLUSTER_PTR simg6p, int level)
{
    simg6p->simulator_verbose_level = level;
}
    
/*
 * construct_sim_fodata : pack a reduced result set into 14 32-bit words.
 *
 *   fodata[0..5]  : acc[0..2], low word then bits 32..63 of each
 *   fodata[6..7]  : phi, low word then bits 32..63
 *   fodata[8..10] : jerk[0..2], low 32 bits only
 *   fodata[11]    : rnnb
 *   fodata[12]    : innb
 *   fodata[13]    : flags
 */
void construct_sim_fodata(unsigned int fodata[14],
			  struct pipe_output_register_set * simrp)
{
    int axis;

    for (axis = 0; axis < 3; axis++){
	const int lo = axis*2;
	const int hi = axis*2+1;

	fodata[lo] = (unsigned int) (simrp->acc[axis] & 0xffffffffl);
	fodata[hi] = (unsigned int) ((simrp->acc[axis]>>32) & 0xffffffffl);
    }
    fodata[6] = (unsigned int) (simrp->phi & 0xffffffffl);
    fodata[7] = (unsigned int) ((simrp->phi>>32) & 0xffffffffl);

    for (axis = 0; axis < 3; axis++){
	fodata[axis+8] = (unsigned int)(simrp->jerk[axis]& 0xffffffffl);
    }

    fodata[11] = simrp->rnnb;
    fodata[12] = simrp->innb;
    fodata[13] = simrp->flags;
}


static struct neighbour_memory nbmem[NNBUNITS];

/*
 * read_neighbor_list : copy all NNBUNITS neighbour memories of a chip
 * into the file-scope buffer that get_neighbor_list() reads.
 */
void read_neighbor_list(struct gchip * chip)
{
    int unit = 0;

    while (unit < NNBUNITS){
	nbmem[unit] = chip->nbmem[unit];
	unit++;
    }
}




int get_neighbor_list(int index,
		      int nblist[])
{
    int ilocal = index % 16;
    int iunit = index / 16;
    int nbl = 0;
    int i;
    struct neighbour_memory * nbp;
    ULONG mask;
    /*
      fprintf(ftestout,"get_neighbor index = %d ilocal, iunit = %d %d\n", index,
	   ilocal, iunit);*/
    nbp = &nbmem[iunit];
    mask = ULONG_ONE <<ilocal;
    /*printf("nnb = %d mask = %x\n", nbp->nnb, mask);*/
    for(i= nbp->nnb - 1; i>=0 ; i --){
	/*	printf("i, flags =  %d  %lx\n", i, nbp->nbflags[i]);*/
	
	if( ((int)(nbp->nbflags[i])) & mask){
	    nblist[nbl] = nbp->index[i]; nbp->index[i];
	    nbl ++;
	    /*	printf("found, i, nbl, index =   %d  %d %d\n", i, nbl, nbp->index[i]);*/
	}
    }

    /*    printf("returning %d\n", nbl);*/
    return nbl;
	
}

/*
 * simg6_getnbl_ : pass-by-reference wrapper around get_neighbor_list().
 *   index  : pointer to the pipeline number (must not be NULL)
 *   nblist : output array, passed through unchanged
 * Returns the number of neighbours found.
 */
int simg6_getnbl_(int * index, int * nblist)
{
    const int pipe = *index;

    return get_neighbor_list(pipe, nblist);
}
/*
 * put_cutoff_tables : intentionally empty; the function has no body and
 * no effect in this simulator library.
 */
void put_cutoff_tables()
{
    /* nothing to do */
}
