/*<html><title></title><body fgcolor="#000000" bgcolor="#FFFFFF"><PRE>

 (C) february-april 2000 by Yann GUIDON
 first proto attempt, proof of concept, trick gallery...

This document is derived from the buggy F-CPU manual rev. 0.1
and will change quickly. Be sure to download the most recent
version and never rely on it. It is not made to be compiled
but gives implementation hints for translation to HDL !

This is free software ; see the GPL for copying conditions.
There is NO warranty ; not even for MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. By definition, it
is completely buggy. You've been warned.

<font color="#C00000">Attention : some parts rely on WORD_SIZE==8 (where noted).</font>

contact whygee@f-cpu.org or f-cpu@eGroups.com

*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* the opcode map : */
#include "f-cpu_map.h"

/*<font color="#0000C0">
 definition of the architecture parameters here :
 ------------------------------------------------
</font> */

/* instruction size = 4 bytes (it's not worth #defining it...) */

/* size of a register word : 64 bits */
#define WORD_SIZE      8

/* width of a cache line : 256 bits*/
#define CACHELINEWIDTH 32

/* width of the main memory bus : 64 bits */
#define MEMORYBUSWIDTH 8

/* there are 8 memory buffer lines of 256 bits each. */
#define LINECOUNT      8

/* there is 2KB of instruction cache memory */
#define ICACHELINES    256

/* Maximum pipeline depth : (arbitrary) */
#define MAXDEPTH 8

#include "SR.h"


/* State attached to one register : its value (WORD_SIZE bytes) plus the
   scheduling information (FSM state, wait flag, per-byte zero flags). */
typedef struct reg_struct {
  unsigned char val[WORD_SIZE]; /* the register's contents, WORD_SIZE bytes */

  unsigned int fsm;         /* finite state machine, or shift register (here) */
      /* values :
       0 : nothing to do, register ready
       1 : write the register during this cycle
       2 : value is available on the Xbar
       more : value is being computed
      */
  unsigned int wait_flag;   /* set if we are waiting for the result of an asynchronous unit */
     /* it's part of the FSM mechanism... */

  unsigned int zero; /* each bit represents a byte of the register,
     that is zero if the byte is cleared. it represents a 2-level, 64-in OR gate,
     where only the intermediary per-byte results are stored (latched), not the top-level result. */

} R7[64]; /* [0] is not used... hardwired... */
/* NOTE(review): because of the leading "typedef", R7 names a TYPE (an array of
   64 struct reg_struct), not a variable : no register storage is defined here.
   Presumably a variable was intended -- confirm before relying on R7. */

/* this table gives the latency of every supported instruction
   (256 entries ; presumably indexed by opcode -- TODO confirm) : */
unsigned int InstructionLatency[256]; 
  /* must be computed somewhere : no code visible in this file fills the table */

/* the Control FIFO */
typedef struct fifo_struct {
  /* for each cycle there are two possible busses for writes : */
  unsigned int w1busy,
  unsigned int w1register,
  unsigned int w1EU,
  unsigned int w2busy,
  unsigned int w2register,
  unsigned int w2EU
} FIFO[MAXDEPTH];

/* <font color="#00B000"> there is 1 bit for the "busy" bit (rather than a OR of the register field).

FIFO(0) is the register/value present at the moment on the Xbar ports.

---------------------------------------------------------------------------------------------------

Advice for a 64-bit implementation follows :

 3 bits for the "busy" format :
- 1 bit for the type of format (0 for "loadcons" format, 1 for "register" format)
- 2 bits for size/displacement

  f   cons  reg
  0   b0    8LSB
  1   b16   16LSB
  2   b32   32LSB
  3   b48   64LSB

there are 5 "domains" : (if we are in a 64-bit CPU)
                     b0    b16    b32    b48    8LSB  16LSB 32LSB 64LSB
f0                         *             *            *           *
f1                                *      *                  *     *
f2                                              *     *     *     *

 - d0= bits[0:7]   = *                          *     *     *     *      = f2+ /(f0+f1)
 - d1= bits[8:15]  = *                                *     *     *      = 
 - d2= bits[16:31] =       *                                *     *      =
 - d3= bits[32:47] =              *                               *      = 
 - d4= bits[48:63] =                     *                        *      = f0.f1

That's it for the implementation advice and the simplification. Still, nothing prevents us
from using a dumb byte mask.
---------------------------------------------------------------------------------------------------
</font>*/



/* Global byte buffers of the model.
   Everything from SR_buffer down to ASU_out2carry is ONE declaration of
   unsigned char arrays ; the section comments in between only group the
   declarators (sections without declarators are empty placeholders). */

/* internal data of the SR unit : one row of WORD_SIZE bytes per special register */
/* NOTE(review): the row count is SR_NUMBERS, yet init_SR() writes row
   [SR_NUMBERS] and cycle_SR() accepts any address below SR_NUMBERS_val.
   Presumably the dimension should be SR_NUMBERS_val -- verify against SR.h. */
unsigned char SR_buffer [SR_NUMBERS][WORD_SIZE],

/* decoding stage, register read */


/* decoding stage, register write */

/* Xbar buffers : */


/* input of the execution units : */
  XbarRead1Out2[WORD_SIZE],XbarRead2Out2[WORD_SIZE],XbarRead3Out2[WORD_SIZE],


/* output of the execution units : */
  ROP2_out[WORD_SIZE], /* connected to both Xbar result bus 1 and 2 */
  ASU_out1result[WORD_SIZE], /* ASU pipe stage #1 */ /* connected to both Xbar result bus 1 and 2 */
  ASU_out2result[WORD_SIZE], /* ASU pipe stage #2 */ /* connected to both Xbar result bus 1 and 2 */
  ASU_out1carry[WORD_SIZE],  /* ASU pipe stage #1 */ /* connected only to Xbar result bus 2 ... */
  ASU_out2carry[WORD_SIZE];  /* ASU pipe stage #2 */ /* ... because when it's used, it's only in 2r2W mode */

/* pipeline for the size (in bytes) of the SIMD packets : cycle_ASU() copies
   simd_size1 into simd_size2 every cycle ; simd_size and simd_size0 are not
   used by the code visible in this file. */
unsigned int simd_size,simd_size0,simd_size1,simd_size2; /* pipeline for the size of the SIMD packets */

/* little byte-oriented utility :
   stores val into dest, least significant byte first.

   dest : destination buffer, at least size bytes long
   val  : value to store
   size : maximum number of bytes that may be written

   Only the bytes up to the most significant non-zero byte of val are
   written ; the remaining bytes of dest are left untouched (so val==0
   writes nothing). The caller must clear dest beforehand if a full
   zero-extended value is wanted.
   With size==0 nothing is written (the previous version wrote dest[0]
   before checking the bound). */
void set(unsigned char *dest, unsigned int val, unsigned int size) {
  unsigned int i;

  /* stop at the end of the buffer, or as soon as no set bit is left in val */
  for (i = 0; (i < size) && (val != 0); i++) {
    dest[i] = (unsigned char)(val & 255);
    val >>= 8;
  }
}

/* prints the 8 bytes found at p as 16 upper-case hex digits, byte [7] first
   and byte [0] last, with one space before and one after. */
void print_64(unsigned char *p) {
  printf (" %02X%02X%02X%02X%02X%02X%02X%02X ",
          p[7], p[6], p[5], p[4],
          p[3], p[2], p[1], p[0]);
}

/* prints the 4 bytes found at p as 8 upper-case hex digits, in memory order
   (byte [0] first), with one space before and one after. */
void print_instruction(unsigned char *p) {
  printf (" %02X%02X%02X%02X ",
          p[0], p[1], p[2], p[3]);
}


/* One clock cycle of the Add/Subtract Unit (ASU), written as a 2-stage pipeline.

   Stage 1 (executed LAST in this function) : WORD_SIZE independent 8-bit
     adders/subtractors working on XbarRead2Out2[] and XbarRead3Out2[].
       ASU_operation1 == 0 : nothing to do
       ASU_operation1 == 1 : ADD
       any other value     : SUB
     The low byte of each result goes to ASU_out1result[], the carry/borrow
     (0, 1 or -1 stored as 0xFF) goes to ASU_out1carry[].

   Stage 2 (executed FIRST) : inside each SIMD packet of simd_size2 bytes,
     ripples the stage-1 carries into the next bytes and writes
     ASU_out2result[] / ASU_out2carry[]. It is skipped when ASU_operation2 is 0
     or when simd_size2 <= 1 (byte results need no carry propagation).

   Stage 2 runs before stage 1 so that it consumes the ASU_out1* values left
   by the previous call, before stage 1 overwrites them.

   NOTE(review): ASU_operation1 / ASU_operation2 are not declared in the part
   of the file visible here -- presumably they come from an included header.
   NOTE(review): simd_size2 is assumed to divide WORD_SIZE ; otherwise the
   inner loops index past the end of the buffers.
   NOTE(review): "j>>=8" on a negative j relies on an arithmetic right shift,
   which is implementation-defined in C. */
void cycle_ASU(){ /* warning ! 64-bit version only (tested anyway) */
  signed int i=0, j, k, l;

  /* last part of the pipeline, before we flush the temporary
      result (economy of C writing, NOT hardware accurate !) */

  if ((ASU_operation2!=0)&&(simd_size2 > 1)) { /* don't compute if we needed a byte result only */
    while (i < WORD_SIZE) { /* scan the word */
      j=0; /* clear the carry */
      l=i; /* remember where we started -> scan carry later */
      for (k=0;k < simd_size2;k++) { /* could it be modified to a "mask and test" algo ?... */
        /* byte result in bits 0-7, signed carry in the bits above.
           The carry is multiplied by 256 instead of being shifted left by 8 :
           left-shifting a negative value (carry == -1) is undefined behaviour. */
        j+=ASU_out1result[i]+((signed char)ASU_out1carry[i])*256;
        ASU_out2result[i++]=(unsigned char)j;
        j>>=8;
      }
      for (k=0;k < simd_size2;k++){
        ASU_out2carry[l+k]=j; /* j should be zero, 1 or -1 */
        j>>=8; /* shift out 1, or keep -1 */
      }
    }
  }

/* propagation in the pipeline : (could be avoided if seen with a larger analysis [control signals]) */
  ASU_operation2=ASU_operation1;
  simd_size2=simd_size1;

  /* first part of the pipeline : 8 * 8-bit adders */
  if (ASU_operation1) {
    if (ASU_operation1==1) { /* ADD */
      for (i=0; i < WORD_SIZE; i++) {
        j=XbarRead2Out2[i]+XbarRead3Out2[i];
        ASU_out1result[i]=(unsigned char)j;
        ASU_out1carry[i]=(unsigned char)(j>>8);
      }
    } else { /* SUB */
      /* could have used the carry+xor trick */
      for (i=0; i< WORD_SIZE; i++) {
        j=XbarRead2Out2[i]-XbarRead3Out2[i];
        ASU_out1result[i]=(unsigned char)j;
        ASU_out1carry[i]=(unsigned char)(j>>8);  /* notice that the carry outputs are only -1, 0 or 1 */
      }
    }
  }
}

/* result buffer, presumably meant for cycle_ROP2() ; no code visible in this file writes it */
char result_rop2[WORD_SIZE];

/* One clock cycle of the ROP2 unit : empty placeholder, nothing is implemented yet. */
void cycle_ROP2(){
  /* stay tuned */  
  
}

/* result of the increment unit, written by cycle_INC() */
/* NOTE(review): plain char, while every other data buffer is unsigned char. */
char result_inc[WORD_SIZE];

/* One clock cycle of the increment unit : adds 1 to every SIMD packet of
   simd_size1 bytes read from XbarRead2Out2[] (least significant byte first)
   and stores the result in result_inc[].

   Fixes over the previous version :
   - the carry is now shifted out after each byte (j>>=8) ; before, the bytes
     above the first one received the running sum instead of the carry ;
   - simd_size1==0 returns at once instead of looping forever ;
   - the inner loop never runs past WORD_SIZE, even when simd_size1 does not
     divide WORD_SIZE ;
   - the unused local "l" is gone. */
void cycle_INC(){    /* only word increment is done yet */
  unsigned int i=0, j, k;

  if (simd_size1==0) /* a zero packet size would never advance i */
    return;

  while (i < WORD_SIZE) {
    j=1; /* the "+1" enters each packet as a carry into its lowest byte */
    for (k=0;(k < simd_size1)&&(i < WORD_SIZE);k++) {
      j+=XbarRead2Out2[i];
      result_inc[i++]=(unsigned char)j;
      j>>=8; /* keep only the carry for the next byte */
    }
  }
}

/* One clock cycle of the POP unit : empty placeholder, nothing is implemented yet. */
void cycle_POP(){
  /* register 1 : source register, register 2 : subtract amount */  


}


/* Special Register Unit (SRU) */

  /*<font color="#00B000"> in this example, we don't do much with the SRs,
     a lot of things are missing : supervisor bits, range checking, etc.
     we only read all the SRs and write to the cycle counter
     BUT it is an asynchronous unit : the decoder waits for a READY signal. </font>*/

/*<font color="#B00000">Warning : not yet logically connected to the Xbar</font>*/

/* interface with the control signals and the Xbar */
unsigned char
 SR_address[WORD_SIZE],    /* this comes from the Xbar from the register read port #2 or from the immediate field of the instruction ;
                              cycle_SR() takes the SR number from byte [0] and requires all the other bytes to be zero */
 SR_write_port[WORD_SIZE], /* connected to the Xbar, read/write port #1 ; copied into the addressed SR by cycle_SR() when SR_read==0 */
 SR_read_port[WORD_SIZE];  /* receives a copy of the addressed SR when SR_read!=0 ;
                              the write port is routed by the Xbar to R7's write port #1 or #2 according to the scheduler. */

unsigned int
 SR_signal, /* ==1 if the decoder decodes a GET or PUT instruction */
 SR_read,   /* ==1 if GET, ==0 if PUT (active if SR_signal is valid) */
 SR_pending,/* ==1 when the SRU is "thinking". cycle_SR() sets it but no code visible in this file clears it */
 SR_trap;   /* !=0 : something's wrong ! routed to the decoder, fetch the SR trap handler. */
   /* 'diagnostic' is currently merged with this signal, but not in the circuit. */
   /* values set by cycle_SR() : 1 = address out of range, 2 = refused by the read mask, 3 = refused by the write mask */

/* at reset time : clears every special register, then loads the constant
   registers with their values (least significant byte first, see set()).

   Fix : the arguments of memset() were swapped (fill value and length), so
   zero bytes were cleared. The order is (destination, fill value, byte count).
   The size given to set() is now WORD_SIZE, the width of one SR_buffer row,
   instead of a literal 8.

   NOTE(review): the write to SR_buffer[SR_NUMBERS] is in bounds only if
   SR_buffer has more than SR_NUMBERS rows -- check its declaration against
   SR.h. */
void init_SR() {
  memset (SR_buffer,0,sizeof(SR_buffer));
  set(SR_buffer[SR_NUMBERS],SR_NUMBERS_val,WORD_SIZE);
  set(SR_buffer[SR_FAMILY],SR_FAMILY_val,WORD_SIZE);
  set(SR_buffer[SR_STEPPING],SR_STEPPING_val,WORD_SIZE);
  set(SR_buffer[SR_MAX_SIZE],WORD_SIZE,WORD_SIZE); /* the value stored IS the word size */
  set(SR_buffer[SR_SIZE_1],SR_SIZE_1_val,WORD_SIZE);
  set(SR_buffer[SR_SIZE_2],SR_SIZE_2_val,WORD_SIZE);
  set(SR_buffer[SR_SIZE_3],SR_SIZE_3_val,WORD_SIZE);
  set(SR_buffer[SR_SIZE_4],SR_SIZE_4_val,WORD_SIZE);

  /*
     SR_CYCLE=0 : cold boot, counter cleared
     SR_PAGING=0 : paging is not enabled
     (both are left at zero by the memset above)
  */
}


/* called every cycle : one clock cycle of the Special Register Unit.

   1. increments the WORD_SIZE-byte cycle counter SR_buffer[SR_CYCLE]
      (least significant byte first) ;
   2. if nothing is pending and the decoder signals a GET/PUT (SR_signal),
      sets SR_pending, validates SR_address and then either copies the
      addressed SR to SR_read_port (SR_read!=0) or copies SR_write_port into
      the addressed SR (SR_read==0).

   Errors are reported through SR_trap :
      1 = address out of range (byte [0] >= SR_NUMBERS_val, or an upper byte non-zero)
      2 = register not readable according to SR_READ_MASK
      3 = register not writable according to SR_WRITE_MASK
   SR_trap and SR_pending are never cleared here.

   Fixes over the previous version :
   - the statement incrementing the counter lacked its ';' ;
   - the carry test was inverted : the next byte must be incremented only
     when the current one wrapped around to 0 ;
   - the mask bit is built from 1u, because 1 << 31 overflows a signed int.

   NOTE(review): the masks hold one bit per SR, so the test is only meaningful
   for SR numbers below the width of unsigned int -- confirm SR_NUMBERS_val. */
void cycle_SR() {          /* <font color="#B00000">warning : this part has not been tested or compiled yet ! This is not even cycle accurate !</font> */
  unsigned int i;

  /* increment the cycle counter : ripple the carry while the byte wraps to 0 */
  for (i=0; i < WORD_SIZE; i++) {
    if (++SR_buffer[SR_CYCLE][i]!=0)
      break;
  }

  if (!SR_pending) {
   /* if nothing is pending, we can check if the decoder has a GET/PUT instruction */
    if (SR_signal) {
      SR_pending=1;

      /* first check the address */
      /* assume SR_NUMBERS<256 ! */
      if (SR_address[0]>=SR_NUMBERS_val)
        SR_trap=1;
      else {
        /* every byte above the first one must be zero */
        for (i=1; i < WORD_SIZE;i++)
          if (SR_address[i]!=0)
            SR_trap=1;
      }

      if (SR_trap==0) { /* if still ok */

          /* test read or write : check the read mask or write mask */
        if (SR_read) {
          if (!((1u << SR_address[0])&SR_READ_MASK))
            SR_trap=2;
          else {
/* action ! */
            /* here, the only thing we can do is read the registers. no complex behaviour. */
            memcpy(SR_read_port,SR_buffer[SR_address[0]],WORD_SIZE);
/*here, we should do something for SR_pending... */

/* later, this part will be rather ... complex. Keep it clean ! */

          }
        }
          /* write mask */
        else {
          if (!((1u << SR_address[0])&SR_WRITE_MASK))
            SR_trap=3;
          else
            memcpy(SR_buffer[SR_address[0]],SR_write_port,WORD_SIZE);
/*here, we should do something for SR_pending... */

/* idem here for later... */

        }
      }
    }
  }
}



/* entry point : does nothing and reports success ; none of the init_SR() or
   cycle_*() routines is called from here yet. */
int main(){
  return 0;
}
