-- iadd.vhdl -- F-CPU 64-bit Add/Subtract Unit
-- Copyright (C) 2000, 2001 Michael Riepe <michael@stud.uni-hannover.de>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

-- $Id: iadd.vhdl,v 1.29 2001/09/05 00:10:42 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;
use work.Generic_Adder.all;

-- IAdd: 64-bit integer add/subtract unit with SIMD lane splitting and
-- optional unsigned saturation.  Architecture Behave_1 evaluates it in
-- two stages that can be separated by registers.
entity IAdd is
	generic (
		-- operand width in bits; the assertion below rejects any value
		-- other than 64 during simulation
		WIDTH : natural := 64;	-- do not change!
		-- 0: the stage-1 results are passed straight on to stage 2;
		-- any other value: they are held in registers that are
		-- controlled by Clk, Rst and En
		PIPELINED : integer := 0
	);
	port (
		-- operand inputs
		A : in std_ulogic_vector(WIDTH-1 downto 0);
		B : in std_ulogic_vector(WIDTH-1 downto 0);
		-- subtract mode enable
		-- ('1' inverts A on input and the result on output)
		Sub : in std_ulogic;
		-- saturate/floor mode enable
		-- ('1' forces a lane to the ceiling/floor value when that
		-- lane produces a carry/borrow)
		Sat : in std_ulogic;
		-- SIMD mode switches
		-- ("000" = 8-bit, "001" = 16-bit, "011" = 32-bit,
		-- "111" = 64-bit lanes)
		U : in std_ulogic_vector(2 downto 0);
		-- clock/reset inputs
		-- (only evaluated when PIPELINED /= 0: Rst = '1' clears the
		-- stage registers independently of Clk; the registers load on
		-- the rising edge of Clk while En = '1')
		Clk : in std_ulogic;
		Rst : in std_ulogic;
		En : in std_ulogic;
	--
		-- 8-bit tap outputs
		-- (driven directly from the inputs, never registered;
		-- Y8l is the per-byte result without saturation and without
		-- the final inversion, Y8h carries the per-byte carry out in
		-- bit 0 of every byte and '0' in all other bits)
		Y8l : out std_ulogic_vector(WIDTH-1 downto 0);
		Y8h : out std_ulogic_vector(WIDTH-1 downto 0);
		-- regular outputs
		-- (Yl is the saturated and, for Sub = '1', inverted result;
		-- Yh carries the carry/borrow of each SIMD lane in that lane's
		-- least significant bit and '0' in all other bits)
		Yl : out std_ulogic_vector(WIDTH-1 downto 0);
		Yh : out std_ulogic_vector(WIDTH-1 downto 0)
	);
-- The width check below is a simulation-only guard; the pragmas hide
-- the entity statement part from synthesis tools.
--pragma synthesis_off
begin
	assert WIDTH = 64
		report "width of IAdd must be 64"
		severity failure;
--pragma synthesis_on
end IAdd;

-- Known limitations:
--
--	1: Not fully tested.
--
--	2: 8-bit SIMD mode works but adds/subf is slower (2 cycles)
--	than add/addc/sub/subb (1 cycle).  8-bit adds/subf seems to
--	be impossible with 6 gates, unless we use lookup tables.
--
--	3: There was no space (or rather, delay time) to include
--	the avg and diff instructions.	They will probably need an
--	additional output port.  Different avg/diff rounding modes
--	won't work either, but `truncate' should be sufficient anyway.
--
--	4: subb mode differs from F-CPU manual.  IMHO the manual
--	should be changed :)  See the rationale in the code below.

-- Operating Modes:
--
--	Sub = '0', Sat = '0': add operation
--	Sub = '0', Sat = '1': add operation with unsigned saturation (ceiling)
--	Sub = '1', Sat = '0': sub operation
--	Sub = '1', Sat = '1': sub operation with unsigned saturation (floor)
--
--	carry/borrow is always available on the second output port (Yh);
--	that means all operating modes from the manual are supported:
--	add, addc, adds, sub, subb and subf.  8-bit add/addc/sub/subb
--	has its own output ports in the first pipeline stage; 8-bit
--	adds/subf uses the output port of the second stage.

-- SIMD Modes:
--
--	U = "000": 8-bit mode
--	U = "001": 16-bit mode
--	U = "011": 32-bit mode
--	U = "111": 64-bit mode
--	(other combinations are invalid)
--
--	Note: I intend to use this encoding scheme everywhere; it seems
--	to be the most appropriate one.

-- Modus Operandi:
--
--	The IAdd unit is a multi-level carry look-ahead/increment adder
--	with SIMD capabilities.  Its first level calculates 4-bit
--	slices using carry look-ahead; the second and third level
--	are SIMD-enabled incrementers that provide wider results.
--	There is also a `tap' output that provides faster 8-bit
--	results for some operations.
--
--	Subtraction is implemented as `not ((not A) + B)' rather
--	than the usual `A + (not B) + 1' because that makes the
--	saturation modes easier -- and we don't need a carry input
--	either, which simplifies the SIMD stuff in the final stages.
--	Expressed as a simple equation, this unit calculates:
--
--		Yl := (((A xor Sub) + B) or (Sat and Carry)) xor Sub
--		Yh := 1 when Carry is set, 0 otherwise
--
--	where `Carry' is the appropriate carry output from the adder;
--	any other signals can be found in the entity declaration.

-- Implementation:
--
--	The whole unit consists of ordinary and/or gates with up
--	to 4 inputs, 2-input xor gates and inverters.  In timing
--	calculations, all gates are assumed to have a delay of 1.
--	Some basic elements may be optimized further if the target
--	supports arbitrary functions of 3 or 4 inputs, e.g. the half
--	adders can be combined with the input inverters.  With the
--	conservative assumptions above, the unit has a delay of
--	12, and it can be split in the middle (at d=6) to form two
--	pipeline stages.  *schwitz* :)

-- Behave_1: behavioural two-stage model of the IAdd unit.
--
--	stage_1 builds the half-adder terms, the 4-bit carry look-ahead
--	slices, the 8-bit tap outputs and the second-level look-ahead
--	terms.  stage_2 widens the partial results to 16/32/64 bits and
--	applies the carry-out and saturation logic.
--
--	The r_* and *_2 signals form the boundary between both stages.
--	With PIPELINED = 0 they simply follow the stage_1 results; with
--	any other value they are registers with an asynchronous,
--	active-high reset (Rst) and a load enable (En).
--
--	CIA_Row and CIA_Inc come from work.Generic_Adder; their exact
--	contract is not visible in this file.
architecture Behave_1 of IAdd is
	-- signals used by both stages
	signal r_Y1, r_C1 : std_ulogic_vector(WIDTH-1 downto 0);
	signal r_S2, r_I2 : std_ulogic_vector(WIDTH/4-1 downto 0);
	signal r_G2, r_P2 : std_ulogic_vector(WIDTH/16-1 downto 0);
	signal r_C08 : std_ulogic_vector(WIDTH/8-1 downto 0);
	-- pipelined mode signals
	-- (copies of the control inputs that travel along with the data)
	signal U_2 : std_ulogic_vector(U'length-1 downto 0);
	signal Sub_2, Sat_2 : std_ulogic;
	-- delayed copy of En; driven by stage_1 but not read by anything
	-- in this architecture
	signal En_2 : std_ulogic;
begin
	-- First stage (d=0..6).  The process is sensitive to the data and
	-- control inputs because Y8l/Y8h are driven combinationally, and
	-- to Clk/Rst because it also models the stage registers.
	stage_1 : process (A, B, Sub, Sat, U, Clk, Rst, En)
		-- signals used by stage 1 exclusively
		variable G0, P0 : std_ulogic_vector(WIDTH-1 downto 0);
		variable Y1, C1 : std_ulogic_vector(WIDTH-1 downto 0);
		variable S1, I1 : std_ulogic_vector(WIDTH-1 downto 0);
		variable G1, P1 : std_ulogic_vector(WIDTH/4-1 downto 0);
		variable S2, I2 : std_ulogic_vector(WIDTH/4-1 downto 0);
		variable G2, P2 : std_ulogic_vector(WIDTH/16-1 downto 0);
		variable C08 : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable neg, n_A : std_ulogic_vector(WIDTH-1 downto 0);
		variable yh, yl : std_ulogic_vector(WIDTH-1 downto 0);
	begin
		-- input stage
		-- (half adders with A input inverted by Sub)
		-- d=2
		neg := (others => Sub);
		n_A := A xor neg;
		P0 := n_A xor B;
		G0 := n_A and B;

		-- first-level carry look-ahead
		-- d=3/4
		CIA_Row(G0, P0, S1, I1, G1, P1);

		-- 4-bit partial results
		-- d=5
		Y1 := P0 xor S1;
		C1 := I1;

		-- 8-bit carry out (used in tap and second stage)
		-- d=6
		-- (carry out of the upper nibble, or carry out of the lower
		-- nibble propagated through the upper one)
		for i in WIDTH/8-1 downto 0 loop
			C08(i) := G1(2*i+1) or (P1(2*i+1) and G1(2*i));
		end loop;

		-- 8-bit SIMD add/sub tap
		-- d=6
		-- yl := (P0 xor neg) xor S1;
		-- (((A xor Sub) xor B) xor Sub) xor S1 => (A xor B) xor S1 !!!
		yl := (A xor B) xor S1;
		yh := (others => '0');
		for i in WIDTH/8-1 downto 0 loop
			-- the upper nibble of each byte is incremented when the
			-- lower nibble generates a carry
			for j in 7 downto 4 loop
				yl(8*i+j) := yl(8*i+j) xor (C1(8*i+j) and G1(2*i));
			end loop;
			-- per-byte carry goes to bit 0 of the matching Y8h byte
			yh(8*i) := C08(i);
		end loop;
		Y8l <= yl;
		Y8h <= yh;

		-- second-level carry look-ahead
		-- (like CIA_Row but with SIMD splits after 2 bits)
		-- d=6
		-- Each iteration handles one 16-bit group of four nibbles.
		-- U(0) = '0' (8-bit mode) cuts the chain between nibble 1 and
		-- nibble 2, so nibble 2 behaves like the start of a group.
		for i in WIDTH/16-1 downto 0 loop
			-- d=4
			I2(4*i+0) := '1';
			I2(4*i+1) := P1(4*i+0);
			I2(4*i+2) := (P1(4*i+1) and P1(4*i+0))
					  or (not U(0));
			I2(4*i+3) := (P1(4*i+2) and P1(4*i+1) and P1(4*i+0))
					  or (P1(4*i+2) and not U(0));
			-- d=6
			S2(4*i+0) := '0';
			S2(4*i+1) := G1(4*i+0);
			S2(4*i+2) := (U(0) and P1(4*i+1) and G1(4*i+0))
					  or (U(0) and G1(4*i+1));
			S2(4*i+3) := G1(4*i+2)
					  or (U(0) and P1(4*i+2) and G1(4*i+1))
					  or (U(0) and P1(4*i+2) and P1(4*i+1) and G1(4*i+0));
			-- Note: for P2 and G2, U(0) = '1' is assumed (16-bit mode)
			-- d=5
			P2(i) := P1(4*i+3) and P1(4*i+2) and P1(4*i+1) and P1(4*i+0);
			-- d=6
			G2(i) := G1(4*i+3)
				  or (P1(4*i+3) and G1(4*i+2))
				  or (P1(4*i+3) and P1(4*i+2) and G1(4*i+1))
				  or (P1(4*i+3) and P1(4*i+2) and P1(4*i+1) and G1(4*i+0));
		end loop;

		-- end of first stage
		if PIPELINED = 0 then
			-- unpipelined: hand everything straight to stage 2
			r_Y1 <= Y1;
			r_C1 <= C1;
			r_S2 <= S2;
			r_I2 <= I2;
			r_G2 <= G2;
			r_P2 <= P2;
			r_C08 <= C08;
			Sat_2 <= Sat;
			Sub_2 <= Sub;
			U_2 <= U;
			En_2 <= En;
		elsif to_X01(Rst) = '1' then
			-- asynchronous reset: tested before the clock edge
			r_Y1 <= (others => '0');
			r_C1 <= (others => '0');
			r_S2 <= (others => '0');
			r_I2 <= (others => '0');
			r_G2 <= (others => '0');
			r_P2 <= (others => '0');
			r_C08 <= (others => '0');
			Sat_2 <= '0';
			Sub_2 <= '0';
			U_2 <= (others => '0');
			En_2 <= '0';
		elsif rising_edge(Clk) then
			-- data and mode registers only load while En is '1' ...
			if to_X01(En) = '1' then
				r_Y1 <= Y1;
				r_C1 <= C1;
				r_S2 <= S2;
				r_I2 <= I2;
				r_G2 <= G2;
				r_P2 <= P2;
				r_C08 <= C08;
				Sat_2 <= Sat;
				Sub_2 <= Sub;
				U_2 <= U;
			end if;
			-- ... whereas En itself is sampled on every clock edge
			En_2 <= En;
		end if;
	end process;

	-- Second stage (d=6..12).  Purely combinational: it reads nothing
	-- but the stage boundary signals, so only those belong in the
	-- sensitivity list (Clk, Rst and En_2 are never read here).
	stage_2 : process (r_S2, r_G2, r_P2, r_Y1, r_C08, r_C1, r_I2,
					   Sat_2, Sub_2, U_2)
		variable Y1, C1 : std_ulogic_vector(WIDTH-1 downto 0);
		variable Y2, C2 : std_ulogic_vector(WIDTH-1 downto 0);
		variable Y3, C3 : std_ulogic_vector(WIDTH-1 downto 0);
		variable S2, I2 : std_ulogic_vector(WIDTH/4-1 downto 0);
		variable G2, P2 : std_ulogic_vector(WIDTH/16-1 downto 0);
		variable S3, I3 : std_ulogic_vector(WIDTH/16-1 downto 0);
		variable S, C08, C16, C32, C64 : std_ulogic_vector(WIDTH/8-1 downto 0);
	begin
		-- local copies of the stage boundary signals
		Y1 := r_Y1;
		C1 := r_C1;
		S2 := r_S2;
		I2 := r_I2;
		G2 := r_G2;
		P2 := r_P2;
		C08 := r_C08;

		-- 16-bit partial results
		-- d=8
		CIA_Inc(Y1, C1, S2, I2, Y2, C2, 4);

		-- third-level carry look-ahead
		-- (like CIA_Row but with SIMD splits, and no G/P outputs)
		-- d=8
		-- S3(k) is the carry into 16-bit group k.  U_2(1) enables the
		-- carries that stay inside a 32-bit lane, U_2(2) those that
		-- cross the 32-bit boundary (64-bit mode only).
		for i in WIDTH/64-1 downto 0 loop
			S3(4*i+0) := '0';
			S3(4*i+1) := (U_2(1) and G2(4*i+0));
			S3(4*i+2) := (U_2(2) and G2(4*i+1))
					  or (U_2(2) and P2(4*i+1) and G2(4*i+0));
			S3(4*i+3) := (U_2(1) and G2(4*i+2))
					  or (U_2(2) and P2(4*i+2) and G2(4*i+1))
					  or (U_2(2) and P2(4*i+2) and P2(4*i+1) and G2(4*i+0));
		end loop;
		-- this is the last level: C3 is not used below, so the
		-- increment terms are simply tied to '0'
		I3 := (others => '0');

		-- 64-bit result
		-- d=10
		CIA_Inc(Y2, C2, S3, I3, Y3, C3, 16);

		-- saturate/carry logic
		-- (C16/C32/C64 have one entry per byte; only the entry of the
		-- lowest byte of each lane is set, all others stay '0')

		-- 16-bit carry out
		-- d=6
		C16 := (others => '0');
		for i in WIDTH/16-1 downto 0 loop
			C16(2*i) := G2(i);
		end loop;

		-- 32-bit carry out
		-- d=8
		C32 := (others => '0');
		for i in WIDTH/32-1 downto 0 loop
			C32(4*i) := G2(2*i+1) or (P2(2*i+1) and G2(2*i));
		end loop;

		-- 64-bit carry out
		-- d=8
		C64 := (others => '0');
		for i in WIDTH/64-1 downto 0 loop
			C64(8*i) := G2(4*i+3)
					 or (P2(4*i+3) and G2(4*i+2))
					 or (P2(4*i+3) and P2(4*i+2) and G2(4*i+1))
					 or (P2(4*i+3) and P2(4*i+2) and P2(4*i+1) and G2(4*i+0));
		end loop;

		-- saturate vector (taken from carry outputs)
		-- d=10
		-- (every byte looks up the carry of the lane it belongs to,
		-- hence the index is rounded down to the lane's lowest byte)
		for i in WIDTH/8-1 downto 0 loop
			S(i) := (Sat_2 and C08(i) and not U_2(0))
				 or (Sat_2 and C16(2*(i/2)) and U_2(0) and not U_2(1))
				 or (Sat_2 and C32(4*(i/4)) and U_2(1) and not U_2(2))
				 or (Sat_2 and C64(8*(i/8)) and U_2(2));
		end loop;

		-- high output vector
		-- d=10
		-- (the default assignment clears every bit; the loop below
		-- then overrides bit 0 of each byte)
		Yh <= (others => '0');
		for i in WIDTH/8-1 downto 0 loop
			--
			-- Note that this differs from the F-CPU
			-- manual, Rev.0.2.  In the manual, the
			-- `subb' borrow output is set to all 1's
			-- (numeric value -1) while this unit
			-- sets it to the numeric value 1.
			-- This is much easier to do in the
			-- presence of SIMD, and it's also
			-- more logical: `borrow -1' actually
			-- means `add 1', which is wrong.
			--
			-- The un-rounded index i is intentional here: bytes that
			-- are not the lowest byte of their lane read a '0' entry.
			Yh(8*i) <= (C08(i) and not U_2(0))
					or (C16(i) and U_2(0) and not U_2(1))
					or (C32(i) and U_2(1) and not U_2(2))
					or (C64(i) and U_2(2));
		end loop;

		-- output stage
		-- (saturate and invert)
		-- d=12
		for i in WIDTH-1 downto 0 loop
			Yl(i) <= (Y3(i) or S(i/8)) xor Sub_2;
		end loop;
	end process;
end Behave_1;

-- vi: set ts=4 sw=4 equalprg="fmt -72 -p--": please
