-- iadd.vhdl -- F-CPU 64-bit Add/Subtract Unit
-- Copyright (C) 2000, 2001, 2003 Michael Riepe <michael@stud.uni-hannover.de>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

-- @(#) $Id: iadd.vhdl,v 1.40 2003/03/27 18:33:03 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;

-- IAdd: interface of the SIMD add/subtract unit.
--
-- The notes on Clk/Rst/En and on the tap outputs below describe how
-- architecture Behave_1 in this file uses the ports.
entity IAdd is
	generic (
		-- operand width in bits; must be a positive multiple of 64
		-- (enforced by the assertion in the entity statement part)
		WIDTH : natural := 64;
		-- 0: both stages are connected combinationally and Clk/Rst/En
		--    are ignored; any other value: a register bank clocked by
		--    Clk is placed between stage 1 and stage 2
		PIPELINED : integer := 0
	);
	port (
		-- operand inputs
		A : in std_ulogic_vector(WIDTH-1 downto 0);
		B : in std_ulogic_vector(WIDTH-1 downto 0);
		-- subtract mode enable
		Sub : in std_ulogic;
		-- saturate/floor mode enable
		Sat : in std_ulogic;
		-- increment (add) or decrement (sub) result by one
		Inc : in std_ulogic;
		-- average mode enable
		Avg : in std_ulogic;
		-- SIMD mode switches
		-- ("000" = 8-bit, "001" = 16-bit, "011" = 32-bit, "111" = 64-bit)
		U : in std_ulogic_vector(2 downto 0);
		-- clock/reset inputs (only used when PIPELINED /= 0)
		-- Rst is checked before the clock edge, i.e. it clears the
		-- stage registers asynchronously while it is '1'.
		-- En = '1' lets the stage registers load on a rising Clk edge.
		Clk : in std_ulogic;
		Rst : in std_ulogic;
		En : in std_ulogic;

		-- 8-bit tap outputs
		-- (driven directly by stage 1, never registered)
		Y8l : out std_ulogic_vector(WIDTH-1 downto 0);
		Y8h : out std_ulogic_vector(WIDTH-1 downto 0);
		-- regular outputs
		Yl : out std_ulogic_vector(WIDTH-1 downto 0);
		Yh : out std_ulogic_vector(WIDTH-1 downto 0)
	);
--pragma synthesis_off
begin
	-- Simulation-only sanity check.  WIDTH = 0 satisfies
	-- `WIDTH mod 64 = 0' but would give every port a null range,
	-- so it is rejected explicitly as well.
	assert (WIDTH > 0) and (WIDTH mod 64 = 0)
		report "width of IAdd must be a positive integer multiple of 64"
		severity failure;
--pragma synthesis_on
end IAdd;

--  The IAdd unit is a multi-level carry-select adder with SIMD
--  capabilities.  Its first level calculates 4-bit slices using carry
--  look-ahead; the second and third level are just carry propagate
--  logic and muxes that select the right chunk.  There is also a
--  `tap' output that provides fast 8-bit results for some operations.
--  Subtraction is implemented as `not ((not A) + B)' rather than
--  the usual `A + (not B) + 1' because that makes the saturation
--  modes easier to implement.
--
-- Known limitations:
--
--  1: Not tested exhaustively.
--
--  2: Some 8-bit operations take two clock cycles (see the table below).
--
--  3: subb mode differs from F-CPU manual:  In the examples section,
--  the `borrow' output is set to all 1's (numeric: -1).  This unit sets
--  it to `0...01' (numeric: +1).  See the rationale in the code below.
--
-- Operating Modes (`?' means don't care):
--
--  Avg Sat Sub Inc | Yl                    | F-CPU insn
--  =====================================================
--   ?   0   0   0  | A + B                 | add
--   ?   0   0   1  | A + B + 1             | --- (add1)
--   ?   0   1   0  | A - B                 | sub
--   ?   0   1   1  | A - B - 1             | --- (sub1)
--   ?   1   0   0  | usat(A + B)       [*] | adds
--   ?   1   0   1  | usat(A + B + 1)   [*] | --- (adds1)
--   ?   1   1   0  | usat(A - B)       [*] | subf
--   ?   1   1   1  | usat(A - B - 1)   [*] | --- (subf1)
--
--  Avg Sat Sub Inc | Yh                    | F-CPU insn
--  =====================================================
--   0   ?   0   0  | A + B     >= 2**width | addc
--   0   ?   0   1  | A + B + 1 >= 2**width | --- (addc1)
--   0   ?   1   0  | A u< B                | subb
--   0   ?   1   1  | A u< B + 1            | --- (subb1)
--   1   ?   0   0  | (A + B) / 2       [*] | --- (avg)
--   1   ?   0   1  | (A + B + 1) / 2   [*] | --- (avg1)
--   1   ?   1   0  | (B - A - 1) / 2   [*] | ---
--   1   ?   1   1  | (B - A) / 2       [*] | ---
--
-- [*] Avg and Sat operations always take two cycles
--
-- SIMD Modes:
--
--  U = "000": 8-bit mode
--  U = "001": 16-bit mode
--  U = "011": 32-bit mode
--  U = "111": 64-bit mode
--  (other combinations are invalid)

-- Behave_1: two-stage behavioural model of the IAdd unit.
--
--  stage_1 builds bitwise propagate/generate terms, 4-bit partial sums
--  for carry-in 0 and carry-in 1, the 8-bit tap outputs and the
--  second-level carry terms.  Its results cross to stage_2 through the
--  r_* / *_2 signals, which are plain wires when PIPELINED = 0 and
--  registers otherwise.
--
--  stage_2 selects the 16-bit and wider results, applies the optional
--  increment, and forms the carry, saturation and average outputs.
--
--  The `d=N' annotations are kept from the original source; they
--  presumably give the estimated logic depth at that point (TODO
--  confirm).
architecture Behave_1 of IAdd is
	-- signals used by both stages
	signal r_Y1, r_Z1 : std_ulogic_vector(WIDTH-1 downto 0);
	signal r_S2, r_T2 : std_ulogic_vector(WIDTH/4-1 downto 0);
	signal r_G2, r_P2 : std_ulogic_vector(WIDTH/16-1 downto 0);
	signal r_C08 : std_ulogic_vector(WIDTH/8-1 downto 0);
	-- pipelined mode signals
	-- (copies of the mode inputs that travel with the data)
	signal U_2 : std_ulogic_vector(U'length-1 downto 0);
	signal Sub_2, Sat_2, Inc_2, Avg_2 : std_ulogic;
	-- copy of En for the second stage; driven below but not read by
	-- stage_2 in this architecture
	signal En_2 : std_ulogic;
begin
	-- First stage: combinational logic followed by the optional
	-- pipeline register.  Clk and Rst are in the sensitivity list
	-- because the register is described in the same process.
	stage_1 : process (A, B, Sub, Sat, Inc, Avg, U, Clk, Rst, En)
		-- signals used by stage 1 exclusively
		variable G0, P0 : std_ulogic_vector(WIDTH-1 downto 0);
		variable Y1, Z1 : std_ulogic_vector(WIDTH-1 downto 0);
		variable S1, T1 : std_ulogic_vector(WIDTH-1 downto 0);
		variable G1, P1 : std_ulogic_vector(WIDTH/4-1 downto 0);
		variable S2, T2 : std_ulogic_vector(WIDTH/4-1 downto 0);
		variable G2, P2 : std_ulogic_vector(WIDTH/16-1 downto 0);
		variable C08 : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable neg, n_A : std_ulogic_vector(WIDTH-1 downto 0);
		variable yh, yl, zl : std_ulogic_vector(WIDTH-1 downto 0);
	begin
		-- input stage
		-- (half adders with A input inverted by Sub)
		-- d=2
		neg := (others => Sub);	-- Sub replicated to every bit
		n_A := A xor neg;	-- A, or not A in subtract mode
		P0 := n_A xor B;	-- bitwise propagate
		G0 := n_A and B;	-- bitwise generate

		-- first-level carry look-ahead
		-- d=3-4
		for i in WIDTH/4-1 downto 0 loop
			-- S1: carry into each bit of 4-bit slice i when the
			-- carry into the slice is '0'
			-- d=4
			S1(4*i+0) := '0';
			S1(4*i+1) := G0(4*i+0);
			S1(4*i+2) := G0(4*i+1)
				or (P0(4*i+1) and G0(4*i+0));
			S1(4*i+3) := G0(4*i+2)
				or (P0(4*i+2) and G0(4*i+1))
				or (P0(4*i+2) and P0(4*i+1) and G0(4*i+0));
			-- T1: the same, but for a carry of '1' into the slice
			-- d=4
			T1(4*i+0) := '1';
			T1(4*i+1) := G0(4*i+0)
				or P0(4*i+0);
			T1(4*i+2) := G0(4*i+1)
				or (P0(4*i+1) and G0(4*i+0))
				or (P0(4*i+1) and P0(4*i+0));
			T1(4*i+3) := G0(4*i+2)
				or (P0(4*i+2) and G0(4*i+1))
				or (P0(4*i+2) and P0(4*i+1) and G0(4*i+0))
				or (P0(4*i+2) and P0(4*i+1) and P0(4*i+0));
			-- P1: slice i propagates an incoming carry
			-- d=3
			P1(i) := P0(4*i+3) and P0(4*i+2) and P0(4*i+1) and P0(4*i+0);
			-- G1: slice i generates a carry on its own
			-- d=4
			G1(i) := G0(4*i+3)
				or (P0(4*i+3) and G0(4*i+2))
				or (P0(4*i+3) and P0(4*i+2) and G0(4*i+1))
				or (P0(4*i+3) and P0(4*i+2) and P0(4*i+1) and G0(4*i+0));
		end loop;

		-- 4-bit partial results
		-- d=5
		Y1 := P0 xor S1;	-- n_A + B
		Z1 := P0 xor T1;	-- n_A + B + 1

		-- 8-bit SIMD add/sub tap
		-- d=6
		-- yl := Y1 xor neg
		-- zl := Z1 xor neg
		-- Note the identities:
		--  P0 xor neg = A xor B
		--  Y1 xor neg = (P0 xor S1) xor neg = (A xor B) xor S1
		--  Z1 xor neg = (P0 xor T1) xor neg = (A xor B) xor T1
		yl := (A xor B) xor S1;
		zl := (A xor B) xor T1;
		for i in WIDTH/8-1 downto 0 loop
			-- low nibble of byte i: carry-in is Inc itself
			if to_X01(Inc) = '1' then
				yl(8*i+3 downto 8*i+0) := zl(8*i+3 downto 8*i+0);
			end if;
			-- high nibble of byte i: carry-in is the carry out of the
			-- low nibble (generated there, or Inc propagated through)
			if to_X01(G1(2*i) or (P1(2*i) and Inc)) = '1' then
				yl(8*i+7 downto 8*i+4) := zl(8*i+7 downto 8*i+4);
			end if;
		end loop;
		Y8l <= yl;

		-- 8-bit carry vector
		-- (carry out of every byte, with Inc as the byte's carry-in)
		-- d=6
		for i in WIDTH/8-1 downto 0 loop
			C08(i) := G1(2*i+1)
				or (P1(2*i+1) and G1(2*i+0))
				or (P1(2*i+1) and P1(2*i+0) and Inc);
		end loop;

		-- 8-bit carry out tap
		-- (bit 0 of every byte carries C08, all other bits are '0')
		-- d=6
		yh := (others => '0');
		for i in WIDTH/8-1 downto 0 loop
			yh(8*i) := C08(i);
		end loop;
		Y8h <= yh;

		-- second-level carry look-ahead
		-- S2/T2 give the carry into each 4-bit slice of 16-bit group i
		-- for a group carry-in of '0' (S2) or '1' (T2).  With
		-- U(0) = '0' (8-bit mode) the chain is cut at the byte
		-- boundary, so slice 2 starts over with S2 = '0' and T2 = '1'.
		-- d=6
		for i in WIDTH/16-1 downto 0 loop
			-- d=6
			S2(4*i+0) := '0';
			S2(4*i+1) := G1(4*i+0);
			S2(4*i+2) := (U(0) and G1(4*i+1))
				or (U(0) and P1(4*i+1) and G1(4*i+0));
			S2(4*i+3) := G1(4*i+2)
				or (U(0) and P1(4*i+2) and G1(4*i+1))
				or (U(0) and P1(4*i+2) and P1(4*i+1) and G1(4*i+0));
			-- d=6
			T2(4*i+0) := '1';
			T2(4*i+1) := G1(4*i+0) or P1(4*i+0);
			T2(4*i+2) := G1(4*i+1)
				or (P1(4*i+1) and G1(4*i+0))
				or (P1(4*i+1) and P1(4*i+0))
				or (not U(0));
			T2(4*i+3) := (G1(4*i+2) or (P1(4*i+2) and not U(0)))
				or (P1(4*i+2) and G1(4*i+1))
				or (P1(4*i+2) and P1(4*i+1) and G1(4*i+0))
				or (P1(4*i+2) and P1(4*i+1) and P1(4*i+0));
			-- Note: for P2 and G2, U(0) = '1' is assumed (16-bit mode)
			-- d=4
			P2(i) := P1(4*i+3) and P1(4*i+2) and P1(4*i+1) and P1(4*i+0);
			-- d=6
			G2(i) := G1(4*i+3)
				or (P1(4*i+3) and G1(4*i+2))
				or (P1(4*i+3) and P1(4*i+2) and G1(4*i+1))
				or (P1(4*i+3) and P1(4*i+2) and P1(4*i+1) and G1(4*i+0));
		end loop;

		-- end of first stage
		if PIPELINED = 0 then
			-- unpipelined: hand everything straight to stage 2
			r_Y1 <= Y1;
			r_Z1 <= Z1;
			r_S2 <= S2;
			r_T2 <= T2;
			r_G2 <= G2;
			r_P2 <= P2;
			r_C08 <= C08;
			Sat_2 <= Sat;
			Sub_2 <= Sub;
			Inc_2 <= Inc;
			Avg_2 <= Avg;
			U_2 <= U;
			En_2 <= En;
		elsif to_X01(Rst) = '1' then
			-- pipelined: reset takes priority over the clock edge
			r_Y1 <= (others => '0');
			r_Z1 <= (others => '0');
			r_S2 <= (others => '0');
			r_T2 <= (others => '0');
			r_G2 <= (others => '0');
			r_P2 <= (others => '0');
			r_C08 <= (others => '0');
			Sat_2 <= '0';
			Sub_2 <= '0';
			Inc_2 <= '0';
			Avg_2 <= '0';
			U_2 <= (others => '0');
			En_2 <= '0';
		elsif rising_edge(Clk) then
			-- pipelined: data and mode registers load only when enabled
			if to_X01(En) = '1' then
				r_Y1 <= Y1;
				r_Z1 <= Z1;
				r_S2 <= S2;
				r_T2 <= T2;
				r_G2 <= G2;
				r_P2 <= P2;
				r_C08 <= C08;
				Sat_2 <= Sat;
				Sub_2 <= Sub;
				Inc_2 <= Inc;
				Avg_2 <= Avg;
				U_2 <= U;
			end if;
			-- En_2 follows En on every clock edge, enabled or not
			En_2 <= En;
		end if;
	end process;

	-- Second stage: purely combinational.  The sensitivity list names
	-- exactly the signals the process reads; Clk, Rst and En_2 are not
	-- read here and are therefore not listed.
	stage_2 : process (r_Y1, r_Z1, r_S2, r_T2, r_G2, r_P2, r_C08,
					   Sat_2, Sub_2, Inc_2, Avg_2, U_2)
		variable Y1, Z1 : std_ulogic_vector(WIDTH-1 downto 0);
		variable Y2, Z2 : std_ulogic_vector(WIDTH-1 downto 0);
		variable Y3, Z3 : std_ulogic_vector(WIDTH-1 downto 0);
		variable S2, T2 : std_ulogic_vector(WIDTH/4-1 downto 0);
		variable G2, P2 : std_ulogic_vector(WIDTH/16-1 downto 0);
		variable S3, I3 : std_ulogic_vector(WIDTH/16-1 downto 0);
		variable cv : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable sv : std_ulogic_vector(WIDTH-1 downto 0);
		variable hv : std_ulogic_vector(WIDTH-1 downto 0);
		variable lv : std_ulogic_vector(WIDTH-1 downto 0);
		variable mm : std_ulogic_vector(1 downto 0);
	begin
		-- local copies of the stage boundary signals
		Y1 := r_Y1;
		Z1 := r_Z1;
		S2 := r_S2;
		T2 := r_T2;
		G2 := r_G2;
		P2 := r_P2;

		-- 16-bit partial results
		-- Y2: group carry-in '0', Z2: group carry-in '1'; each 4-bit
		-- slice picks the `+0' or `+1' partial sum from stage 1
		-- d=7
		for i in WIDTH/4-1 downto 0 loop
			if to_X01(S2(i)) = '1' then
				Y2(4*i+3 downto 4*i) := Z1(4*i+3 downto 4*i);
			else
				Y2(4*i+3 downto 4*i) := Y1(4*i+3 downto 4*i);
			end if;
			if to_X01(T2(i)) = '1' then
				Z2(4*i+3 downto 4*i) := Z1(4*i+3 downto 4*i);
			else
				Z2(4*i+3 downto 4*i) := Y1(4*i+3 downto 4*i);
			end if;
		end loop;

		-- third-level carry look-ahead
		-- I3(k): a carry entering at the low end of the SIMD element
		--        propagates all the way into 16-bit chunk k
		-- S3(k): the lower chunks of the same element generate a carry
		--        into chunk k
		-- d=8
		for i in WIDTH/64-1 downto 0 loop
			-- d=8
			if to_X01(U_2(2)) = '1' then
				-- 64-bit elements: chain runs through all four chunks
				I3(4*i+0) := '1';
				I3(4*i+1) := P2(4*i+0);
				I3(4*i+2) := P2(4*i+1) and P2(4*i+0);
				I3(4*i+3) := P2(4*i+2) and P2(4*i+1) and P2(4*i+0);
			elsif to_X01(U_2(1)) = '1' then
				-- 32-bit elements: chain restarts at chunk 2
				I3(4*i+0) := '1';
				I3(4*i+1) := P2(4*i+0);
				I3(4*i+2) := '1';
				I3(4*i+3) := P2(4*i+2);
			else
				-- 16-bit or 8-bit elements: every chunk starts a chain
				I3(4*i+0) := '1';
				I3(4*i+1) := '1';
				I3(4*i+2) := '1';
				I3(4*i+3) := '1';
			end if;
			-- U_2(1) enables carries across 16-bit boundaries,
			-- U_2(2) enables carries across the 32-bit boundary
			-- d=8
			S3(4*i+0) := '0';
			S3(4*i+1) := (U_2(1) and G2(4*i+0));
			S3(4*i+2) := (U_2(2) and G2(4*i+1))
				or (U_2(2) and P2(4*i+1) and G2(4*i+0));
			S3(4*i+3) := (U_2(1) and G2(4*i+2))
				or (U_2(2) and P2(4*i+2) and G2(4*i+1))
				or (U_2(2) and P2(4*i+2) and P2(4*i+1) and G2(4*i+0));
		end loop;

		-- 64-bit result
		-- Y3: result without increment, Z3: result with increment
		-- d=9
		for i in WIDTH/16-1 downto 0 loop
--pragma synthesis_off
			-- the two select terms are expected to be mutually exclusive
			assert to_X01(I3(i) and S3(i)) /= '1'
				report "IAdd: I3 and S3 are both '1' for 16-bit chunk "
					& integer'image(i)
				severity error;
--pragma synthesis_on
			if to_X01(S3(i)) = '1' then
				-- a carry is generated below: chunk gets +1 either way
				Y3(16*i+15 downto 16*i) := Z2(16*i+15 downto 16*i);
				Z3(16*i+15 downto 16*i) := Z2(16*i+15 downto 16*i);
			elsif to_X01(I3(i)) = '1' then
				-- only an increment would reach this chunk
				Y3(16*i+15 downto 16*i) := Y2(16*i+15 downto 16*i);
				Z3(16*i+15 downto 16*i) := Z2(16*i+15 downto 16*i);
			else
				-- nothing reaches this chunk
				Y3(16*i+15 downto 16*i) := Y2(16*i+15 downto 16*i);
				Z3(16*i+15 downto 16*i) := Y2(16*i+15 downto 16*i);
			end if;
		end loop;

		-- result after increment (if any)
		-- d=10
		if to_X01(Inc_2) = '1' then
			lv := Z3;
		else
			lv := Y3;
		end if;

		-- carry and saturate vectors
		-- cv: carry out of every SIMD element, stored at the index of
		--     the element's lowest byte (all other entries '0')
		-- sv: that carry replicated across all bits of the element
		-- d=10
		if to_X01(U_2(2)) = '1' then
			-- 64 bit
			-- d=9
			cv := (others => '0');
			for i in WIDTH/64-1 downto 0 loop
				cv(8*i) := G2(4*i+3)
					or (P2(4*i+3) and G2(4*i+2))
					or (P2(4*i+3) and P2(4*i+2) and G2(4*i+1))
					or (P2(4*i+3) and P2(4*i+2) and P2(4*i+1) and G2(4*i+0))
					or (P2(4*i+3) and P2(4*i+2) and P2(4*i+1) and P2(4*i+0) and Inc_2);
			end loop;
			for i in WIDTH-1 downto 0 loop
				sv(i) := cv(8*(i/64));
			end loop;
		elsif to_X01(U_2(1)) = '1' then
			-- 32 bit
			-- d=8
			cv := (others => '0');
			for i in WIDTH/32-1 downto 0 loop
				cv(4*i) := G2(2*i+1)
					or (P2(2*i+1) and G2(2*i+0))
					or (P2(2*i+1) and P2(2*i+0) and Inc_2);
			end loop;
			for i in WIDTH-1 downto 0 loop
				sv(i) := cv(4*(i/32));
			end loop;
		elsif to_X01(U_2(0)) = '1' then
			-- 16 bit
			-- d=6
			cv := (others => '0');
			for i in WIDTH/16-1 downto 0 loop
				cv(2*i) := G2(i) or (P2(i) and Inc_2);
			end loop;
			for i in WIDTH-1 downto 0 loop
				sv(i) := cv(2*(i/16));
			end loop;
		else
			-- 8 bit
			-- (byte carries were already computed in stage 1)
			-- d=6
			cv := r_C08;
			for i in WIDTH-1 downto 0 loop
				sv(i) := cv(1*(i/8));
			end loop;
		end if;

		-- high output vector
		-- Must be formed here, before lv is inverted/saturated below:
		-- the average modes use the raw sum in lv.
		-- d=12
		if to_X01(Avg_2) /= '1' then
			-- carry output: bit 0 of every byte position gets cv
			-- d=10
			hv := (others => '0');
			for i in WIDTH/8-1 downto 0 loop
				hv(8*i) := cv(i);
			end loop;
		elsif to_X01(U_2(2)) = '1' then
			-- average, 64 bit: shift the sum right by one, then put
			-- (Sub xor carry) into the top bit of every element
			hv(WIDTH-2 downto 0) := lv(WIDTH-1 downto 1);
			for i in WIDTH/64-1 downto 0 loop
				hv(64*i+63) := Sub_2 xor cv(8*i);
			end loop;
		elsif to_X01(U_2(1)) = '1' then
			-- average, 32 bit
			hv(WIDTH-2 downto 0) := lv(WIDTH-1 downto 1);
			for i in WIDTH/32-1 downto 0 loop
				hv(32*i+31) := Sub_2 xor cv(4*i);
			end loop;
		elsif to_X01(U_2(0)) = '1' then
			-- average, 16 bit
			hv(WIDTH-2 downto 0) := lv(WIDTH-1 downto 1);
			for i in WIDTH/16-1 downto 0 loop
				hv(16*i+15) := Sub_2 xor cv(2*i);
			end loop;
		else
			-- average, 8 bit
			hv(WIDTH-2 downto 0) := lv(WIDTH-1 downto 1);
			for i in WIDTH/8-1 downto 0 loop
				hv(8*i+7) := Sub_2 xor cv(i);
			end loop;
		end if;

		-- low output vector
		-- mm = Sub & Sat:
		--  "11": invert the sum and force elements with a carry to 0
		--  "10": invert the sum (completes `not ((not A) + B)')
		--  "01": force elements with a carry to all ones
		--  anything else (including 'X' after to_X01): sum unchanged
		-- d=12
		mm(1) := to_X01(Sub_2);
		mm(0) := to_X01(Sat_2);
		case mm is
			when "11" => lv := lv nor sv;
			when "10" => lv := not lv;
			when "01" => lv := lv or sv;
			when others => null;
		end case;

		-- outputs
		Yh <= hv;
		Yl <= lv;
	end process;
end Behave_1;

-- vi: set ts=4 sw=4 equalprg="fmt -72 -p--": please
