-- imul64.vhdl - F-CPU 64-Bit SIMD Integer Multiplication Unit
-- Copyright (C) 2000 Michael Riepe <michael@stud.uni-hannover.de>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

-- $Id: imul64.vhdl,v 1.34 2000/12/11 04:56:35 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;

entity IMul64 is
	generic (
		-- number of gate delays per pipeline stage (0 = not pipelined)
		PIPE_AFTER : natural := 0;
		-- extra gate delays assumed in front of the first stage
		PIPE_DELAY : natural := 0
	);
	port (
		-- the two factors
		A : in std_ulogic_vector(63 downto 0);
		B : in std_ulogic_vector(63 downto 0);
		-- addend for the MAC modes; may be left unconnected
		X : in std_ulogic_vector(63 downto 0) := (others => '0');
		-- '1' enables the sign-correction terms (signed multiply)
		SignedMode : in std_ulogic := '0';
		-- MAC mode selects
		MacLo : in std_ulogic := '0';
		MacHi : in std_ulogic := '0';
		-- alternative (same-size) MAC version
		MacAlt : in std_ulogic := '0';
		-- SIMD mode switches, as usual
		U08 : in std_ulogic := '1';
		U16 : in std_ulogic := '1';
		U32 : in std_ulogic := '1';
		-- clock and reset for the optional pipeline registers
		Clk : in std_ulogic := '0';
		Rst : in std_ulogic := '0';

		-- results for 8-bit chunks, low/high halves (d=18)
		Y08l : out std_ulogic_vector(63 downto 0);
		Y08h : out std_ulogic_vector(63 downto 0);
		-- results for 16-bit chunks, low/high halves (d=24)
		Y16l : out std_ulogic_vector(63 downto 0);
		Y16h : out std_ulogic_vector(63 downto 0);
		-- results for 32-bit chunks, low/high halves (d=30)
		Y32l : out std_ulogic_vector(63 downto 0);
		Y32h : out std_ulogic_vector(63 downto 0);
		-- results for the 64-bit product, low/high halves (d=36)
		Y64l : out std_ulogic_vector(63 downto 0);
		Y64h : out std_ulogic_vector(63 downto 0)
	);
begin
	-- Both pipelining generics have to be even; elaboration is aborted
	-- otherwise.
	assert (PIPE_AFTER mod 2) = 0
		report "PIPE_AFTER must be an even number"
		severity failure;
	assert (PIPE_DELAY mod 2) = 0
		report "PIPE_DELAY must be an even number"
		severity failure;
end IMul64;

architecture Struct_1 of IMul64 is
	-- Component declarations.  The matching entities are not part of
	-- this file; each declaration has to agree with the entity it gets
	-- bound to.

	-- Basic gates: inputs A (B, C, D), output Y.  The function of each
	-- gate is presumably what its name says -- confirm against the gate
	-- library.
	component AND2
		port (A, B : in std_ulogic; Y : out std_ulogic);
	end component;
	component AND3
		port (A, B, C : in std_ulogic; Y : out std_ulogic);
	end component;
	component AND4
		port (A, B, C, D : in std_ulogic; Y : out std_ulogic);
	end component;
	component OR2
		port (A, B : in std_ulogic; Y : out std_ulogic);
	end component;
	component OR3
		port (A, B, C : in std_ulogic; Y : out std_ulogic);
	end component;
	component OR4
		port (A, B, C, D : in std_ulogic; Y : out std_ulogic);
	end component;
	component NOT1
		port (A : in std_ulogic; Y : out std_ulogic);
	end component;
	component XOR2
		port (A, B : in std_ulogic; Y : out std_ulogic);
	end component;
	component XOR3
		port (A, B, C : in std_ulogic; Y : out std_ulogic);
	end component;
	-- used below to produce the carry next to an XOR3 sum; presumably
	-- a 2-of-3 majority gate -- confirm
	component MAJ23
		port (A, B, C : in std_ulogic; Y : out std_ulogic);
	end component;
	-- Line compressor: takes ILINES lines of WIDTH bits each and
	-- delivers OLINES lines of WIDTH bits each.  The exact bit layout
	-- of A and Y is defined by the ReduceTree entity, not here.
	component ReduceTree
		generic (
			WIDTH : natural := 128;
			ILINES : natural := 4;
			OLINES : natural := 3
		);
		port (
			A : in std_ulogic_vector(WIDTH*ILINES-1 downto 0);
			Y : out std_ulogic_vector(WIDTH*OLINES-1 downto 0)
		);
	end component;
	-- Pipelined adder used for the 32-bit and 64-bit results; only its
	-- Y output is connected in this architecture.
	component Piped_CIAdd
		generic (
			WIDTH : natural := 64;
			PIPE_AFTER : natural := 0;	-- gates per stage; 0 means no pipelining
			PIPE_DELAY : natural := 0	-- additional delay before 1st stage
		);
		port (
			A : in std_ulogic_vector(WIDTH-1 downto 0);
			B : in std_ulogic_vector(WIDTH-1 downto 0);
			Clk : in std_ulogic := '0';
			Rst : in std_ulogic := '0';
			Y : out std_ulogic_vector(WIDTH-1 downto 0);
			C : out std_ulogic_vector(WIDTH-1 downto 0);
			G : out std_ulogic;
			P : out std_ulogic
		);
	end component;
	-- Optional pipeline register of WIDTH bits; DOREG is driven from
	-- do_pipe() throughout this file.  Presumably a plain wire when
	-- DOREG is false -- confirm against the PipeReg entity.
	component PipeReg
		generic (WIDTH : natural := 64; DOREG : boolean := false);
		port (
			Clk, Rst : in std_ulogic;
			D : in std_ulogic_vector(WIDTH-1 downto 0);
			Q : out std_ulogic_vector(WIDTH-1 downto 0)
		);
	end component;

	-- Decides whether a pipeline register is placed at gate depth d.
	-- With PIPE_AFTER = 0 (no pipelining) the answer is always false;
	-- otherwise a register is placed wherever PIPE_DELAY + d is a
	-- multiple of PIPE_AFTER.
	function do_pipe (d : natural) return boolean is
	begin
		-- guard first, so the `mod' below never sees a zero divisor
		if PIPE_AFTER = 0 then
			return false;
		end if;
		return ((PIPE_DELAY + d) mod PIPE_AFTER) = 0;
	end do_pipe;

	-- WIDTH handed to every ReduceTree instance, i.e. the number of
	-- bits per matrix line
	constant w : natural := 128;

	-- 1x1-bit product matrix, written at index 64*(i+j)+j by the
	-- `input' block.  Positions that block never drives keep the '0'
	-- initial value.
	signal v0 : std_ulogic_vector(64*w-1 downto 0) := (others => '0');
	-- output of level_0 (products reduced to 32 lines)
	signal v1 : std_ulogic_vector(32*w-1 downto 0);
	-- output of signed_corr (correction/MAC terms reduced to 16 lines)
	signal vc : std_ulogic_vector(16*w-1 downto 0);
	-- output of level_1 (v1 and vc merged)
	signal v2 : std_ulogic_vector(32*w-1 downto 0);
	-- output of level_2; tapped by res_08
	signal v3 : std_ulogic_vector(16*w-1 downto 0);
	-- output of level_3; tapped by res_16
	signal v4 : std_ulogic_vector( 8*w-1 downto 0);
	-- output of level_4; tapped by res_32
	signal v5 : std_ulogic_vector( 4*w-1 downto 0);
	-- output of level_5; tapped by res_64
	signal v6 : std_ulogic_vector( 2*w-1 downto 0);
begin
	-- partial products: one gated AND per (A bit, B bit) pair
	-- d=1
	input : block
		type gate_matrix is array(7 downto 0, 7 downto 0) of std_ulogic;
		signal byte_gate : gate_matrix;
	begin
		-- SIMD gate array: the entry for a pair of byte numbers says
		-- which mode switch must be '1' for products between those two
		-- bytes to be kept ('1' on the diagonal = always kept)
		-- d=0
		byte_gate <= (
			--    7    6    5    4    3    2    1    0
			7 => ('1', U08, U16, U16, U32, U32, U32, U32),
			6 => (U08, '1', U16, U16, U32, U32, U32, U32),
			5 => (U16, U16, '1', U08, U32, U32, U32, U32),
			4 => (U16, U16, U08, '1', U32, U32, U32, U32),
			3 => (U32, U32, U32, U32, '1', U08, U16, U16),
			2 => (U32, U32, U32, U32, U08, '1', U16, U16),
			1 => (U32, U32, U32, U32, U16, U16, '1', U08),
			0 => (U32, U32, U32, U32, U16, U16, U08, '1')
		);

		-- product matrix: A(j) and B(i), gated by the table entry for
		-- their bytes, go to v0 index 64*(i+j)+j
		-- d=1
		outer : for j in 0 to 63 generate
			inner : for i in 0 to 63 generate
				mul_1x1 : AND3 port map (
					A => A(j),
					B => B(i),
					C => byte_gate(i/8, j/8),
					Y => v0(64*(j+i)+j)
				);
			end generate;
		end generate;
	end block;

	-- 4:2 reducer for the product matrix: 64 -> 48 -> 32 lines, with
	-- an optional pipeline register in front and after each step
	-- d=6
	level_0 : block
		signal pp_in : std_ulogic_vector(64*w-1 downto 0);
		signal mid, mid_q : std_ulogic_vector(48*w-1 downto 0);
		signal pp_out : std_ulogic_vector(32*w-1 downto 0);
	begin
		-- input register
		-- d=2
		reg_0 : PipeReg
			generic map (DOREG => do_pipe(2), WIDTH => 64*w)
			port map (Clk => Clk, Rst => Rst, D => v0, Q => pp_in);

		-- 64 -> 48 lines
		-- d=4
		red_1 : ReduceTree
			generic map (ILINES => 64, OLINES => 48, WIDTH => w)
			port map (A => pp_in, Y => mid);
		reg_1 : PipeReg
			generic map (DOREG => do_pipe(4), WIDTH => 48*w)
			port map (Clk => Clk, Rst => Rst, D => mid, Q => mid_q);

		-- 48 -> 32 lines
		-- d=6
		red_2 : ReduceTree
			generic map (ILINES => 48, OLINES => 32, WIDTH => w)
			port map (A => mid_q, Y => pp_out);
		reg_2 : PipeReg
			generic map (DOREG => do_pipe(6), WIDTH => 32*w)
			port map (Clk => Clk, Rst => Rst, D => pp_out, Q => v1);
	end block;

	-- signed/unsigned correction, MAC input
	--
	-- Fills a second bit matrix `vb' with the sign-correction terms and
	-- the gated X addend, then compresses it from 32 to 16 lines into
	-- `vc'.  Every index into vb has the form 32*c + r; c is presumably
	-- the bit column (weight) and r the line within that column --
	-- confirm against the ReduceTree layout.  The r values are chosen
	-- so that no two generators below write the same vb element.
	-- d=6
	signed_corr : block
		-- bitwise inverted copies of A and B
		signal an, bn : std_ulogic_vector(63 downto 0);
		-- per-byte enable for the correction terms
		signal corr08 : std_ulogic_vector(7 downto 0);
		-- inverted SIMD mode switches
		signal un08, un16, un32 : std_ulogic;
		-- correction/MAC matrix; elements nobody drives stay at '0'
		signal vb : std_ulogic_vector(32*w-1 downto 0) := (others => '0');
	begin
		-- negated size flags
		-- d=1
		un_08 : NOT1 port map (U08, un08);
		un_16 : NOT1 port map (U16, un16);
		un_32 : NOT1 port map (U32, un32);

		-- 8-bit correction gate vector: byte 7 is always enabled,
		-- byte 3 when U32 is '0', bytes 5 and 1 when U16 is '0', and
		-- the remaining bytes (6, 4, 2, 0) when U08 is '0'
		-- d=1
		corr08 <= (7 => '1', 3 => un32, 5|1 => un16, others => un08);

		-- both operands, inverted (for subtraction)
		-- d=1
		invert : for i in 63 downto 0 generate
			inv_a : NOT1 port map (A(i), an(i));
			inv_b : NOT1 port map (B(i), bn(i));
		end generate;

		-- gated X input (standard F-CPU `widening' MAC instruction)
		-- X bit 16*i+j goes to c = 16*i+j when MacLo is set and to
		-- c = 16*i+j+64 when MacHi is set
		-- d=1
		input_x : for i in 0 to 3 generate
			bits : for j in 0 to 15 generate
				x_lo : AND2 port map (
					MacLo, X(16*i+j), vb(32*(16*i+j+ 0)+4*i+ 0+3)
				);
				x_hi : AND2 port map (
					MacHi, X(16*i+j), vb(32*(16*i+j+64)+4*i+16+3)
				);
			end generate;
		end generate;

		-- gated X input (MR's alternative `same-size' MAC instruction)
		-- Every generator below requires MacAlt; X is handled in four
		-- bit ranges (0..7, 8..15, 16..31, 32..63 of each chunk), the
		-- wider ranges additionally gated with U08, U16 and U32.
		-- d=2
		alt_input_x : block
			signal gate_mac : std_ulogic_vector(7 downto 0);
		begin
			-- per-byte enable: byte 0 always, byte 4 when U32 is '0',
			-- bytes 6 and 2 when U16 is '0', the others when U08 is '0'
			gate_mac <= (0 => '1', 4 => un32, 6|2 => un16, others => un08);

			-- bits 0..7 of byte i, enabled by gate_mac(i)
			x_08 : for i in 0 to 7 generate
				bits : for j in 0 to 7 generate
					x_b : AND3 port map (
						MacAlt, X(8*i+j), gate_mac(i),
						vb(32*(16*i+j)+ 4*i+ 0)
					);
				end generate;
			end generate;

			-- bits 8..15 of 16-bit chunk i, needs U08
			x_16 : for i in 0 to 3 generate
				bits : for j in 8 to 15 generate
					x_w : AND4 port map (
						MacAlt, X(16*i+j), gate_mac(2*i), U08,
						vb(32*(32*i+j)+ 8*i+ 4)
					);
				end generate;
			end generate;

			-- bits 16..31 of 32-bit chunk i, needs U16
			x_32 : for i in 0 to 1 generate
				bits : for j in 16 to 31 generate
					x_d : AND4 port map (
						MacAlt, X(32*i+j), gate_mac(4*i), U16,
						vb(32*(64*i+j)+16*i+ 8)
					);
				end generate;
			end generate;

			-- bits 32..63 of the single 64-bit chunk, needs U32
			x_64 : for i in 0 to 0 generate
				bits : for j in 32 to 63 generate
					x_q : AND4 port map (
						MacAlt, X(64*i+j), gate_mac(8*i), U32,
						vb(32*(128*i+j)+32*i+16)
					);
				end generate;
			end generate;
		end block;

		-- 8-bit correction vectors
		-- For byte i: when the top bit of B's byte is set, the inverted
		-- A byte is written 8 columns up (corr_a), and vice versa
		-- (corr_b).  ci_1/ci_2 add a bit at c = 16*i+8 when exactly one
		-- top bit is set, or at c = 16*i+9 when both are; they are only
		-- active when U08 is '0'.
		-- d=2
		mul08 : for i in 0 to 7 generate
			bl : block
				signal g, t1, t2 : std_ulogic;
			begin
				-- no size qualification at this level
				gate : g <= SignedMode;
				-- t1: exactly one top bit set; t2: both set
				tmp1 : XOR2 port map (A(8*i+7), B(8*i+7), t1);
				tmp2 : AND2 port map (A(8*i+7), B(8*i+7), t2);
				ci_1 : AND3 port map (
					t1, g, un08, vb(32*(16*i+0+8)+4*i+0)
				);
				ci_2 : AND3 port map (
					t2, g, un08, vb(32*(16*i+1+8)+4*i+0)
				);
				bits : for j in 0 to 7 generate
					corr_a : AND4 port map (
						B(8*i+7), g, corr08(i), an(8*i+j),
						vb(32*(16*i+j+8)+4*i+1)
					);
					corr_b : AND4 port map (
						A(8*i+7), g, corr08(i), bn(8*i+j),
						vb(32*(16*i+j+8)+4*i+2)
					);
				end generate;
			end block;
		end generate;

		-- 16-bit correction vectors
		-- Only the low 8 bits of each 16-bit chunk are generated here.
		-- The high 8 bits come from mul08 instance 2*i+1, which reads
		-- the same top bit and the same corr08 entry and writes the
		-- next 8 columns.  ci_1/ci_2 are only active when U16 is '0'.
		-- d=2
		mul16 : for i in 0 to 3 generate
			bl : block
				signal g, t1, t2 : std_ulogic;
			begin
				-- requires U08
				gate : AND2 port map (SignedMode, U08, g);
				tmp1 : XOR2 port map (A(16*i+15), B(16*i+15), t1);
				tmp2 : AND2 port map (A(16*i+15), B(16*i+15), t2);
				ci_1 : AND3 port map (
					t1, g, un16, vb(32*(32*i+0+16)+8*i+0)
				);
				ci_2 : AND3 port map (
					t2, g, un16, vb(32*(32*i+1+16)+8*i+0)
				);
				bits : for j in 0 to 7 generate
					corr_a : AND4 port map (
						B(16*i+15), g, corr08(2*i+1), an(16*i+j),
						vb(32*(32*i+j+16)+8*i+1)
					);
					corr_b : AND4 port map (
						A(16*i+15), g, corr08(2*i+1), bn(16*i+j),
						vb(32*(32*i+j+16)+8*i+2)
					);
				end generate;
			end block;
		end generate;

		-- 32-bit correction vectors
		-- Covers the low 16 bits of each 32-bit chunk; the remaining
		-- bits are covered by mul16 instance 2*i+1 and mul08 instance
		-- 4*i+3 in the same way as described for mul16.  ci_1/ci_2
		-- are only active when U32 is '0'.
		-- d=2
		mul32 : for i in 0 to 1 generate
			bl : block
				signal g, t1, t2 : std_ulogic;
			begin
				-- requires U16
				gate : AND2 port map (SignedMode, U16, g);
				tmp1 : XOR2 port map (A(32*i+31), B(32*i+31), t1);
				tmp2 : AND2 port map (A(32*i+31), B(32*i+31), t2);
				ci_1 : AND3 port map (
					t1, g, un32, vb(32*(64*i+0+32)+16*i+4)
				);
				ci_2 : AND3 port map (
					t2, g, un32, vb(32*(64*i+1+32)+16*i+4)
				);
				bits : for j in 0 to 15 generate
					corr_a : AND4 port map (
						B(32*i+31), g, corr08(4*i+3), an(32*i+j),
						vb(32*(64*i+j+32)+16*i+5)
					);
					corr_b : AND4 port map (
						A(32*i+31), g, corr08(4*i+3), bn(32*i+j),
						vb(32*(64*i+j+32)+16*i+6)
					);
				end generate;
			end block;
		end generate;

		-- 64-bit correction vectors
		-- Covers the low 32 bits; the upper bits are covered by
		-- mul32 instance 1, mul16 instance 3 and mul08 instance 7.
		-- g already requires U32, so ci_1/ci_2 need no extra size gate
		-- and use AND2.
		-- d=2
		mul64 : for i in 0 to 0 generate
			bl : block
				signal g, t1, t2 : std_ulogic;
			begin
				-- requires U32
				gate : AND2 port map (SignedMode, U32, g);
				tmp1 : XOR2 port map (A(64*i+63), B(64*i+63), t1);
				tmp2 : AND2 port map (A(64*i+63), B(64*i+63), t2);
				ci_1 : AND2 port map (
					t1, g, vb(32*(128*i+0+64)+32*i+12)
				);
				ci_2 : AND2 port map (
					t2, g, vb(32*(128*i+1+64)+32*i+12)
				);
				bits : for j in 0 to 31 generate
					corr_a : AND4 port map (
						B(64*i+63), g, corr08(8*i+7), an(64*i+j),
						vb(32*(128*i+j+64)+32*i+13)
					);
					corr_b : AND4 port map (
						A(64*i+63), g, corr08(8*i+7), bn(64*i+j),
						vb(32*(128*i+j+64)+32*i+14)
					);
				end generate;
			end block;
		end generate;

		-- 4:2 reducer for vb: 32 -> 24 -> 16 lines, using the same
		-- register depths (2, 4, 6) as level_0 so that vc and v1 stay
		-- aligned
		-- d=6
		reduce : block
			signal t0 : std_ulogic_vector(32*w-1 downto 0);
			signal t1, t2 : std_ulogic_vector(24*w-1 downto 0);
			signal t3 : std_ulogic_vector(16*w-1 downto 0);
		begin
			-- d=2
			reg_0 : PipeReg
				generic map (WIDTH => 32*w, DOREG => do_pipe(2))
				port map (D => vb, Q => t0, Clk => Clk, Rst => Rst);

			-- d=4
			red_1 : ReduceTree
				generic map (WIDTH => w, ILINES => 32, OLINES => 24)
				port map (A => t0, Y => t1);
			reg_1 : PipeReg
				generic map (WIDTH => 24*w, DOREG => do_pipe(4))
				port map (D => t1, Q => t2, Clk => Clk, Rst => Rst);

			-- d=6
			red_2 : ReduceTree
				generic map (WIDTH => w, ILINES => 24, OLINES => 16)
				port map (A => t2, Y => t3);
			reg_2 : PipeReg
				generic map (WIDTH => 16*w, DOREG => do_pipe(6))
				port map (D => t3, Q => vc, Clk => Clk, Rst => Rst);
		end block;
	end block;

	-- 3:2 reducer (with irregular inputs)
	-- Step i of the generate loop takes four bits of v1 and two bits
	-- of vc and turns them into two sum bits (XOR3) and two carry bits
	-- (MAJ23).  The carry bits are stored 32 positions further up, in
	-- the slots 4*i+0 / 4*i+1 of step i+8.
	-- d=8
	level_1 : block
		signal merged : std_ulogic_vector(32*w-1 downto 0);
	begin
		red : for i in 0 to 8*w-1 generate
			-- the first 8 steps receive no carries: tie their slots low
			init : if i < 8 generate
				merged(4*i+1) <= '0';
				merged(4*i+0) <= '0';
			end generate;

			-- sum bits
			x1 : XOR3 port map (
				A => v1(4*i+0), B => v1(4*i+1), C => v1(4*i+2),
				Y => merged(4*i+2)
			);
			x2 : XOR3 port map (
				A => v1(4*i+3), B => vc(2*i+0), C => vc(2*i+1),
				Y => merged(4*i+3)
			);

			-- carry bits; the last 8 steps have nowhere to put them
			carry : if i < 8*(w-1) generate
				m1 : MAJ23 port map (
					A => v1(4*i+0), B => v1(4*i+1), C => v1(4*i+2),
					Y => merged(4*i+32)
				);
				m2 : MAJ23 port map (
					A => v1(4*i+3), B => vc(2*i+0), C => vc(2*i+1),
					Y => merged(4*i+33)
				);
			end generate;
		end generate;

		reg : PipeReg
			generic map (DOREG => do_pipe(8), WIDTH => 32*w)
			port map (Clk => Clk, Rst => Rst, D => merged, Q => v2);
	end block;

	-- 4:2 reducer: 32 -> 24 -> 16 lines (v2 to v3)
	-- d=12
	level_2 : block
		signal mid, mid_q : std_ulogic_vector(24*w-1 downto 0);
		signal lines_out : std_ulogic_vector(16*w-1 downto 0);
	begin
		-- 32 -> 24 lines
		-- d=10
		red_1 : ReduceTree
			generic map (ILINES => 32, OLINES => 24, WIDTH => w)
			port map (A => v2, Y => mid);
		reg_1 : PipeReg
			generic map (DOREG => do_pipe(10), WIDTH => 24*w)
			port map (Clk => Clk, Rst => Rst, D => mid, Q => mid_q);

		-- 24 -> 16 lines
		-- d=12
		red_2 : ReduceTree
			generic map (ILINES => 24, OLINES => 16, WIDTH => w)
			port map (A => mid_q, Y => lines_out);
		reg_2 : PipeReg
			generic map (DOREG => do_pipe(12), WIDTH => 16*w)
			port map (Clk => Clk, Rst => Rst, D => lines_out, Q => v3);
	end block;

	-- 4:2 reducer: 16 -> 12 -> 8 lines (v3 to v4)
	-- d=16
	level_3 : block
		signal mid, mid_q : std_ulogic_vector(12*w-1 downto 0);
		signal lines_out : std_ulogic_vector(8*w-1 downto 0);
	begin
		-- 16 -> 12 lines
		-- d=14
		red_1 : ReduceTree
			generic map (ILINES => 16, OLINES => 12, WIDTH => w)
			port map (A => v3, Y => mid);
		reg_1 : PipeReg
			generic map (DOREG => do_pipe(14), WIDTH => 12*w)
			port map (Clk => Clk, Rst => Rst, D => mid, Q => mid_q);

		-- 12 -> 8 lines
		-- d=16
		red_2 : ReduceTree
			generic map (ILINES => 12, OLINES => 8, WIDTH => w)
			port map (A => mid_q, Y => lines_out);
		reg_2 : PipeReg
			generic map (DOREG => do_pipe(16), WIDTH => 8*w)
			port map (Clk => Clk, Rst => Rst, D => lines_out, Q => v4);
	end block;

	-- 4:2 reducer: 8 -> 6 -> 4 lines (v4 to v5)
	-- d=20
	level_4 : block
		signal mid, mid_q : std_ulogic_vector(6*w-1 downto 0);
		signal lines_out : std_ulogic_vector(4*w-1 downto 0);
	begin
		-- 8 -> 6 lines
		-- d=18
		red_1 : ReduceTree
			generic map (ILINES => 8, OLINES => 6, WIDTH => w)
			port map (A => v4, Y => mid);
		reg_1 : PipeReg
			generic map (DOREG => do_pipe(18), WIDTH => 6*w)
			port map (Clk => Clk, Rst => Rst, D => mid, Q => mid_q);

		-- 6 -> 4 lines
		-- d=20
		red_2 : ReduceTree
			generic map (ILINES => 6, OLINES => 4, WIDTH => w)
			port map (A => mid_q, Y => lines_out);
		reg_2 : PipeReg
			generic map (DOREG => do_pipe(20), WIDTH => 4*w)
			port map (Clk => Clk, Rst => Rst, D => lines_out, Q => v5);
	end block;

	-- 4:2 reducer: 4 -> 3 -> 2 lines (v5 to v6)
	-- d=24
	level_5 : block
		signal mid, mid_q : std_ulogic_vector(3*w-1 downto 0);
		signal lines_out : std_ulogic_vector(2*w-1 downto 0);
	begin
		-- 4 -> 3 lines
		-- d=22
		red_1 : ReduceTree
			generic map (ILINES => 4, OLINES => 3, WIDTH => w)
			port map (A => v5, Y => mid);
		reg_1 : PipeReg
			generic map (DOREG => do_pipe(22), WIDTH => 3*w)
			port map (Clk => Clk, Rst => Rst, D => mid, Q => mid_q);

		-- 3 -> 2 lines
		-- d=24
		red_2 : ReduceTree
			generic map (ILINES => 3, OLINES => 2, WIDTH => w)
			port map (A => mid_q, Y => lines_out);
		reg_2 : PipeReg
			generic map (DOREG => do_pipe(24), WIDTH => 2*w)
			port map (Clk => Clk, Rst => Rst, D => lines_out, Q => v6);
	end block;

	-- 8-bit results (located in stage 3)
	-- For each of the 8 bytes, two 16-bit operands are picked out of
	-- v3 and added by a hand-pipelined adder; the low and high byte of
	-- the sum go to Y08l and Y08h.
	-- d=18
	res_08 : for i in 0 to 7 generate
		bl : block
			-- the two addends and their sum
			signal ta, tb, ty : std_ulogic_vector(15 downto 0);
		begin
			-- take lines 2*i and 2*i+1 of the 16 columns starting at
			-- column 16*i (v3 index = 16*column + line)
			-- d=12
			inputs : for j in 0 to 15 generate
				ta(j) <= v3(256*i+16*j+2*i+0);
				tb(j) <= v3(256*i+16*j+2*i+1);
			end generate;

			-- pipelined version of CIAddSmall
			-- Optional registers sit at depths 14 and 16; every path
			-- from ta/tb to ty crosses each of them exactly once.
			-- d=18
			adder : block
				-- first micro-stage
				-- bitwise generate (AND) and propagate (XOR)
				signal G0, P0 : std_ulogic_vector(15 downto 0);
				-- second micro-stage
				-- registered copies of G0/P0
				signal r_G0   : std_ulogic_vector(15 downto 0);
				signal r_P0   : std_ulogic_vector(15 downto 0);
				-- C1: carries inside each 4-bit group (group carry-in
				-- taken as '0'); I1: which bits of a group change when
				-- a carry enters the group
				signal C1, I1 : std_ulogic_vector(15 downto 0);
				-- generate/propagate of the four 4-bit groups
				signal G1, P1 : std_ulogic_vector( 3 downto 0);
				-- third micro-stage
				-- Y1: per-group sums; C2: correction for carries that
				-- enter a group from the groups below it
				signal Y1     : std_ulogic_vector(15 downto 0);
				signal C2     : std_ulogic_vector(15 downto 0);
			begin
				-- a row of half adders
				-- d=13
				half_adders : for j in 0 to 15 generate
					sum   : XOR2 port map (ta(j), tb(j), P0(j));
					carry : AND2 port map (ta(j), tb(j), G0(j));
				end generate;

				-- d=14
				reg_p0 : PipeReg
					generic map (WIDTH => 16, DOREG => do_pipe(14))
					port map (D => P0, Q => r_P0, Clk => Clk, Rst => Rst);
				reg_g0 : PipeReg
					generic map (WIDTH => 16, DOREG => do_pipe(14))
					port map (D => G0, Q => r_G0, Clk => Clk, Rst => Rst);

				-- carry-increment tree
				-- t1(4*j+k) is the AND of P0(4*j) .. P0(4*j+k)
				-- d=14
				inc_prop : block
					signal t1, t2 : std_ulogic_vector(15 downto 0);
				begin
					lp_1 : for j in 0 to 3 generate
						tmp0 : t1(4*j+0) <= P0(4*j+0);
						tmp1 : AND2 port map (
							P0(4*j+1), P0(4*j+0),
							t1(4*j+1)
						);
						tmp2 : AND3 port map (
							P0(4*j+2), P0(4*j+1), P0(4*j+0),
							t1(4*j+2)
						);
						tmp3 : AND4 port map (
							P0(4*j+3), P0(4*j+2), P0(4*j+1), P0(4*j+0),
							t1(4*j+3)
						);
					end generate;

					reg : PipeReg
						generic map (WIDTH => 16, DOREG => do_pipe(14))
						port map (D => t1, Q => t2, Clk => Clk, Rst => Rst);

					-- a carry into a group always flips its bit 0
					-- (hence the '1'); bit k flips when all lower bits
					-- propagate.  The full 4-bit AND is the group
					-- propagate.
					lp_2 : for j in 0 to 3 generate
						i_o : I1(4*j+3 downto 4*j) <= t2(4*j+2 downto 4*j) & '1';
						p_o : P1(j) <= t2(4*j+3);
					end generate;
				end block;

				-- carries inside each 4-bit group and group generate
				-- d=15
				gen_carry : block
					-- six product terms per group
					signal t1, t2 : std_ulogic_vector(23 downto 0);
				begin
					-- d=14
					lp_1 : for j in 0 to 3 generate
						tmp0 : AND2 port map (
							P0(4*j+1), G0(4*j+0),
							t1(6*j+0)
						);
						tmp1 : AND2 port map (
							P0(4*j+2), G0(4*j+1),
							t1(6*j+1)
						);
						tmp2 : AND3 port map (
							P0(4*j+2), P0(4*j+1), G0(4*j+0),
							t1(6*j+2)
						);
						tmp3 : AND2 port map (
							P0(4*j+3), G0(4*j+2),
							t1(6*j+3)
						);
						tmp4 : AND3 port map (
							P0(4*j+3), P0(4*j+2), G0(4*j+1),
							t1(6*j+4)
						);
						tmp5 : AND4 port map (
							P0(4*j+3), P0(4*j+2), P0(4*j+1), G0(4*j+0),
							t1(6*j+5)
						);
					end generate;

					-- d=14
					reg : PipeReg
						generic map (WIDTH => 24, DOREG => do_pipe(14))
						port map (D => t1, Q => t2, Clk => Clk, Rst => Rst);

					-- combine the product terms: C1(4*j+k) is the carry
					-- into bit k of group j, G1(j) the carry out of
					-- the group
					-- d=15
					lp_2 : for j in 0 to 3 generate
						co_0 : C1(4*j+0) <= '0';
						co_1 : C1(4*j+1) <= r_G0(4*j+0);
						co_2 : OR2 port map (
							r_G0(4*j+1), t2(6*j+0),
							C1(4*j+2)
						);
						co_3 : OR3 port map (
							r_G0(4*j+2), t2(6*j+1), t2(6*j+2),
							C1(4*j+3)
						);
						g_o  : OR4 port map (
							r_G0(4*j+3), t2(6*j+3), t2(6*j+4), t2(6*j+5),
							G1(j)
						);
					end generate;
				end block;

				-- per-group sums: propagate XOR in-group carry
				-- d=16
				y_1 : block
					signal t1 : std_ulogic_vector(15 downto 0);
				begin
					y_04 : for j in 0 to 15 generate
						y_04 : XOR2 port map (r_P0(j), C1(j), t1(j));
					end generate;

					reg : PipeReg
						generic map (WIDTH => 16, DOREG => do_pipe(16))
						port map (D => t1, Q => Y1, Clk => Clk, Rst => Rst);
				end block;

				-- corrections for carries crossing group boundaries;
				-- here j selects the bit within a group
				-- d=17
				inc_16 : block
					signal t1, t2 : std_ulogic_vector(23 downto 0);
				begin
					-- product terms of (carry into group) AND I1 bit:
					-- tmp0 for group 1, tmp1/tmp2 for group 2,
					-- tmp3..tmp5 for group 3
					-- d=16
					lp_1 : for j in 0 to 3 generate
						tmp0 : AND2 port map (
							G1(0), I1(j+4), t1(6*j+0)
						);
						tmp1 : AND2 port map (
							G1(1), I1(j+8), t1(6*j+1)
						);
						tmp2 : AND3 port map (
							P1(1), G1(0), I1(j+8), t1(6*j+2)
						);
						tmp3 : AND2 port map (
							G1(2), I1(j+12), t1(6*j+3)
						);
						tmp4 : AND3 port map (
							P1(2), G1(1), I1(j+12), t1(6*j+4)
						);
						tmp5 : AND4 port map (
							P1(2), P1(1), G1(0), I1(j+12), t1(6*j+5)
						);
					end generate;

					-- d=16
					reg : PipeReg
						generic map (WIDTH => 24, DOREG => do_pipe(16))
						port map (D => t1, Q => t2, Clk => Clk, Rst => Rst);

					-- group 0 has no carry-in, so its correction is '0'
					-- d=17
					lp_2 : for j in 0 to 3 generate
						co_0 : C2(j) <= '0';
						co_1 : C2(j+4) <= t2(6*j+0);
						co_2 : OR2 port map (
							t2(6*j+1), t2(6*j+2), C2(j+8)
						);
						co_3 : OR3 port map (
							t2(6*j+3), t2(6*j+4), t2(6*j+5), C2(j+12)
						);
					end generate;
				end block;

				-- final sum: per-group sum XOR cross-group correction
				-- d=18
				output : for j in 0 to 15 generate
					y_16 : XOR2 port map (Y1(j), C2(j), ty(j));
				end generate;
			end block;

			-- low byte of the sum to Y08l, high byte to Y08h
			res_lo : Y08l(8*i+7 downto 8*i) <= ty( 7 downto 0);
			res_hi : Y08h(8*i+7 downto 8*i) <= ty(15 downto 8);
		end block;
	end generate;

	-- 16-bit results (located in stage 3-4)
	-- For each of the 4 halfwords, two 32-bit operands are picked out
	-- of v4 and added; the low and high halfword of the sum go to Y16l
	-- and Y16h.
	-- d=24
	res_16 : for i in 0 to 3 generate
		--
		-- I gotta play some tricks here because CIAdd is too slow.
		--
		bl : block
			-- Building blocks of the adder; their entities are not part
			-- of this file, so what they compute is not visible here.
			component Piped_CIA_Row
				generic (WIDTH : natural := 64; PIPELINED : boolean := false);
				port (
					Gi : in std_ulogic_vector(WIDTH-1 downto 0);
					Pi : in std_ulogic_vector(WIDTH-1 downto 0);
					Clk : in std_ulogic := '0';
					Rst : in std_ulogic := '0';
					Co : out std_ulogic_vector(WIDTH-1 downto 0);
					Io : out std_ulogic_vector(WIDTH-1 downto 0);
					Go : out std_ulogic_vector((WIDTH-1)/4 downto 0);
					Po : out std_ulogic_vector((WIDTH-1)/4 downto 0)
				);
			end component;
			component Piped_CIA_Inc
				generic (
					WIDTH : natural := 64; STEP : natural := 4;
					PIPELINED : boolean := false
				);
				port (
					Yi : in std_ulogic_vector(WIDTH-1 downto 0);
					Ci : in std_ulogic_vector(WIDTH-1 downto 0);
					Ii : in std_ulogic_vector((WIDTH-1)/STEP downto 0);
					Cs : in std_ulogic_vector((WIDTH-1)/STEP downto 0);
					Clk : in std_ulogic := '0';
					Rst : in std_ulogic := '0';
					Yo : out std_ulogic_vector(WIDTH-1 downto 0);
					Co : out std_ulogic_vector(WIDTH-1 downto 0)
				);
			end component;

			-- Y1/C1: outputs of the 2-bit stage; Y2/C2: outputs of the
			-- first Piped_CIA_Inc
			signal Y1, C1 : std_ulogic_vector(31 downto 0);
			signal Y2, C2 : std_ulogic_vector(31 downto 0);
			-- generate/propagate per bit (0), per 2-bit group (1) and
			-- as delivered by the first Piped_CIA_Row (2)
			signal P0, G0 : std_ulogic_vector(31 downto 0);
			signal P1, G1 : std_ulogic_vector(15 downto 0);
			signal P2, G2 : std_ulogic_vector( 3 downto 0);

			-- the two addends and their sum
			signal ta, tb, ty : std_ulogic_vector(31 downto 0);
		begin
			-- take lines 2*i and 2*i+1 of the 32 columns starting at
			-- column 32*i (v4 index = 8*column + line)
			-- d=16
			inputs : for j in 0 to 31 generate
				ta(j) <= v4(256*i+8*j+2*i+0);
				tb(j) <= v4(256*i+8*j+2*i+1);
			end generate;

			-- bitwise propagate (XOR) and generate (AND)
			-- d=17
			half_adders : for j in 0 to 31 generate
				sum   : XOR2 port map (ta(j), tb(j), P0(j));
				carry : AND2 port map (ta(j), tb(j), G0(j));
			end generate;

			-- 2-bit partial results, optimized for low delay
			-- Y1 is the sum of each bit pair without a carry-in.  C1
			-- is the pair's increment pattern: bit 0 is constant '1',
			-- bit 1 is the propagate of bit 0.  G1 is built straight
			-- from ta/tb (t1, t2) rather than from P0(2*j+1), which
			-- keeps it at depth 18.
			-- d=18
			two_bit : for j in 0 to 15 generate
				bl : block
					signal t1, t2 : std_ulogic;
				begin
					y_0 : Y1(2*j+0) <= P0(2*j+0);
					y_1 : XOR2 port map (P0(2*j+1), G0(2*j+0), Y1(2*j+1));
					c_0 : C1(2*j+0) <= '1';
					c_1 : C1(2*j+1) <= P0(2*j+0);
					p_o : AND2 port map (P0(2*j+1), P0(2*j+0), P1(j));
					t_1 : AND3 port map (ta(2*j+1), ta(2*j+0), tb(2*j+0), t1);
					t_2 : AND3 port map (tb(2*j+1), ta(2*j+0), tb(2*j+0), t2);
					g_o : OR3  port map (G0(2*j+1), t1, t2, G1(j));
				end block;
			end generate;

			-- first carry level: works on the sixteen 2-bit groups
			level_1 : block
				signal Ct, It : std_ulogic_vector(15 downto 0);

				signal r_Y1, r_C1 : std_ulogic_vector(31 downto 0);
			begin
				-- extra pipeline registers for Y1/C1
				-- (same depth as the PIPELINED generic of cia_1)
				-- d=18
				reg_1 : PipeReg
					generic map (WIDTH => 32, DOREG => do_pipe(18))
					port map (D => Y1, Q => r_Y1, Clk => Clk, Rst => Rst);
				reg_2 : PipeReg
					generic map (WIDTH => 32, DOREG => do_pipe(18))
					port map (D => C1, Q => r_C1, Clk => Clk, Rst => Rst);

				-- d=20
				cia_1 : Piped_CIA_Row
					generic map (WIDTH => 32/2, PIPELINED => do_pipe(18))
					port map (
						Gi => G1, Pi => P1, Clk => Clk, Rst => Rst,
						Co => Ct, Io => It, Go => G2, Po => P2
					);

				-- d=22
				inc_1 : Piped_CIA_Inc
					generic map (WIDTH => 32, STEP => 2, PIPELINED => do_pipe(20))
					port map (
						Yi => r_Y1, Ci => r_C1, Ii => It, Cs => Ct,
						Clk => Clk, Rst => Rst, Yo => Y2, Co => C2
					);
			end block;

			-- second carry level: works on G2/P2 from cia_1 and
			-- delivers the final sum ty
			level_2 : block
				signal Ct, It : std_ulogic_vector(3 downto 0);
			begin
				-- group outputs Go/Po are not needed at the top level
				-- d=22
				cia_2 : Piped_CIA_Row
					generic map (WIDTH => 32/8, PIPELINED => do_pipe(20))
					port map (
						Gi => G2, Pi => P2, Clk => Clk, Rst => Rst,
						Co => Ct, Io => It, Go => open, Po => open
					);

				-- d=24
				inc_2 : Piped_CIA_Inc
					generic map (WIDTH => 32, STEP => 8, PIPELINED => do_pipe(22))
					port map (
						Yi => Y2, Ci => C2, Ii => It, Cs => Ct,
						Clk => Clk, Rst => Rst, Yo => ty, Co => open
					);
			end block;

			-- low halfword of the sum to Y16l, high halfword to Y16h
			res_lo : Y16l(16*i+15 downto 16*i) <= ty(15 downto  0);
			res_hi : Y16h(16*i+15 downto 16*i) <= ty(31 downto 16);
		end block;
	end generate;

	-- 32-bit results (located in stage 4-5)
	-- For each of the two words, a pair of 64-bit operands is picked
	-- out of v5 and added by Piped_CIAdd.
	-- d=30
	res_32 : for i in 0 to 1 generate
		bl : block
			signal op_a, op_b : std_ulogic_vector(63 downto 0);
			signal total : std_ulogic_vector(63 downto 0);
		begin
			-- lines 2*i and 2*i+1 of the 64 columns starting at
			-- column 64*i (v5 index = 4*column + line)
			-- d=20
			inputs : for j in 0 to 63 generate
				op_b(j) <= v5(256*i+4*j+2*i+1);
				op_a(j) <= v5(256*i+4*j+2*i+0);
			end generate;

			-- the adder is told that 20 gate delays (plus one, to make
			-- the offset odd) lie in front of it
			-- d=30
			adder : Piped_CIAdd
				generic map (
					PIPE_DELAY => PIPE_DELAY + 20 + 1,	-- must be odd
					PIPE_AFTER => PIPE_AFTER,
					WIDTH => 64
				)
				port map (
					Clk => Clk, Rst => Rst,
					A => op_a, B => op_b,
					Y => total,
					C => open, G => open, P => open
				);

			-- high word of the sum to Y32h, low word to Y32l
			res_hi : Y32h(32*i+31 downto 32*i) <= total(63 downto 32);
			res_lo : Y32l(32*i+31 downto 32*i) <= total(31 downto  0);
		end block;
	end generate;

	-- 64-bit results (located in stage 5-6)
	-- The two remaining 128-bit lines of v6 are added by Piped_CIAdd.
	-- d=36
	res_64 : for i in 0 to 0 generate
		bl : block
			signal op_a, op_b : std_ulogic_vector(127 downto 0);
			signal total : std_ulogic_vector(127 downto 0);
		begin
			-- lines 2*i and 2*i+1 of all 128 columns
			-- (v6 index = 2*column + line)
			-- d=24
			inputs : for j in 0 to 127 generate
				op_b(j) <= v6(256*i+2*j+2*i+1);
				op_a(j) <= v6(256*i+2*j+2*i+0);
			end generate;

			-- the adder is told that 24 gate delays (plus one, to make
			-- the offset odd) lie in front of it
			-- d=36
			adder : Piped_CIAdd
				generic map (
					PIPE_DELAY => PIPE_DELAY + 24 + 1,	-- must be odd
					PIPE_AFTER => PIPE_AFTER,
					WIDTH => 128
				)
				port map (
					Clk => Clk, Rst => Rst,
					A => op_a, B => op_b,
					Y => total,
					C => open, G => open, P => open
				);

			-- high half of the sum to Y64h, low half to Y64l
			res_hi : Y64h(64*i+63 downto 64*i) <= total(127 downto 64);
			res_lo : Y64l(64*i+63 downto 64*i) <= total( 63 downto  0);
		end block;
	end generate;
end Struct_1;

-- vi: set ts=4 sw=4 equalprg="fmt -72 -p--": please
