-- afun.vhdl -- F-CPU 64-bit Additional Functions Unit (aka A Fun Unit ;-)
-- Copyright (C) 2003 Michael Riepe <michael@stud.uni-hannover.de>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

-- @(#) $Id: afun.vhdl,v 1.5 2003/04/12 15:22:21 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;

use work.Bit_Manipulation.all;
use work.Generic_Adder.all;
use work.Misc.all;

-- AFun: pipelined population-count unit.
--
-- In architecture Behave_1 the operand (A, or A xor B when Hamming is
-- '1') has its set bits counted per 8-, 16-, 32- and 64-bit chunk.  Each
-- result port carries one count per chunk in the low-order bits of that
-- chunk; all remaining bits of the port are driven to '0'.
entity AFun is
	generic (
		-- operand/result width in bits; must be a multiple of 64
		-- (checked by the assertion below when simulating)
		WIDTH : natural := 64
	);
	port (
		-- operand inputs
		A : in std_ulogic_vector(WIDTH-1 downto 0);
		B : in std_ulogic_vector(WIDTH-1 downto 0);
		-- mode switches
		-- '1': count the bits of A xor B; otherwise count the bits of A
		Hamming : in std_ulogic;
		-- SIMD mode switches (currently unused)
		U : in std_ulogic_vector(2 downto 0);
		-- clock/reset inputs
		-- Clk: pipeline registers load on the rising edge
		Clk : in std_ulogic;
		-- Rst: '1' clears the pipeline registers asynchronously
		Rst : in std_ulogic;
		-- En: load enable of the first pipeline stage; a registered copy
		-- travels along the pipeline and enables the following stages
		En : in std_ulogic;
	-- result outputs (Behave_1): Popc08 is derived from the stage-1
	-- registers, Popc16 and Popc32 from the stage-2 registers and
	-- Popc64 from the stage-3 register
		Popc08 : out std_ulogic_vector(WIDTH-1 downto 0);
		Popc16 : out std_ulogic_vector(WIDTH-1 downto 0);
		Popc32 : out std_ulogic_vector(WIDTH-1 downto 0);
		Popc64 : out std_ulogic_vector(WIDTH-1 downto 0)
	);
--pragma synthesis_off
begin
	-- simulation-only sanity check of the generic
	assert WIDTH mod 64 = 0
		report "width of AFun must be an integer multiple of 64"
		severity failure;
--pragma synthesis_on
end AFun;

architecture Behave_1 of AFun is
	-- Population count of every 4-bit chunk of A.
	-- A is first copied into a descending, zero-based vector; chunk k
	-- (bits 4k+3 downto 4k of that copy) produces the 3-bit count in
	-- result bits 3k+2 downto 3k.  The result is 3*(A'length/4) bits wide.
	-- d=3 (t=4)
	function count4 (A : in std_ulogic_vector) return std_ulogic_vector is
		constant L : natural := A'length;
		constant CHUNKS : natural := L/4;
		variable src : std_ulogic_vector(L-1 downto 0);
		-- first row: sum/carry of bits (1,0) and of bits (3,2)
		variable lo_sum, lo_cry : std_ulogic;
		variable hi_sum, hi_cry : std_ulogic;
		-- second row: sum/carry of the two sums and of the two carries
		variable sum_sum, sum_cry : std_ulogic;
		variable cry_sum, cry_cry : std_ulogic;
		variable cnt : std_ulogic_vector(3*CHUNKS-1 downto 0);
	begin
--pragma synthesis_off
		assert L mod 4 = 0;
--pragma synthesis_on
		src := A;
		for k in 0 to CHUNKS-1 loop
			-- first row: half adders
			-- d=1
			lo_sum := src(4*k+1) xor src(4*k+0);	-- t=2
			lo_cry := src(4*k+1) and src(4*k+0);	-- t=1
			hi_sum := src(4*k+3) xor src(4*k+2);	-- t=2
			hi_cry := src(4*k+3) and src(4*k+2);	-- t=1
			-- second row: half adders
			-- d=2
			sum_sum := hi_sum xor lo_sum;	-- t=4
			sum_cry := hi_sum and lo_sum;	-- t=3
			cry_sum := hi_cry xor lo_cry;	-- t=3
			cry_cry := hi_cry and lo_cry;	-- t=2
			-- third row: OR gates
			-- d=3
			-- sum_cry and cry_sum are never '1' together (a pair that
			-- produces a carry has sum '0'), so OR merges them safely
			cnt(3*k+0) := sum_sum;
			cnt(3*k+1) := cry_sum or sum_cry;	-- t=4
			cnt(3*k+2) := cry_cry;
		end loop;
		return cnt;
	end count4;

	-- stage 1 -> 2
	-- r1_G0/r1_P0: AND / XOR of the two per-nibble counts of every byte
	-- (4 bits per byte), loaded by popcount_1
	-- r1_En: registered copy of En; load enable for the stage-2 registers
	signal r1_G0, r1_P0 : std_ulogic_vector(WIDTH/2-1 downto 0);
	signal r1_En : std_ulogic;

	-- stage 2 -> 3
	-- intermediate terms of the 16-bit adder, loaded by popcount_2:
	-- r2_P0 is the XOR of its operands, r2_S0 the first output of CSV and
	-- r2_G1 the first output of CLA (one bit per 4-bit group)
	-- r2_En: registered copy of r1_En; load enable for the stage-3 register
	signal r2_P0 : std_ulogic_vector(WIDTH/2-1 downto 0);
	signal r2_S0 : std_ulogic_vector(WIDTH/2-1 downto 0);
	signal r2_G1 : std_ulogic_vector(WIDTH/8-1 downto 0);
	signal r2_En : std_ulogic;

	-- stage 3 -> 4
	-- r3_Y1: per-32-bit-chunk counts (8 bits per chunk), loaded by
	-- popcount_3
	-- r3_En: registered copy of r2_En; it is written by popcount_3 but
	-- not read by any process body in this architecture
	signal r3_Y1 : std_ulogic_vector(WIDTH/4-1 downto 0);
	signal r3_En : std_ulogic;
begin
	-- Pipeline stage 1.
	-- Picks the operand (A, or A xor B when Hamming is '1'), counts the
	-- set bits of every nibble with count4 and prepares the addition of
	-- the two nibble counts of each byte: the AND and XOR of the two
	-- counts are stored in r1_G0 / r1_P0 on a rising Clk edge while En
	-- is '1'.  Rst = '1' clears the stage registers asynchronously.
	popcount_1 : process (A, B, Hamming, Clk, Rst, En)
		constant BYTES : natural := WIDTH/8;
		variable operand : std_ulogic_vector(WIDTH-1 downto 0);
		-- three count bits per nibble of operand
		variable nib_cnt : std_ulogic_vector(3*(WIDTH/4)-1 downto 0);
		-- per byte: count of the upper / lower nibble, zero-extended to 4 bits
		variable hi_cnt, lo_cnt : std_ulogic_vector(WIDTH/2-1 downto 0);
	begin
		-- d=2
		operand := A;
		if to_X01(Hamming) = '1' then
			operand := A xor B;
		end if;
		-- d=5
		nib_cnt := count4(operand);
		hi_cnt := (others => '0');
		lo_cnt := (others => '0');
		for k in 0 to BYTES-1 loop
			-- nibble 2k+1 is the upper, nibble 2k the lower half of byte k
			hi_cnt(4*k+2 downto 4*k) := nib_cnt(6*k+5 downto 6*k+3);
			lo_cnt(4*k+2 downto 4*k) := nib_cnt(6*k+2 downto 6*k);
		end loop;

		if to_X01(Rst) = '1' then
			r1_En <= '0';
			r1_P0 <= (others => '0');
			r1_G0 <= (others => '0');
		elsif rising_edge(Clk) then
			r1_En <= En;
			if to_X01(En) = '1' then
				-- d=6
				r1_G0 <= hi_cnt and lo_cnt;
				r1_P0 <= hi_cnt xor lo_cnt;
			end if;
		end if;
	end process;

	-- Pipeline stage 2.
	-- Completes the per-byte addition started in stage 1 (using CLA and
	-- CSV from work.Generic_Adder) and drives Popc08 with the result.
	-- It then starts the 16-bit addition of neighbouring byte counts and
	-- stores its intermediate terms in r2_P0 / r2_S0 / r2_G1 on a rising
	-- Clk edge while r1_En is '1'.  Rst = '1' clears the stage registers
	-- asynchronously.
	popcount_2 : process (r1_G0, r1_P0, Clk, Rst, r1_En)
		-- terms of the 8-bit adder (from stage 1) and of the 16-bit adder
		variable gen8, prop8 : std_ulogic_vector(WIDTH/2-1 downto 0);
		variable gen16, prop16 : std_ulogic_vector(WIDTH/2-1 downto 0);
		-- outputs of CLA and CSV
		variable grp_g, grp_p : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable csv_s, csv_t : std_ulogic_vector(WIDTH/2-1 downto 0);
		-- per-byte counts, 4 bits per byte
		variable cnt8, cnt8_alt : std_ulogic_vector(WIDTH/2-1 downto 0);
		variable hi_cnt, lo_cnt : std_ulogic_vector(WIDTH/2-1 downto 0);
		variable out08 : std_ulogic_vector(WIDTH-1 downto 0);
	begin
		-- d=6
		gen8 := r1_G0;
		prop8 := r1_P0;
		-- d=8
		-- (grp_g/grp_p of this call are overwritten by the second CLA
		-- call below before anything reads them)
		CLA(gen8, prop8, grp_g, grp_p);
		CSV(gen8, prop8, csv_s, csv_t);
		-- d=9
		cnt8 := prop8 xor csv_s;
		-- computed as in the later stages, but not consumed in this one
		cnt8_alt := prop8 xor csv_t;
		-- 8-bit output
		out08 := (others => '0');
		for k in 0 to WIDTH/8-1 loop
			out08(8*k+3 downto 8*k) := cnt8(4*k+3 downto 4*k);
		end loop;
		Popc08 <= out08;
		-- 16-bit adder
		hi_cnt := (others => '0');
		lo_cnt := (others => '0');
		for k in 0 to WIDTH/16-1 loop
			hi_cnt(8*k+3 downto 8*k) := cnt8(8*k+7 downto 8*k+4);
			lo_cnt(8*k+3 downto 8*k) := cnt8(8*k+3 downto 8*k+0);
		end loop;
		-- d=10
		gen16 := hi_cnt and lo_cnt;
		prop16 := hi_cnt xor lo_cnt;
		-- d=12
		CLA(gen16, prop16, grp_g, grp_p);
		CSV(gen16, prop16, csv_s, csv_t);

		if to_X01(Rst) = '1' then
			r2_En <= '0';
			r2_G1 <= (others => '0');
			r2_S0 <= (others => '0');
			r2_P0 <= (others => '0');
		elsif rising_edge(Clk) then
			r2_En <= r1_En;
			if to_X01(r1_En) = '1' then
				r2_P0 <= prop16;
				r2_S0 <= csv_s;
				r2_G1 <= grp_g;
			end if;
		end if;
	end process;

	-- Pipeline stage 3.
	-- Finishes the 16-bit addition from the stage-2 registers and drives
	-- Popc16 (four sum bits plus r2_G1 of the low nibble as bit 4 of each
	-- 16-bit chunk).  It then adds the two 16-bit counts of every 32-bit
	-- chunk, drives Popc32 and stores the 32-bit counts in r3_Y1 on a
	-- rising Clk edge while r2_En is '1'.  Rst = '1' clears the stage
	-- registers asynchronously.
	popcount_3 : process (r2_P0, r2_S0, r2_G1, Clk, Rst, r2_En)
		-- low four bits of the 16-bit counts, 8 bits per 16-bit chunk
		variable cnt16 : std_ulogic_vector(WIDTH/2-1 downto 0);
		-- operands and terms of the 32-bit adder, 8 bits per 32-bit chunk
		variable hi_cnt, lo_cnt : std_ulogic_vector(WIDTH/4-1 downto 0);
		variable gen, prop : std_ulogic_vector(WIDTH/4-1 downto 0);
		-- outputs of CLA and CSV
		variable grp_g, grp_p : std_ulogic_vector(WIDTH/16-1 downto 0);
		variable csv_s, csv_t : std_ulogic_vector(WIDTH/4-1 downto 0);
		variable cnt32, cnt32_alt : std_ulogic_vector(WIDTH/4-1 downto 0);
		variable out16, out32 : std_ulogic_vector(WIDTH-1 downto 0);
	begin
		-- d=12: inputs are the stage-2 registers r2_P0, r2_S0, r2_G1
		-- d=13
		cnt16 := r2_P0 xor r2_S0;
		-- 16-bit output
		out16 := (others => '0');
		for k in 0 to WIDTH/16-1 loop
			out16(16*k+3 downto 16*k) := cnt16(8*k+3 downto 8*k);
			out16(16*k+4) := r2_G1(2*k);
		end loop;
		Popc16 <= out16;
		-- 32-bit adder
		hi_cnt := (others => '0');
		lo_cnt := (others => '0');
		for k in 0 to WIDTH/32-1 loop
			-- upper 16-bit chunk of 32-bit chunk k
			hi_cnt(8*k+3 downto 8*k) := cnt16(16*k+11 downto 16*k+8);
			hi_cnt(8*k+4) := r2_G1(4*k+2);
			-- lower 16-bit chunk of 32-bit chunk k
			lo_cnt(8*k+3 downto 8*k) := cnt16(16*k+ 3 downto 16*k+0);
			lo_cnt(8*k+4) := r2_G1(4*k+0);
		end loop;
		-- d=14
		gen := hi_cnt and lo_cnt;
		prop := hi_cnt xor lo_cnt;
		-- d=16
		CLA(gen, prop, grp_g, grp_p);
		CSV(gen, prop, csv_s, csv_t);
		-- d=17
		cnt32 := prop xor csv_s;
		cnt32_alt := prop xor csv_t;
		-- d=18
		-- where the CLA output bit of the low nibble is '1', the upper
		-- nibble is taken from the alternative sum
		for k in 0 to WIDTH/32-1 loop
			if to_X01(grp_g(2*k)) = '1' then
				cnt32(8*k+7 downto 8*k+4) := cnt32_alt(8*k+7 downto 8*k+4);
			end if;
		end loop;
		-- 32-bit output
		out32 := (others => '0');
		for k in 0 to WIDTH/32-1 loop
			out32(32*k+5 downto 32*k) := cnt32(8*k+5 downto 8*k);
		end loop;
		Popc32 <= out32;

		if to_X01(Rst) = '1' then
			r3_En <= '0';
			r3_Y1 <= (others => '0');
		elsif rising_edge(Clk) then
			r3_En <= r2_En;
			if to_X01(r2_En) = '1' then
				r3_Y1 <= cnt32;
			end if;
		end if;
	end process;

	-- Pipeline stage 4.
	-- Adds the two 32-bit counts of every 64-bit chunk held in r3_Y1 and
	-- drives Popc64 with the 7-bit result.  The body contains no
	-- registers: although Clk, Rst and r3_En appear in the sensitivity
	-- list, only r3_Y1 is read.
	popcount_4 : process (r3_Y1, Clk, Rst, r3_En)
		-- operands and terms of the 64-bit adder, 8 bits per 64-bit chunk
		variable hi_cnt, lo_cnt : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable gen, prop : std_ulogic_vector(WIDTH/8-1 downto 0);
		-- outputs of CLA and CSV
		variable grp_g, grp_p : std_ulogic_vector(WIDTH/32-1 downto 0);
		variable csv_s, csv_t : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable cnt64, cnt64_alt : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable out64 : std_ulogic_vector(WIDTH-1 downto 0);
	begin
		-- d=18: input is the stage-3 register r3_Y1
		-- 64-bit adder
		hi_cnt := (others => '0');
		lo_cnt := (others => '0');
		for k in 0 to WIDTH/64-1 loop
			hi_cnt(8*k+5 downto 8*k) := r3_Y1(16*k+13 downto 16*k+8);
			lo_cnt(8*k+5 downto 8*k) := r3_Y1(16*k+ 5 downto 16*k+0);
		end loop;
		-- d=19
		gen := hi_cnt and lo_cnt;
		prop := hi_cnt xor lo_cnt;
		-- d=21
		CLA(gen, prop, grp_g, grp_p);
		CSV(gen, prop, csv_s, csv_t);
		-- d=22
		cnt64 := prop xor csv_s;
		cnt64_alt := prop xor csv_t;
		-- d=23
		-- where the CLA output bit of the low nibble is '1', the upper
		-- nibble is taken from the alternative sum
		for k in 0 to WIDTH/64-1 loop
			if to_X01(grp_g(2*k)) = '1' then
				cnt64(8*k+7 downto 8*k+4) := cnt64_alt(8*k+7 downto 8*k+4);
			end if;
		end loop;
		-- 64-bit output
		out64 := (others => '0');
		for k in 0 to WIDTH/64-1 loop
			out64(64*k+6 downto 64*k) := cnt64(8*k+6 downto 8*k);
		end loop;
		Popc64 <= out64;
	end process;
end Behave_1;

-- vi: set ts=4 sw=4 equalprg="fmt -72 -p--": please
