-- shuffle64.vhdl -- 64-Bit F-CPU Bit Shuffling Unit
-- Copyright (C) 2001 - 2003 Michael Riepe <michael@stud.uni-hannover.de>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

-- @(#) $Id: shuffle64.vhdl,v 1.32 2003/03/25 14:05:17 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;
use work.Bit_Manipulation.all;

-- Shuffle64: bit and byte shuffling unit operating on two WIDTH-bit
-- operands.  Exactly one of the operating-mode inputs is expected to be
-- asserted at a time; Y carries the main result and Z the secondary
-- (double-width / "high half") result.
entity Shuffle64 is
	generic (
		-- operand width in bits; the assertion at the end of this entity
		-- rejects every value other than 64
		WIDTH : natural := 64
	);
	port (
		-- shiftee (first data operand)
		A : in std_ulogic_vector(WIDTH-1 downto 0);
		-- shift count; architecture Behave_1 also uses it as the second
		-- data operand / byte selector of the byte-wise modes
		B : in std_ulogic_vector(WIDTH-1 downto 0);
		-- common shift count on/off switch
		CommonShiftCount : in std_ulogic;
		-- operating mode (mutually exclusive)
		ShiftL : in std_ulogic;
		ShiftR : in std_ulogic;
		ShiftRA : in std_ulogic;
		RotL : in std_ulogic;
		RotR : in std_ulogic;
		Bitrev : in std_ulogic;
		Byterev : in std_ulogic;
		Permute : in std_ulogic;
		Mix : in std_ulogic;
		Expand : in std_ulogic;
		Cshift : in std_ulogic;
		-- SIMD mode flags; architecture Behave_1 decodes "000" as 8-bit,
		-- "001" as 16-bit, "011" as 32-bit and "111" as 64-bit chunks
		U : in std_ulogic_vector(2 downto 0);
		-- clock/reset/enable inputs (unused)
		Clk : in std_ulogic;
		Rst : in std_ulogic;
		En : in std_ulogic;
	--
		-- output
		Y : out std_ulogic_vector(WIDTH-1 downto 0);
		-- alt. output (for double-width shifts)
		Z : out std_ulogic_vector(WIDTH-1 downto 0)
	);
-- The passive entity statement part below is hidden from synthesis; it
-- only guards simulation against an unsupported WIDTH.
--pragma synthesis_off
begin
	assert WIDTH = 64
		report "WIDTH must be 64"
		severity failure;
--pragma synthesis_on
end Shuffle64;

architecture Behave_1 of Shuffle64 is
	-- One stage of the omega network.
	--
	-- A holds the w data bits, B holds w/2 control bits.  Output bit `pos'
	-- is taken from one of the two neighbouring input bits of a pair:
	--   * lower half  (2*pos <  w): pair (2*pos, 2*pos+1), control B(pos);
	--     control '1' picks the odd bit, anything else the even bit.
	--   * upper half  (2*pos >= w): pair (2*pos-w, 2*pos-w+1), control
	--     B(pos - w/2); control '1' picks the even bit, anything else the
	--     odd bit.
	-- The result is returned with range (w-1 downto 0).
	function omega_1 (A, B : in std_ulogic_vector) return std_ulogic_vector is
		constant w : natural := A'length;
		alias data : std_ulogic_vector(w-1 downto 0) is A;
		alias ctrl : std_ulogic_vector(B'length-1 downto 0) is B;
		variable result : std_ulogic_vector(w-1 downto 0);
		-- input index used when the control bit is not '1' / is '1'
		variable straight, crossed : natural;
	begin
--pragma synthesis_off
		assert A'length = w;
		assert 2 * B'length = w;
--pragma synthesis_on
		for pos in result'reverse_range loop
			if 2 * pos < w then
				straight := 2 * pos;
				crossed := straight + 1;
			else
				crossed := 2 * pos - w;
				straight := crossed + 1;
			end if;
			-- both members of a pair share the control bit straight/2
			if to_X01(ctrl(straight / 2)) /= '1' then
				result(pos) := data(straight);
			else
				result(pos) := data(crossed);
			end if;
		end loop;
		return result;
	end omega_1;

	-- Expands an n-bit count A into a 2**n-bit mask.
	--
	-- For plain '0'/'1' inputs, bit i of the result is '1' exactly when
	-- i is less than the unsigned value of A (e.g. "10" -> "0011").
	-- Counts of up to three bits are written out as explicit gate
	-- equations; wider counts recurse on the low n-1 bits and combine the
	-- half-size mask with the most significant count bit.
	function shift_mask (A : in std_ulogic_vector) return std_ulogic_vector is
		constant n : natural := A'length;
		constant full : natural := 2 ** n;
		constant half : natural := full / 2;
		alias cnt : std_ulogic_vector(n-1 downto 0) is A;
		variable mask : std_ulogic_vector(full-1 downto 0);
		variable sub : std_ulogic_vector(half-1 downto 0);
	begin
		case n is
			when 0 =>
				mask(0) := '0';
			when 1 =>
				mask(0) := cnt(0);
				mask(1) := '0';
			when 2 =>
				mask(0) := cnt(1) or cnt(0);
				mask(1) := cnt(1);
				mask(2) := cnt(1) and cnt(0);
				mask(3) := '0';
			when 3 =>
				mask(0) := cnt(2) or cnt(1) or cnt(0);
				mask(1) := cnt(2) or cnt(1);
				mask(2) := cnt(2) or (cnt(1) and cnt(0));
				mask(3) := cnt(2);
				mask(4) := cnt(2) and (cnt(1) or cnt(0));
				mask(5) := cnt(2) and cnt(1);
				mask(6) := cnt(2) and cnt(1) and cnt(0);
				mask(7) := '0';
			when others =>
				-- mask of the low n-1 count bits
				sub := shift_mask(cnt(n-2 downto 0));
				for i in sub'range loop
					-- lower half is all ones once the top count bit is set
					mask(i) := sub(i) or cnt(n-1);
					-- upper half is only reached when the top count bit is set
					mask(half+i) := sub(i) and cnt(n-1);
				end loop;
		end case;
		return mask;
	end shift_mask;

	-- Derives the control bits of one omega stage from a shift count.
	--
	-- B is the count slice for the stage (at least one bit wide); the
	-- result has 2**(B'length-1) bits.  Every result bit starts out as the
	-- top count bit XOR Rev.  For counts wider than one bit the result is
	-- then XORed with shift_mask of the lower count bits (each inverted
	-- when Rev or Right is set), and, when Rev or Right is '1', replaced
	-- by lshift(not result, 1, old top result bit).
	function omega_ctrl (B : in std_ulogic_vector;
						 Rev, Right : in std_ulogic) return std_ulogic_vector is
		constant nbits : natural := B'length;
		constant nctl : natural := 2**(nbits-1);
		alias count : std_ulogic_vector(nbits-1 downto 0) is B;
		variable flipped : std_ulogic_vector(nbits-1 downto 0);
		variable ctl : std_ulogic_vector(nctl-1 downto 0);
		variable invert : std_ulogic;
		variable carry : std_ulogic;
	begin
--pragma synthesis_off
		assert nbits > 0;
--pragma synthesis_on
		ctl := (others => count(nbits-1) xor Rev);
		-- a single count bit needs no mask at all
		if nbits <= 1 then
			return ctl;
		end if;

		-- conditionally invert the count
		invert := Rev or Right;
		for i in count'range loop
			flipped(i) := invert xor count(i);
		end loop;

		ctl := ctl xor shift_mask(flipped(nbits-2 downto 0));
		if to_X01(invert) = '1' then
			-- the old top bit is handed to lshift as its third argument
			carry := ctl(nctl-1);
			ctl := lshift(not ctl, 1, carry);
		end if;
		return ctl;
	end omega_ctrl;

	-- Six-stage omega network over a 64-bit word.
	--
	-- A is the data word, B the per-byte shift counts (same width as A),
	-- U the SIMD flags (at least three bits) and Rev/Right are passed on
	-- unchanged to omega_ctrl.  Stages 0..2 take their count bits from
	-- every byte of B and are always active; stages 3..5 take them from
	-- every 16/32/64-bit group of B and are gated by U(0), U(1) and U(2)
	-- respectively.  The data word is passed through omega_1 once per
	-- stage and returned with range (A'length-1 downto 0).
	function rotate (A : in std_ulogic_vector;
					 B : in std_ulogic_vector;
					 U : in std_ulogic_vector;
					 Rev, Right : in std_ulogic) return std_ulogic_vector is
		constant nbits : natural := A'length;
		constant nctl : natural := nbits / 2;
		constant nbytes : natural := nbits / 8;
		alias data : std_ulogic_vector(nbits-1 downto 0) is A;
		alias count : std_ulogic_vector(nbits-1 downto 0) is B;
		alias simd : std_ulogic_vector(U'length-1 downto 0) is U;
		-- data word travelling through the network
		variable net : std_ulogic_vector(nbits-1 downto 0);
		-- control word of the current stage
		variable ctl : std_ulogic_vector(nctl-1 downto 0);
		-- control bits of one byte / group (only the low `span' bits used)
		variable grp : std_ulogic_vector(nctl-1 downto 0);
		variable span : natural;
		variable ngroups : natural;
		variable base : natural;
	begin
--pragma synthesis_off
		assert nbits = 64;
		assert B'length = nbits;
		assert U'length >= 3;
--pragma synthesis_on
		net := data;

		-- stages 0..2: one count per byte, never gated
		for stage in 0 to 2 loop
			span := 2**stage;
			for byte in 0 to nbytes-1 loop
				base := 8 * byte;
				grp(span-1 downto 0) :=
					omega_ctrl(count(base+stage downto base), Rev, Right);
				-- interleave: control bit k of every byte sits next to
				-- control bit k of the other bytes
				for k in 0 to span-1 loop
					ctl(nbytes*k + byte) := grp(k);
				end loop;
			end loop;
			-- stretch the nbytes*span collected bits to the full control width
			ctl := bit_duplicate(ctl(nbytes*span-1 downto 0), nctl/8/span);
			net := omega_1(net, ctl);
		end loop;

		-- stages 3..5: one count per 2*span-bit group, gated by U
		for stage in 3 to 5 loop
			span := 2**stage;
			ngroups := nctl / span;
			for g in 0 to ngroups-1 loop
				base := 2 * span * g;
				grp(span-1 downto 0) :=
					omega_ctrl(count(base+stage downto base), Rev, Right);
				for k in 0 to span-1 loop
					ctl(ngroups*k + g) := grp(k) and simd(stage-3);
				end loop;
			end loop;
			net := omega_1(net, ctl);
		end loop;
		return net;
	end rotate;

	-- Byte-wide 16:1 multiplexer.
	--
	-- Selector values 0..7 return byte 0..7 of aa, values 8..15 return
	-- byte 0..7 of bb.  If any selector bit is something other than a
	-- plain '0' or '1', all eight result bits are 'X' (don't care).
	function ab_sel (aa : in std_ulogic_vector(WIDTH-1 downto 0);
					 bb : in std_ulogic_vector(WIDTH-1 downto 0);
					 se : in std_ulogic_vector(3 downto 0)) return std_ulogic_vector is
		variable picked : std_ulogic_vector(7 downto 0);
		variable code : natural;
		variable known : boolean;
	begin
		-- decode the selector, most significant bit first
		code := 0;
		known := true;
		for n in se'range loop
			case se(n) is
				when '0' => code := 2 * code;
				when '1' => code := 2 * code + 1;
				when others => known := false;
			end case;
		end loop;

		picked := (others => 'X');
		if known then
			-- exactly one of the sixteen candidates matches
			for n in 0 to 7 loop
				if code = n then
					picked := aa(8*n+7 downto 8*n);
				elsif code = n + 8 then
					picked := bb(8*n+7 downto 8*n);
				end if;
			end loop;
		end if;
		return picked;
	end ab_sel;

	-- Operating mode as an 11-bit vector, one bit per mode input of the
	-- entity: bit 10 = ShiftL down to bit 0 = Cshift.
	subtype mode_type is std_ulogic_vector(10 downto 0);

	-- One constant per legal (one-hot) mode value.  They are used as
	-- choices of the case statement in the output mux.
	constant MODE_SHIFTL  : mode_type := "10000000000";
	constant MODE_SHIFTR  : mode_type := "01000000000";
	constant MODE_SHIFTRA : mode_type := "00100000000";
	constant MODE_ROTL    : mode_type := "00010000000";
	constant MODE_ROTR    : mode_type := "00001000000";
	constant MODE_BITREV  : mode_type := "00000100000";
	constant MODE_BYTEREV : mode_type := "00000010000";
	constant MODE_PERMUTE : mode_type := "00000001000";
	constant MODE_MIX     : mode_type := "00000000100";
	constant MODE_EXPAND  : mode_type := "00000000010";
	constant MODE_CSHIFT  : mode_type := "00000000001";

	-- collected mode inputs
	signal Mode : mode_type;

	-- results of the shift / rotate / bitrev process (main and alt. output)
	signal Y_Bitwise  : std_ulogic_vector(WIDTH-1 downto 0);
	signal Y_BitExt   : std_ulogic_vector(WIDTH-1 downto 0);
	-- results of the bytewise process (main and alt. output)
	signal Y_Bytewise : std_ulogic_vector(WIDTH-1 downto 0);
	signal Y_ByteExt  : std_ulogic_vector(WIDTH-1 downto 0);
begin
	-- mode vector: the eleven mode inputs packed MSB first,
	-- ShiftL ends up in bit 10 and Cshift in bit 0
	Mode <= ShiftL & ShiftR & ShiftRA & RotL & RotR & Bitrev
		  & Byterev & Permute & Mix & Expand & Cshift;

	-- shift / rotate / bitrev
	--
	-- Computes Y_Bitwise (main result) and Y_BitExt (alternate result)
	-- for the bit-wise modes.  CommonShiftCount is part of the sensitivity
	-- list because the nested procedure `bitwise' reads it; without it a
	-- change of CommonShiftCount alone would not re-evaluate the process
	-- in simulation and the outputs would go stale.
	process (A, B, U, Mode, CommonShiftCount)
		-- Builds the mask for one SIMD chunk from that chunk's count bits.
		-- The result has 2**B'length bits.  When Rev or Right is set the
		-- count is inverted before shift_mask, and the mask is replaced
		-- by lshift(not mask, 1) afterwards.
		function chunk_mask (B : in std_ulogic_vector;
							 Rev, Right : in std_ulogic) return std_ulogic_vector is
			constant w : natural := B'length;
			constant pw : natural := 2 ** w;
			alias bb : std_ulogic_vector(w-1 downto 0) is B;
			variable bx : std_ulogic_vector(w-1 downto 0);
			variable yy : std_ulogic_vector(pw-1 downto 0);
		begin
			bx := (others => Rev or Right);
			yy := shift_mask(bb xor bx);
			if to_X01(Rev or Right) = '1' then
				yy := lshift(not yy, 1);
			end if;
			return yy;
		end chunk_mask;

		-- Bit-wise operation on one 64-bit slice.
		--   A    : data word
		--   B    : shift counts (one per byte; see the SIMD replicator)
		--   U    : SIMD flags
		--   M    : mode vector (bit 10 = ShiftL ... bit 0 = Cshift)
		--   Y    : rotate result, or shift result with vacated bits filled
		--   Z    : bits selected by the shift mask ("overflow bits"); all
		--          'X' for the rotate modes
		-- Also reads the entity port CommonShiftCount directly.
		procedure bitwise (A, B, U : in std_ulogic_vector;
						   M : in mode_type;
						   Y, Z : out std_ulogic_vector) is
			constant L : natural := A'length;
			alias aa : std_ulogic_vector(L-1 downto 0) is A;
			alias bb : std_ulogic_vector(L-1 downto 0) is B;
			variable bx : std_ulogic_vector(L-1 downto 0);
			variable ee : std_ulogic_vector(L-1 downto 0);
			variable mm : std_ulogic_vector(L-1 downto 0);
			variable uu : std_ulogic_vector(U'length-1 downto 0);
			variable xx : std_ulogic_vector(L-1 downto 0);
			variable Right : std_ulogic;
		begin
--pragma synthesis_off
			assert L = 64;
			assert A'length = L;
			assert B'length = L;
			assert Y'length = L;
			assert Z'length = L;
--pragma synthesis_on

			-- inputs
			-- d=0
			uu := to_X01(U);
			-- d=1
			-- set for ShiftR (bit 9), ShiftRA (bit 8) and RotR (bit 6)
			Right := M(9) or M(8) or M(6);

			-- SIMD replicator: copy the low byte of every chunk (or of the
			-- whole word when a common shift count is requested) into all
			-- bytes of that chunk, so that every byte lane sees the count
			-- of the chunk it belongs to
			if (uu(2) or to_X01(CommonShiftCount)) = '1' then
				for i in WIDTH-1 downto 0 loop
					bx(i) := bb(i - i rem 64 + i rem 8);
				end loop;
			elsif uu(1) = '1' then
				for i in WIDTH-1 downto 0 loop
					bx(i) := bb(i - i rem 32 + i rem 8);
				end loop;
			elsif uu(0) = '1' then
				for i in WIDTH-1 downto 0 loop
					bx(i) := bb(i - i rem 16 + i rem 8);
				end loop;
			else
				bx := bb;
			end if;

			-- omega network (SIMD left/right rotate); M(5) is Bitrev
			xx := rotate(aa, bx, uu, M(5), Right);

			-- SIMD mask replication / sign extension
			-- ee is non-zero only for ShiftRA (M(8)); it is ANDed with
			-- the result of bit_duplicate/bit_extract on aa, presumably
			-- the top bit of every chunk spread over the chunk -- confirm
			-- against work.Bit_Manipulation
			ee := (others => M(8));
			if uu(2) = '1' then
				mm(63 downto  0) := chunk_mask(bx( 5 downto  0), M(5), Right);
				ee := ee and bit_duplicate(bit_extract(aa, 64, 63), 64);
			elsif uu(1) = '1' then
				mm(31 downto  0) := chunk_mask(bx( 4 downto  0), M(5), Right);
				mm(63 downto 32) := chunk_mask(bx(36 downto 32), M(5), Right);
				ee := ee and bit_duplicate(bit_extract(aa, 32, 31), 32);
			elsif uu(0) = '1' then
				mm(15 downto  0) := chunk_mask(bx( 3 downto  0), M(5), Right);
				mm(31 downto 16) := chunk_mask(bx(19 downto 16), M(5), Right);
				mm(47 downto 32) := chunk_mask(bx(35 downto 32), M(5), Right);
				mm(63 downto 48) := chunk_mask(bx(51 downto 48), M(5), Right);
				ee := ee and bit_duplicate(bit_extract(aa, 16, 15), 16);
			else
				mm( 7 downto  0) := chunk_mask(bx( 2 downto  0), M(5), Right);
				mm(15 downto  8) := chunk_mask(bx(10 downto  8), M(5), Right);
				mm(23 downto 16) := chunk_mask(bx(18 downto 16), M(5), Right);
				mm(31 downto 24) := chunk_mask(bx(26 downto 24), M(5), Right);
				mm(39 downto 32) := chunk_mask(bx(34 downto 32), M(5), Right);
				mm(47 downto 40) := chunk_mask(bx(42 downto 40), M(5), Right);
				mm(55 downto 48) := chunk_mask(bx(50 downto 48), M(5), Right);
				mm(63 downto 56) := chunk_mask(bx(58 downto 56), M(5), Right);
				ee := ee and bit_duplicate(bit_extract(aa,  8,  7),  8);
			end if;

			-- select operation; M(6) is RotR, M(7) is RotL
			if to_X01(M(6) or M(7)) = '1' then
				-- rotate operation
				Y := xx;
				-- there are no overflow bits
				Z := (WIDTH-1 downto 0 => 'X');
			else
				-- shift operation: masked positions take the fill value
				Y := (xx and not mm) or (ee and mm);
				-- overflow bits
				Z := xx and mm;
			end if;
		end bitwise;

		variable yy : std_ulogic_vector(WIDTH-1 downto 0);
		variable zz : std_ulogic_vector(WIDTH-1 downto 0);
	begin
		-- independent chunks
		for i in WIDTH/64-1 downto 0 loop
			bitwise(A(64*i+63 downto 64*i), B(64*i+63 downto 64*i), U, Mode,
				yy(64*i+63 downto 64*i), zz(64*i+63 downto 64*i));
		end loop;

		-- output signals
		Y_Bitwise <= yy;
		Y_BitExt <= zz;
	end process;

	-- bytewise stuff (byterev, permute, mix, expand, cshift)
	--
	-- Every result byte is picked by ab_sel from the sixteen bytes of A
	-- and B.  `route' holds one 4-bit selector per result byte: nibbles
	-- 0..7 (bits 31..0) steer the bytes of Y_Bytewise, nibbles 8..15
	-- (bits 63..32) those of Y_ByteExt.  Selector values 0..7 address A,
	-- 8..15 address B.
	process (A, B, U, Mode)
		variable sel : std_ulogic_vector(7 downto 0);
		variable route : std_ulogic_vector(WIDTH-1 downto 0);
		variable lo_out : std_ulogic_vector(WIDTH-1 downto 0);
		variable hi_out : std_ulogic_vector(WIDTH-1 downto 0);
	begin
		-- sel = Byterev & Permute & Mix & Expand & Cshift & U(2 downto 0)
		sel := to_X01(Mode(4 downto 0) & U);
		case sel is
			-- Y = byterev(A), Z = byterev(B)
			when "10000000" => route := X"FEDCBA9876543210"; -- no-op
			when "10000001" => route := X"EFCDAB8967452301";
			when "10000011" => route := X"CDEF89AB45670123";
			when "10000111" => route := X"89ABCDEF01234567";
			-- Y = permute(A, B), Z = sdup(A, B)
			when "01000000" =>
				route := X"0000000000000000";
				for n in 0 to WIDTH/8-1 loop
					for b in 0 to 2 loop
						-- Y: byte index taken from the matching byte of B
						route(4*n+b) := B(8*n+b);
						-- Z: byte index taken from byte 0 of B
						route(WIDTH/2+4*n+b) := B(b);
					end loop;
				end loop;
			when "01000001" =>
				route := X"1010101010101010";
				for n in 0 to WIDTH/8-1 loop
					for b in 0 to 1 loop
						route(4*n+1+b) := B(16*(n/2)+b);
						route(WIDTH/2+4*n+1+b) := B(b);
					end loop;
				end loop;
			when "01000011" =>
				route := X"3210321032103210";
				for n in 0 to WIDTH/8-1 loop
					route(4*n+2) := B(32*(n/4));
					route(WIDTH/2+4*n+2) := B(0);
				end loop;
			when "01000111" =>
				-- no-op: nothing to do here for a 64-bit unit
				route := X"7654321076543210";
			-- Y = mixl(A, B), Z = mixh(A, B)
			-- XXX: swap input registers?
			when "00100000" => route := X"F7E6D5C4B3A29180";
			when "00100001" => route := X"FE76DC54BA329810";
			when "00100011" => route := X"FEDC7654BA983210";
			when "00100111" => route := X"FEDCBA9876543210"; -- XXX: undefined
			-- Y = expandl(A, B), Z = expandh(A, B)
			-- XXX: swap input registers?
			when "00010000" => route := X"F7D5B391E6C4A280";
			when "00010001" => route := X"FE76BA32DC549810";
			when "00010011" => route := X"FEDC7654BA983210";
			when "00010111" => route := X"FEDCBA9876543210"; -- XXX: undefined
			-- Y = cshiftl(A, B); Z = cshiftr(A, B);
			when "00001000" => route := X"8765432165432108";
			when "00001001" => route := X"9876543254321098";
			when "00001011" => route := X"BA9876543210BA98";
			when "00001111" => route := X"FEDCBA98FEDCBA98";
			-- don't care
			when others => route := (others => 'X');
		end case;

		-- one 16:1 byte mux per result byte
		for n in 0 to WIDTH/8-1 loop
			lo_out(8*n+7 downto 8*n) := ab_sel(A, B, route(4*n+ 3 downto 4*n+ 0));
			hi_out(8*n+7 downto 8*n) := ab_sel(A, B, route(4*n+35 downto 4*n+32));
		end loop;

		-- output signals
		Y_Bytewise <= lo_out;
		Y_ByteExt <= hi_out;
	end process;

	-- output mux
	--
	-- Forwards the bit-wise or the byte-wise result pair to Y/Z depending
	-- on which single mode bit is set; any other mode value (none, more
	-- than one, or unknown bits) drives both outputs to 'X'.
	process (Mode, Y_Bitwise, Y_BitExt, Y_Bytewise, Y_ByteExt)
		variable onehot : mode_type;
	begin
		onehot := to_X01(Mode);
		case onehot is
			when MODE_SHIFTL | MODE_SHIFTR | MODE_SHIFTRA
			   | MODE_ROTL | MODE_ROTR | MODE_BITREV =>
				-- bitwise operations
				Y <= Y_Bitwise;
				Z <= Y_BitExt;
			when MODE_BYTEREV | MODE_PERMUTE | MODE_MIX
			   | MODE_EXPAND | MODE_CSHIFT =>
				-- bytewise operations
				Y <= Y_Bytewise;
				Z <= Y_ByteExt;
			when others =>
				-- don't care
				Y <= (others => 'X');
				Z <= (others => 'X');
		end case;
	end process;
end Behave_1;

-- vi: set ts=4 sw=4 equalprg="fmt -72 -p--": please
