-- shuffle64.vhdl -- 64-Bit F-CPU Bit Shuffling Unit
-- Copyright (C) 2001 Michael Riepe <michael@stud.uni-hannover.de>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

-- $Id: shuffle64.vhdl,v 1.10 2001/11/26 19:39:18 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;
use work.Bit_Manipulation.all;

-- Shuffle64: interface of the 64-bit bit/byte shuffling unit.
-- Architecture Behave_1 contains no clocked process; Clk, Rst and En are
-- declared here but are not read by it.
entity Shuffle64 is
	generic (
		-- data path width in bits; only 64 is accepted (checked by the
		-- entity assertion below)
		WIDTH : natural := 64
	);
	port (
		-- shiftee
		A : in std_ulogic_vector(WIDTH-1 downto 0);
		-- shift count
		-- (Behave_1 uses B(5 downto 0) as the count for the bitwise
		-- modes and the whole of B as second data word for the
		-- bytewise modes)
		B : in std_ulogic_vector(WIDTH-1 downto 0);
		-- operating mode (mutually exclusive)
		-- Behave_1 drives 'X' on Y and Y2 unless exactly one of these
		-- ten inputs is '1' and all others are '0'.
		ShiftL : in std_ulogic;
		ShiftR : in std_ulogic;
		ShiftRA : in std_ulogic;
		RotL : in std_ulogic;
		RotR : in std_ulogic;
		Bitrev : in std_ulogic;
		Byterev : in std_ulogic;
		Sdup : in std_ulogic;
		Mix : in std_ulogic;
		Expand : in std_ulogic;
		-- SIMD mode flags
		-- (Behave_1 decodes only "000", "001", "011" and "111"; every
		-- other value is treated as don't care)
		U : in std_ulogic_vector(2 downto 0);
		-- clock/reset/enable inputs (unused)
		Clk : in std_ulogic;
		Rst : in std_ulogic;
		En : in std_ulogic;
	--
		-- output
		Y : out std_ulogic_vector(WIDTH-1 downto 0);
		-- alt. output (for double-width shifts)
		-- (also carries the second result of the bytewise modes)
		Y2 : out std_ulogic_vector(WIDTH-1 downto 0)
	);
--pragma synthesis_off
begin
	-- simulation-only sanity check: the architecture hard-codes 64-bit
	-- constants, so any other WIDTH is rejected at elaboration
	assert WIDTH = 64
		report "WIDTH must be 64"
		severity failure;
--pragma synthesis_on
end Shuffle64;

architecture Behave_1 of Shuffle64 is
	-- Single omega stage.
	-- The input bits are handled as pairs (2k, 2k+1).  Output bit k (lower
	-- half) and output bit k + A'length/2 (upper half) are fed from pair k.
	-- With B(k) = '1' the lower half receives the odd bit and the upper
	-- half the even bit; for any other value of B(k) it is the other way
	-- round.  A must be exactly twice as long as B.
	function omega_1 (A, B : in std_ulogic_vector) return std_ulogic_vector is
		constant n : natural := A'length;
		alias din : std_ulogic_vector(n-1 downto 0) is A;
		alias swap : std_ulogic_vector(B'length-1 downto 0) is B;
		variable dout : std_ulogic_vector(n-1 downto 0);
		variable straight, crossed : natural;
	begin
--pragma synthesis_off
		assert A'length = n;
		assert 2 * B'length = n;
--pragma synthesis_on
		for dst in dout'reverse_range loop
			-- "straight" is the source taken while the pair's control
			-- bit is not '1', "crossed" the one taken when it is
			if 2 * dst < n then
				straight := 2 * dst;
				crossed := straight + 1;
			else
				crossed := 2 * dst - n;
				straight := crossed + 1;
			end if;
			if to_X01(swap(straight / 2)) /= '1' then
				dout(dst) := din(straight);
			else
				dout(dst) := din(crossed);
			end if;
		end loop;
		return dout;
	end omega_1;

	-- Thermometer decoder for shift counts.
	-- Returns a vector of 2**A'length bits.  For '0'/'1' inputs, bit i of
	-- the result is '1' exactly when i is smaller than the unsigned value
	-- of A, so the topmost bit is always '0'.  Inputs of up to three bits
	-- are decoded with explicit equations; wider inputs recurse on the
	-- low bits and combine the half-size mask with the top bit.
	function shift_mask (A : in std_ulogic_vector) return std_ulogic_vector is
		constant nbits : natural := A'length;
		constant size : natural := 2 ** nbits;
		alias cnt : std_ulogic_vector(nbits-1 downto 0) is A;
		variable mask : std_ulogic_vector(size-1 downto 0);
		variable lower : std_ulogic_vector(size/2-1 downto 0);
		variable top : std_ulogic_vector(size/2-1 downto 0);
	begin
		case nbits is
			when 0 =>
				mask(0) := '0';
			when 1 =>
				mask(0) := cnt(0);
				mask(1) := '0';
			when 2 =>
				mask(0) := cnt(1) or cnt(0);
				mask(1) := cnt(1);
				mask(2) := cnt(1) and cnt(0);
				mask(3) := '0';
			when 3 =>
				mask(0) := cnt(2) or cnt(1) or cnt(0);
				mask(1) := cnt(2) or cnt(1);
				mask(2) := cnt(2) or (cnt(1) and cnt(0));
				mask(3) := cnt(2);
				mask(4) := cnt(2) and (cnt(1) or cnt(0));
				mask(5) := cnt(2) and cnt(1);
				mask(6) := cnt(2) and cnt(1) and cnt(0);
				mask(7) := '0';
			when others =>
				-- mask of the low bits, and the top count bit replicated
				-- to the same width
				lower := shift_mask(cnt(nbits-2 downto 0));
				top := (others => cnt(nbits-1));
				-- upper half is set only if the top bit is set as well;
				-- lower half is completely set once the top bit is set
				mask(size-1 downto size/2) := lower and top;
				mask(size/2-1 downto 0) := lower or top;
		end case;
		return mask;
	end shift_mask;

	-- Replicates every bit of A N times.
	-- With A viewed as (A'length-1 downto 0), result bits N*k up to
	-- N*k + N-1 are all copies of A(k); the result is N*A'length bits wide.
	function bit_expand (A : in std_ulogic_vector;
						 N : natural) return std_ulogic_vector is
		constant len : natural := A'length;
		alias src : std_ulogic_vector(len-1 downto 0) is A;
		variable wide : std_ulogic_vector(N*len-1 downto 0);
	begin
		for k in src'range loop
			for copy in 0 to N-1 loop
				wide(N*k + copy) := src(k);
			end loop;
		end loop;
		return wide;
	end bit_expand;

	-- Six-stage omega network used for SIMD rotates and bit reversal.
	--   A     : data word (must be 64 bits wide)
	--   B     : rotate count, only bits 5 downto 0 are used
	--   U     : SIMD flags; U(0), U(1) and U(2) gate the control vectors
	--           of the last three stages (stages 3, 4 and 5)
	--   Rev   : inverts the basic control bit of every stage; when '1'
	--           the per-stage control vector is also passed through
	--           bit_reverse
	--   Right : when '1' the two's complement of the count is used
	-- Returns the output of the last stage.
	function rotate (A : in std_ulogic_vector;
					 B : in std_ulogic_vector;
					 U : in std_ulogic_vector;
					 Rev, Right : in std_ulogic) return std_ulogic_vector is
		constant nbits : natural := A'length;
		alias din : std_ulogic_vector(nbits-1 downto 0) is A;
		alias count : std_ulogic_vector(B'length-1 downto 0) is B;
		alias simd : std_ulogic_vector(U'length-1 downto 0) is U;
		-- effective (possibly negated) count and its ripple chain
		variable amount : std_ulogic_vector(5 downto 0);
		variable pending : std_ulogic;
		-- control word of the current stage and its SIMD gate
		variable ctl : std_ulogic_vector(nbits/2-1 downto 0);
		variable gate : std_ulogic_vector(nbits/2-1 downto 0);
		-- data travelling through the network
		variable net : std_ulogic_vector(nbits-1 downto 0);
	begin
--pragma synthesis_off
		assert nbits = 64;
		assert B'length >= 6;
		assert U'length >= 3;
--pragma synthesis_on
		-- amount := -count when Right = '1', amount := count otherwise.
		-- "pending" stays '1' while Right is set and all lower count bits
		-- were '0', i.e. while the "+1" of the negation is still rippling.
		amount(0) := count(0);
		pending := Right and not count(0);
		for i in 1 to 5 loop
			amount(i) := (count(i) xor Right) xor pending;
			pending := pending and not count(i);
		end loop;

		-- stage 0: every switch gets the same control bit
		ctl := (others => amount(0) xor Rev);
		net := omega_1(din, ctl);

		-- stages 1 to 5: the common control bit is modified per switch by
		-- a mask decoded from the lower count bits
		for stage in 1 to 5 loop
			ctl := (others => amount(stage) xor Rev);
			ctl := ctl xor bit_expand(shift_mask(amount(stage-1 downto 0)), 32/2**stage);
			if to_X01(Rev) = '1' then
				ctl := bit_reverse(ctl);
			end if;
			if stage >= 3 then
				-- the last three stages are enabled by the SIMD flags
				gate := (others => simd(stage-3));
				ctl := ctl and gate;
			end if;
			net := omega_1(net, ctl);
		end loop;
		return net;
	end rotate;

	-- Byte-wide 16:1 multiplexer.
	-- se(3) chooses the source word ('0' selects aa, '1' selects bb) and
	-- se(2 downto 0) the byte within that word.  If any selector bit is
	-- neither '0' nor '1' the result is all 'X'.
	function ab_sel (aa : in std_ulogic_vector(WIDTH-1 downto 0);
					 bb : in std_ulogic_vector(WIDTH-1 downto 0);
					 se : in std_ulogic_vector(3 downto 0)) return std_ulogic_vector is
		variable word : std_ulogic_vector(WIDTH-1 downto 0);
		variable pick : std_ulogic_vector(7 downto 0);
	begin
		-- first level: source word
		case se(3) is
			when '0' => word := aa;
			when '1' => word := bb;
			when others => word := (others => 'X'); -- don't care
		end case;
		-- second level: byte lane
		case se(2 downto 0) is
			when "000" => pick := word( 7 downto  0);
			when "001" => pick := word(15 downto  8);
			when "010" => pick := word(23 downto 16);
			when "011" => pick := word(31 downto 24);
			when "100" => pick := word(39 downto 32);
			when "101" => pick := word(47 downto 40);
			when "110" => pick := word(55 downto 48);
			when "111" => pick := word(63 downto 56);
			when others => pick := (others => 'X'); -- don't care
		end case;
		return pick;
	end ab_sel;

	-- Byte-wide 12:1 multiplexer (for upper part of bit-shifted data).
	-- Selectors 0 to 7 return the corresponding byte of aa; selectors
	-- 9, B, D and F return a byte filled with xx(1), xx(3), xx(5) and
	-- xx(7) respectively.  Every other selector value yields all 'X'.
	-- This function is not called anywhere in this architecture.
	function hi_sel (aa : in std_ulogic_vector(WIDTH-1 downto 0);
					 xx : in std_ulogic_vector(WIDTH/8-1 downto 0);
					 se : in std_ulogic_vector(3 downto 0)) return std_ulogic_vector is
		variable pick : std_ulogic_vector(7 downto 0);
	begin
		-- don't care unless one of the twelve valid selectors matches
		pick := (others => 'X');
		case se(3) is
			when '0' =>
				-- data bytes
				case se(2 downto 0) is
					when "000" => pick := aa( 7 downto  0);
					when "001" => pick := aa(15 downto  8);
					when "010" => pick := aa(23 downto 16);
					when "011" => pick := aa(31 downto 24);
					when "100" => pick := aa(39 downto 32);
					when "101" => pick := aa(47 downto 40);
					when "110" => pick := aa(55 downto 48);
					when "111" => pick := aa(63 downto 56);
					when others => null;
				end case;
			when '1' =>
				-- fill bytes taken from the odd bits of xx
				case se(2 downto 0) is
					when "001" => pick := (others => xx(1));
					when "011" => pick := (others => xx(3));
					when "101" => pick := (others => xx(5));
					when "111" => pick := (others => xx(7));
					when others => null;
				end case;
			when others =>
				null;
		end case;
		return pick;
	end hi_sel;

	-- Byte-wide 9:1 multiplexer (for lower part of bit-shifted data).
	-- Selectors 0 to 7 return the corresponding byte of aa, selector 8
	-- returns an all-zero byte, every other selector value yields all 'X'.
	-- This function is not called anywhere in this architecture.
	function lo_sel (aa : in std_ulogic_vector(WIDTH-1 downto 0);
					 se : in std_ulogic_vector(3 downto 0)) return std_ulogic_vector is
		variable pick : std_ulogic_vector(7 downto 0);
	begin
		-- don't care unless one of the nine valid selectors matches
		pick := (others => 'X');
		case se(3) is
			when '0' =>
				-- data bytes
				case se(2 downto 0) is
					when "000" => pick := aa( 7 downto  0);
					when "001" => pick := aa(15 downto  8);
					when "010" => pick := aa(23 downto 16);
					when "011" => pick := aa(31 downto 24);
					when "100" => pick := aa(39 downto 32);
					when "101" => pick := aa(47 downto 40);
					when "110" => pick := aa(55 downto 48);
					when "111" => pick := aa(63 downto 56);
					when others => null;
				end case;
			when '1' =>
				-- only selector 8 is defined in the upper range
				if se(2 downto 0) = "000" then
					pick := (others => '0');
				end if;
			when others =>
				null;
		end case;
		return pick;
	end lo_sel;

	-- One-hot operation code: one bit per mode input of the entity.
	subtype mode_type is std_ulogic_vector(9 downto 0);

	-- Legal values of the mode vector.  The bit positions match the
	-- assignment to signal Mode in the architecture body (bit 9 = ShiftL
	-- down to bit 0 = Expand).  The constants are used as choices in the
	-- output mux's case statement.
	constant MODE_SHIFTL  : mode_type := "1000000000";
	constant MODE_SHIFTR  : mode_type := "0100000000";
	constant MODE_SHIFTRA : mode_type := "0010000000";
	constant MODE_ROTL    : mode_type := "0001000000";
	constant MODE_ROTR    : mode_type := "0000100000";
	constant MODE_BITREV  : mode_type := "0000010000";
	constant MODE_BYTEREV : mode_type := "0000001000";
	constant MODE_SDUP    : mode_type := "0000000100";
	constant MODE_MIX     : mode_type := "0000000010";
	constant MODE_EXPAND  : mode_type := "0000000001";

	-- mode inputs collected into a single vector
	signal Mode : mode_type;

	-- intermediate results: main result and second ("Ext") result of the
	-- bitwise process and of the bytewise process; the output mux forwards
	-- one pair to Y / Y2
	signal Y_Bitwise  : std_ulogic_vector(WIDTH-1 downto 0);
	signal Y_BitExt   : std_ulogic_vector(WIDTH-1 downto 0);
	signal Y_Bytewise : std_ulogic_vector(WIDTH-1 downto 0);
	signal Y_ByteExt  : std_ulogic_vector(WIDTH-1 downto 0);
begin
	-- mode vector: the ten mode inputs packed from bit 9 (ShiftL) down
	-- to bit 0 (Expand), i.e. in the order of the MODE_* constants
	Mode <= ShiftL & ShiftR & ShiftRA & RotL & RotR
		  & Bitrev & Byterev & Sdup & Mix & Expand;

	-- shift / rotate / bitrev
	-- Rotates A through the omega network and, for the shift and bitrev
	-- modes, splits the rotated word with a count-derived mask into the
	-- normal result (Y_Bitwise) and the bits that fell out (Y_BitExt).
	-- RotL is not in the sensitivity list because this process never
	-- reads it: plain rotation is the fall-through case.
	process (A, B, U, Bitrev, ShiftL, ShiftR, ShiftRA, RotR)
		variable aa : std_ulogic_vector(WIDTH-1 downto 0);
		variable bb : std_ulogic_vector(5 downto 0);
		variable ee : std_ulogic_vector(WIDTH-1 downto 0);
		variable mm : std_ulogic_vector(WIDTH-1 downto 0);
		variable uu : std_ulogic_vector(2 downto 0);
		variable xx : std_ulogic_vector(WIDTH-1 downto 0);
		variable yy : std_ulogic_vector(WIDTH-1 downto 0);
		variable zz : std_ulogic_vector(WIDTH-1 downto 0);
	begin
		-- inputs
		aa := A;
		-- only the six low bits of B act as count
		bb := B(5 downto 0);
		-- normalized copy of U, used solely as case selector below
		uu := to_X01(U);

		-- omega network (SIMD left/right rotate)
		-- all "right" modes pass Right = '1', which makes rotate() use
		-- the negated count
		xx := rotate(aa, bb, U, Bitrev, ShiftR or ShiftRA or RotR);

		-- mask for shift operations
		-- count bits 3..5 only take part when the matching U flag is set;
		-- after shift_mask, mm(i) = '1' for every i below the remaining
		-- count (for '0'/'1' inputs)
		bb(5 downto 3) := bb(5 downto 3) and U(2 downto 0);
		mm := shift_mask(bb);

		-- sign extension
		-- every bit position receives the top bit of its own byte, gated
		-- by ShiftRA (so ee is all '0' for '0'/'1' data when ShiftRA = '0')
		for i in WIDTH-1 downto 0 loop
			ee(i) := aa(8*(i/8)+7) and ShiftRA;
		end loop;

		-- SIMD mask replication / sign extension
		-- NOTE: the order of the slice copies matters; each copy doubles
		-- the region that already holds valid data.
		case uu is
			when "000" =>
				-- low-byte mask copied into all eight bytes;
				-- ee is already per byte
				mm(15 downto  8) := mm( 7 downto 0);
				mm(31 downto 16) := mm(15 downto 0);
				mm(63 downto 32) := mm(31 downto 0);
			when "001" =>
				-- low 16-bit mask copied into all four 16-bit groups;
				-- lower byte of each group takes the sign of its upper byte
				mm(31 downto 16) := mm(15 downto 0);
				mm(63 downto 32) := mm(31 downto 0);
				ee(55 downto 48) := ee(63 downto 56);
				ee(39 downto 32) := ee(47 downto 40);
				ee(23 downto 16) := ee(31 downto 24);
				ee( 7 downto  0) := ee(15 downto  8);
			when "011" =>
				-- low 32-bit mask copied into the upper half;
				-- sign of the top byte spread over each 32-bit group
				mm(63 downto 32) := mm(31 downto 0);
				ee(55 downto 48) := ee(63 downto 56);
				ee(47 downto 32) := ee(63 downto 48);
				ee(23 downto 16) := ee(31 downto 24);
				ee(15 downto  0) := ee(31 downto 16);
			when "111" =>
				-- mask already covers all 64 bits;
				-- sign of the top byte spread over the whole word
				ee(55 downto 48) := ee(63 downto 56);
				ee(47 downto 32) := ee(63 downto 48);
				ee(31 downto  0) := ee(63 downto 32);
			when others =>
				-- don't care
				mm := (others => 'X');
				ee := (others => 'X');
		end case;

		-- select operation
		if to_X01(ShiftL) = '1' then
			-- normal result
			-- (bits selected by the mask are cleared)
			yy := xx and not mm;
			-- overflow bits
			-- (exactly the bits removed from the normal result)
			zz := xx and mm;
		elsif to_X01(Bitrev or ShiftR or ShiftRA) = '1' then
			-- NOTE(review): bit_reverse comes from work.Bit_Manipulation;
			-- presumably it mirrors the mask so that its ones sit at the
			-- upper end -- confirm against that package
			mm := bit_reverse(mm);
			-- normal result
			-- (masked positions are filled from the sign extension word)
			yy := (xx and not mm) or (ee and mm);
			-- overflow bits
			zz := xx and mm;
		else -- rotate left/right
			-- normal result
			yy := xx;
			-- there are no overflow bits
			zz := (others => 'X');
		end if;

		-- output signals
		Y_Bitwise <= yy;
		Y_BitExt <= zz;
	end process;

	-- bytewise stuff (byterev, sdup, mix, expand)
	-- Every result byte is picked by ab_sel through a 4-bit selector:
	-- 0..7 address bytes 0..7 of A, 8..F address bytes 0..7 of B.  The
	-- operation and the SIMD flags choose one selector word for Y
	-- (Y_Bytewise) and one for Y2 (Y_ByteExt); nibble i of a selector
	-- word controls result byte i.
	process (A, B, U, Byterev, Sdup, Mix, Expand)
		variable op : std_ulogic_vector(6 downto 0);
		variable route_y : std_ulogic_vector(WIDTH/2-1 downto 0);
		variable route_y2 : std_ulogic_vector(WIDTH/2-1 downto 0);
		variable res_y : std_ulogic_vector(WIDTH-1 downto 0);
		variable res_y2 : std_ulogic_vector(WIDTH-1 downto 0);
	begin
		-- lookup key: operation bits followed by the SIMD flags
		op := Byterev & Sdup & Mix & Expand & U;
		op := to_X01(op);
		case op is
			-- Y = byterev(A), Y2 = byterev(B)
			when "1000000" => -- no-op
				route_y2 := X"FEDCBA98"; route_y := X"76543210";
			when "1000001" =>
				route_y2 := X"EFCDAB89"; route_y := X"67452301";
			when "1000011" =>
				route_y2 := X"CDEF89AB"; route_y := X"45670123";
			when "1000111" =>
				route_y2 := X"89ABCDEF"; route_y := X"01234567";
			-- Y = sdup(A), Y2 = sdup(B)
			when "0100000" =>
				route_y2 := X"88888888"; route_y := X"00000000";
			when "0100001" =>
				route_y2 := X"98989898"; route_y := X"10101010";
			when "0100011" =>
				route_y2 := X"BA98BA98"; route_y := X"32103210";
			when "0100111" => -- no-op
				route_y2 := X"FEDCBA98"; route_y := X"76543210";
			-- Y = mixl(A, B), Y2 = mixh(A, B)
			-- XXX: swap input registers?
			when "0010000" =>
				route_y2 := X"F7E6D5C4"; route_y := X"B3A29180";
			when "0010001" =>
				route_y2 := X"FE76DC54"; route_y := X"BA329810";
			when "0010011" =>
				route_y2 := X"FEDC7654"; route_y := X"BA983210";
			when "0010111" => -- XXX: undefined
				route_y2 := X"FEDCBA98"; route_y := X"76543210";
			-- Y = expandl(A, B), Y2 = expandh(A, B)
			-- XXX: swap input registers?
			when "0001000" =>
				route_y2 := X"F7D5B391"; route_y := X"E6C4A280";
			when "0001001" =>
				route_y2 := X"FE76BA32"; route_y := X"DC549810";
			when "0001011" =>
				route_y2 := X"FEDC7654"; route_y := X"BA983210";
			when "0001111" => -- XXX: undefined
				route_y2 := X"FEDCBA98"; route_y := X"76543210";
			-- don't care
			when others =>
				route_y2 := (others => 'X');
				route_y := (others => 'X');
		end case;

		-- one 16:1 byte mux per result byte and output
		for lane in 0 to WIDTH/8-1 loop
			res_y(8*lane+7 downto 8*lane) :=
				ab_sel(A, B, route_y(4*lane+3 downto 4*lane));
			res_y2(8*lane+7 downto 8*lane) :=
				ab_sel(A, B, route_y2(4*lane+3 downto 4*lane));
		end loop;

		-- output signals
		Y_Bytewise <= res_y;
		Y_ByteExt <= res_y2;
	end process;

	-- output mux
	-- Forwards the bytewise or the bitwise result pair to Y / Y2 depending
	-- on the (normalized) mode vector.  Any mode vector that is not one of
	-- the ten one-hot MODE_* codes produces all 'X' on both outputs.
	process (Mode, Y_Bitwise, Y_BitExt, Y_Bytewise, Y_ByteExt)
		variable op : mode_type;
	begin
		op := to_X01(Mode);
		if op = MODE_BYTEREV or op = MODE_SDUP
		   or op = MODE_MIX or op = MODE_EXPAND then
			-- bytewise operations
			Y <= Y_Bytewise;
			Y2 <= Y_ByteExt;
		elsif op = MODE_SHIFTL or op = MODE_SHIFTR or op = MODE_SHIFTRA
		   or op = MODE_ROTL or op = MODE_ROTR or op = MODE_BITREV then
			-- bitwise operations
			Y <= Y_Bitwise;
			Y2 <= Y_BitExt;
		else
			-- don't care
			Y <= (others => 'X');
			Y2 <= (others => 'X');
		end if;
	end process;
end Behave_1;

-- vi: set ts=4 sw=4 equalprg="fmt -72 -p--": please
