-- imul.vhdl - F-CPU Integer Multiplication Unit
-- Copyright (C) 2000 Michael Riepe <michael@s...>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

-- $Id: imul.vhdl,v 1.3 2000/10/13 22:10:43 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;

entity IMul is
        generic (
                -- Operand width in bits.  Architecture Arch_1 addresses
                -- individual byte lanes with literal indices up to bit 63,
                -- hence the warning:
                WIDTH : natural := 64   -- do not change!
        );
        port (
                -- normal inputs (the two multiplicands)
                A : in std_ulogic_vector(WIDTH-1 downto 0);
                B : in std_ulogic_vector(WIDTH-1 downto 0);

                -- optional adder input (multiply-and-add summand);
                -- reads as all zeros when left unconnected
                X : in std_ulogic_vector(WIDTH-1 downto 0) := (others => '0');

                -- mode switch: enables signed treatment of the operands
                -- (Arch_1 gates every per-byte "signed" flag with it)
                SignedMode : in std_ulogic;

                -- SIMD switches.  Arch_1 picks the summand layout as:
                --   U8  /= '1'      -> 8-bit mode
                --   else U16 /= '1' -> 16-bit mode
                --   else U32 /= '1' -> 32-bit mode
                --   else            -> 64-bit mode
                U8 : in std_ulogic;
                U16 : in std_ulogic;
                U32 : in std_ulogic;

                -- 8-bit SIMD result: low / high byte of each of the
                -- eight 8x8 products A(k)*B(k)
                Y8l : out std_ulogic_vector(WIDTH-1 downto 0);
                Y8h : out std_ulogic_vector(WIDTH-1 downto 0);

                -- 16-bit SIMD result: low / high half of each of the
                -- four 16x16 products
                Y16l : out std_ulogic_vector(WIDTH-1 downto 0);
                Y16h : out std_ulogic_vector(WIDTH-1 downto 0);

                -- 32-bit SIMD result: low / high half of each of the
                -- two 32x32 products
                Y32l : out std_ulogic_vector(WIDTH-1 downto 0);
                Y32h : out std_ulogic_vector(WIDTH-1 downto 0);

                -- 64-bit result (full double-width product)
                Y : out std_ulogic_vector(2*WIDTH-1 downto 0)
        );
end entity IMul;

architecture Arch_1 of IMul is
        --
        -- Structure: a ROWS x ROWS array of 8x8 multipliers (Mul8xN)
        -- produces 16-bit partial products, which are merged pairwise
        -- by three layers of Wallace instances into 16x16, 32x32 and
        -- finally 64x64 products.  The intermediate layers double as
        -- the SIMD results.
        --
        -- NOTE(review): work.Mul8xN and work.Wallace are declared
        -- elsewhere; their exact port semantics are assumed from the
        -- port names used here -- confirm against those entities.
        --

        -- number of 8-bit lanes per operand (8 for WIDTH = 64)
        constant ROWS : natural := WIDTH / 8;

        -- matrices of partial products, indexed (part of A, part of B)
        type Matrix16 is array (natural range <>, natural range <>)
                of std_ulogic_vector(15 downto 0);
        type Matrix32 is array (natural range <>, natural range <>)
                of std_ulogic_vector(31 downto 0);
        type Matrix64 is array (natural range <>, natural range <>)
                of std_ulogic_vector(63 downto 0);

        -- summand fed into each 8x8 multiplier
        signal Xtmp : Matrix16(ROWS-1 downto 0, ROWS-1 downto 0);
        -- outputs of the 8x8 multipliers
        signal Y16tmp : Matrix16(ROWS-1 downto 0, ROWS-1 downto 0);
        -- outputs of the 16-bit Wallace layer
        signal Y32tmp : Matrix32(ROWS/2-1 downto 0, ROWS/2-1 downto 0);
        -- outputs of the 32-bit Wallace layer
        signal Y64tmp : Matrix64(ROWS/4-1 downto 0, ROWS/4-1 downto 0);
        -- per-byte-lane "treat as signed" flags
        signal S : std_ulogic_vector(ROWS-1 downto 0);
begin
        --
        -- sanity check
        --
        -- Everything below addresses byte lanes, X slices and matrix
        -- elements with literal indices that are only valid for 64-bit
        -- operands.  Fail with a clear message instead of an obscure
        -- bounds error if somebody overrides the generic.
        --
        width_check : assert WIDTH = 64
                report "IMul(Arch_1): WIDTH must be 64; byte-lane indices are hard-coded"
                severity failure;

        --
        -- mode switching
        --
        -- S(k) tells the multipliers/adders whether byte k of an
        -- operand is to be treated as signed.  Byte 7 follows
        -- SignedMode directly; the other bytes are additionally
        -- masked by U32, U16 or U8 as listed below.
        --
        S <= (
                7 => SignedMode,
                3 => SignedMode and not U32,
                5 | 1 => SignedMode and not U16,
                6 | 4 | 2 | 0 => SignedMode and not U8
        );

        --
        -- multiply-and-add inputs
        --
        -- Distributes the summand X over the Xtmp entries according
        -- to the SIMD mode selected by U8/U16/U32.
        --
        process (X, U8, U16, U32)
        begin
                --
                -- This is tricky.  Only the most significant partial
                -- product may be added with a full-size summand in
                -- order to avoid overflows in the partial products.
                -- Half-size summands (high part set to zero) are ok
                -- because the result of an `8*8+8' operation will not
                -- overflow when at least one of the multiplicands
                -- is treated as an unsigned quantity (which is the
                -- case for all partial products except the most
                -- significant one):
                --
                --    s * u: y <= 127 * 255 + 255 = 128 * 255 < 32768
                --    u * u: y <= 255 * 255 + 255 = 256 * 255 < 65536
                --

                -- default: no summand anywhere; the branch taken below
                -- overrides only the entries (or low bytes) it names
                Xtmp <= (others => (others => (others => '0')));
                if U8 /= '1' then
                        -- 8-bit mode: use A0B0, A1B1 ... A7B7
                        -- (lanes 0-3 and lanes 4-7 receive the same
                        -- four 16-bit words of X)
                        -- macl
                        Xtmp(0, 0)(15 downto 0) <= X(15 downto  0);
                        Xtmp(1, 1)(15 downto 0) <= X(31 downto 16);
                        Xtmp(2, 2)(15 downto 0) <= X(47 downto 32);
                        Xtmp(3, 3)(15 downto 0) <= X(63 downto 48);
                        -- mach
                        Xtmp(4, 4)(15 downto 0) <= X(15 downto  0);
                        Xtmp(5, 5)(15 downto 0) <= X(31 downto 16);
                        Xtmp(6, 6)(15 downto 0) <= X(47 downto 32);
                        Xtmp(7, 7)(15 downto 0) <= X(63 downto 48);
                elsif U16 /= '1' then
                        -- 16-bit mode: use A0B0-A0B1-A1B1, A2B2-A2B3-A3B3, ...
                        -- macl
                        Xtmp(0, 0)( 7 downto 0) <= X( 7 downto  0);
                        Xtmp(0, 1)( 7 downto 0) <= X(15 downto  8);
                        Xtmp(1, 1)(15 downto 0) <= X(31 downto 16);
                        Xtmp(2, 2)( 7 downto 0) <= X(39 downto 32);
                        Xtmp(2, 3)( 7 downto 0) <= X(47 downto 40);
                        Xtmp(3, 3)(15 downto 0) <= X(63 downto 48);
                        -- mach
                        Xtmp(4, 4)( 7 downto 0) <= X( 7 downto  0);
                        Xtmp(4, 5)( 7 downto 0) <= X(15 downto  8);
                        Xtmp(5, 5)(15 downto 0) <= X(31 downto 16);
                        Xtmp(6, 6)( 7 downto 0) <= X(39 downto 32);
                        Xtmp(6, 7)( 7 downto 0) <= X(47 downto 40);
                        Xtmp(7, 7)(15 downto 0) <= X(63 downto 48);
                elsif U32 /= '1' then
                        -- 32-bit mode: use A0B0-A0B3-A3B3 and A4B4-A4B7-A7B7
                        -- macl
                        Xtmp(0, 0)( 7 downto 0) <= X( 7 downto  0);
                        Xtmp(0, 1)( 7 downto 0) <= X(15 downto  8);
                        Xtmp(0, 2)( 7 downto 0) <= X(23 downto 16);
                        Xtmp(0, 3)( 7 downto 0) <= X(31 downto 24);
                        Xtmp(1, 3)( 7 downto 0) <= X(39 downto 32);
                        Xtmp(2, 3)( 7 downto 0) <= X(47 downto 40);
                        Xtmp(3, 3)(15 downto 0) <= X(63 downto 48);
                        -- mach
                        Xtmp(4, 4)( 7 downto 0) <= X( 7 downto  0);
                        Xtmp(4, 5)( 7 downto 0) <= X(15 downto  8);
                        Xtmp(4, 6)( 7 downto 0) <= X(23 downto 16);
                        Xtmp(4, 7)( 7 downto 0) <= X(31 downto 24);
                        Xtmp(5, 7)( 7 downto 0) <= X(39 downto 32);
                        Xtmp(6, 7)( 7 downto 0) <= X(47 downto 40);
                        Xtmp(7, 7)(15 downto 0) <= X(63 downto 48);
                else
                        -- 64-bit mode: use A0B0-A0B7
                        -- (one byte of X per partial product of A0)
                        -- macl only
                        Xtmp(0, 0)( 7 downto 0) <= X( 7 downto  0);
                        Xtmp(0, 1)( 7 downto 0) <= X(15 downto  8);
                        Xtmp(0, 2)( 7 downto 0) <= X(23 downto 16);
                        Xtmp(0, 3)( 7 downto 0) <= X(31 downto 24);
                        Xtmp(0, 4)( 7 downto 0) <= X(39 downto 32);
                        Xtmp(0, 5)( 7 downto 0) <= X(47 downto 40);
                        Xtmp(0, 6)( 7 downto 0) <= X(55 downto 48);
                        Xtmp(0, 7)( 7 downto 0) <= X(63 downto 56);
                end if;
        end process;

        --
        -- 8x8 multipliers en masse
        --
        -- One multiplier per (byte j of A, byte i of B) pair; each
        -- gets its own summand Xtmp(j, i) and the signedness flags of
        -- the two bytes involved.
        --
        rows8 : for j in ROWS-1 downto 0 generate
                cols8 : for i in ROWS-1 downto 0 generate
                        mul8 : entity work.Mul8xN
                                generic map (BWIDTH => 8)
                                port map (
                                        A => A(8*j+7 downto 8*j),
                                        B => B(8*i+7 downto 8*i),
                                        X => Xtmp(j, i),
                                        SignedA => S(j),
                                        SignedB => S(i),
                                        Y => Y16tmp(j, i)
                                );
                end generate;
        end generate;

        --
        -- calculate 16x16-bit products
        --
        -- Each instance merges a 2x2 group of 8x8 products.  The
        -- signedness flags passed along are those of the upper byte
        -- of the B half (Signed01) and of the A half (Signed10).
        --
        rows16 : for j in ROWS/2-1 downto 0 generate
                cols16 : for i in ROWS/2-1 downto 0 generate
                        add16 : entity work.Wallace
                                generic map (WIDTH => 16)
                                port map (
                                        A0B0 => Y16tmp(2*j+0, 2*i+0),
                                        A0B1 => Y16tmp(2*j+0, 2*i+1),
                                        A1B0 => Y16tmp(2*j+1, 2*i+0),
                                        A1B1 => Y16tmp(2*j+1, 2*i+1),
                                        Signed01 => S(2*i+1),
                                        Signed10 => S(2*j+1),
                                        Y => Y32tmp(j, i)
                                );
                end generate;
        end generate;

        --
        -- calculate 32x32-bit products
        --
        -- Same scheme one level up: 2x2 groups of 16x16 products,
        -- signedness taken from the top byte (4*k+3) of each 32-bit
        -- operand part.
        --
        rows32 : for j in ROWS/4-1 downto 0 generate
                cols32 : for i in ROWS/4-1 downto 0 generate
                        add32 : entity work.Wallace
                                generic map (WIDTH => 32)
                                port map (
                                        A0B0 => Y32tmp(2*j+0, 2*i+0),
                                        A0B1 => Y32tmp(2*j+0, 2*i+1),
                                        A1B0 => Y32tmp(2*j+1, 2*i+0),
                                        A1B1 => Y32tmp(2*j+1, 2*i+1),
                                        Signed01 => S(4*i+3),
                                        Signed10 => S(4*j+3),
                                        Y => Y64tmp(j, i)
                                );
                end generate;
        end generate;

        --
        -- calculate 64x64-bit products
        --
        -- Final stage: merges the four 32x32 products into Y.
        --
        add64 : entity work.Wallace
                generic map (WIDTH => 64)
                port map (
                        A0B0 => Y64tmp(0, 0),
                        A0B1 => Y64tmp(0, 1),
                        A1B0 => Y64tmp(1, 0),
                        A1B1 => Y64tmp(1, 1),
                        Signed01 => SignedMode,
                        Signed10 => SignedMode,
                        Y => Y
                );

        --
        -- 8-bit SIMD result
        -- (low and high bytes of the diagonal 8x8 products,
        -- lane 7 in the most significant position)
        -- TODO: need reordering for macl/mach
        --
        Y8l <= Y16tmp(7, 7)(7 downto 0)
             & Y16tmp(6, 6)(7 downto 0)
             & Y16tmp(5, 5)(7 downto 0)
             & Y16tmp(4, 4)(7 downto 0)
             & Y16tmp(3, 3)(7 downto 0)
             & Y16tmp(2, 2)(7 downto 0)
             & Y16tmp(1, 1)(7 downto 0)
             & Y16tmp(0, 0)(7 downto 0);
        Y8h <= Y16tmp(7, 7)(15 downto 8)
             & Y16tmp(6, 6)(15 downto 8)
             & Y16tmp(5, 5)(15 downto 8)
             & Y16tmp(4, 4)(15 downto 8)
             & Y16tmp(3, 3)(15 downto 8)
             & Y16tmp(2, 2)(15 downto 8)
             & Y16tmp(1, 1)(15 downto 8)
             & Y16tmp(0, 0)(15 downto 8);

        --
        -- 16-bit SIMD result
        -- (low and high halves of the diagonal 16x16 products)
        -- TODO: need reordering for macl/mach
        --
        Y16l <= Y32tmp(3, 3)(15 downto 0)
              & Y32tmp(2, 2)(15 downto 0)
              & Y32tmp(1, 1)(15 downto 0)
              & Y32tmp(0, 0)(15 downto 0);
        Y16h <= Y32tmp(3, 3)(31 downto 16)
              & Y32tmp(2, 2)(31 downto 16)
              & Y32tmp(1, 1)(31 downto 16)
              & Y32tmp(0, 0)(31 downto 16);

        --
        -- 32-bit SIMD result
        -- (low and high halves of the diagonal 32x32 products)
        -- TODO: need reordering for macl/mach
        --
        Y32l <= Y64tmp(1, 1)(31 downto 0)
              & Y64tmp(0, 0)(31 downto 0);
        Y32h <= Y64tmp(1, 1)(63 downto 32)
              & Y64tmp(0, 0)(63 downto 32);

end Arch_1;
