-- intersect_main.vhd (Main triangle intersection processor)
-- 'intersect' ray-triangle intersection soft core
--
-- Copyright (C) 2007 Wenzel Jakob
--
--                 01000000000000100000000
--                 00001000011111000001000
--                 00000001100000111000000
--                 01000010001110000101001
--                 00000110010001000100010
--                 00100110010011001100000
--                 10000100011000011000100
--                 00000110001111100000000
--                 00100011100000000010000
--                 00010000111111110000000
--                 00010000000000000001000
--
-- This program is free software; you can redistribute it and/or
-- modify it under the terms of the GNU General Public License
-- as published by the Free Software Foundation; either version 2
-- of the License, or (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software Foundation
-- Inc. 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

library ieee, work;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use work.common.all;
use work.fplib_pkg.all;
use work.cache_pkg.all;
use work.intersect_pkg.all;

-- Top-level traversal unit. Requests circulate through a ring made of the
-- registers pipeline_r(1 .. 31) plus pipe_last_r; a new request is accepted
-- whenever the slot held in pipe_last_r is empty (not valid).
entity intersect_main is
	port (
		-- Internal clock input
		clk0            : in  std_ulogic;
		-- Synchronous reset input
		rst             : in  std_ulogic;
		-- Data input flag
		data_i          : in  std_ulogic;
		-- Request id input
		reqid_i         : in  reqid_t;
		-- Data acknowledge flag
		data_ack_o      : out std_ulogic;
		-- Is the unit ready for data?
		ready_o         : out std_ulogic;
		-- Ray origin input
		ro_i            : in  fpval3;
		-- Ray direction input
		rd_i            : in  fpval3;
		-- Shadow ray input flag
		shadow_i        : in  std_ulogic;
		-- Cache control output
		cache_o         : out rcache_i_t;
		-- Cache data input
		cache_i         : in  rcache_o_t;
		-- Calculation done flag
		done_o          : out std_ulogic;
		-- Distance travelled along the ray
		-- (overflow flag set if there is no intersection)
		t_o             : out fpval;
		-- U coordinate output
		u_o             : out fpval;
		-- V coordinate output
		v_o             : out fpval;
		-- Shadow ray flag output
		shadow_o        : out std_ulogic;
		-- Triangle index
		idx_o           : out triidx_t;
		-- Request id output
		reqid_o         : out reqid_t;
		-- Busy output
		busy_o          : out std_ulogic;
		-- Clear performance counter input
		p_clear_i       : in  std_ulogic;
		-- Performance counter (full)
		p_full_o        : out std_logic_vector(31 downto 0);
		-- Performance counter (empty)
		p_empty_o       : out std_logic_vector(31 downto 0);
		-- Performance counter (cache hit)
		p_hit_o         : out std_logic_vector(31 downto 0)
	);
end intersect_main;

architecture rtl of intersect_main is
	-- Number of slots in the ring (pipeline_r(1 .. DEPTH-1) plus pipe_last_r).
	-- NOTE(review): the stage indices 29/30/31 and the kd processor's
	-- PIPELINE_LENGTH below are written as literals, so this constant cannot
	-- be changed on its own.
	constant INTERSECT_PIPELINE_DEPTH     : natural := 32;
	-- Stack entries reserved per thread
	constant INTERSECT_THREAD_STACK_DEPTH : natural := 32;
	-- Size of the shared stack memory (one region per thread)
	constant INTERSECT_TOTAL_STACK_DEPTH  : natural :=
		INTERSECT_PIPELINE_DEPTH * INTERSECT_THREAD_STACK_DEPTH;

	subtype thread_id is natural range 0 to INTERSECT_PIPELINE_DEPTH-1;

	type stack_t is record
		-- Node address in memory
		addr            : memptr_t;
		-- Start of the checked interval
		mint            : fpval;
		-- End of the checked interval
		maxt            : fpval;
	end record;

	-- Flattened stack entry as stored in memory. The slices used further
	-- down are mint = (23 downto 0), maxt = (47 downto 24), addr = (70 downto 48).
	subtype stack_flat_t is std_logic_vector(2 * FP_WIDTH + 23 - 1 downto 0);
	subtype stack_addr_t is natural range 0 to INTERSECT_TOTAL_STACK_DEPTH - 1;
	subtype thrstack_addr_t is natural range 0 to INTERSECT_THREAD_STACK_DEPTH - 1;

	type intersection_t is record
		-- Thread id
		id              : thread_id;
		-- Request id
		reqid           : reqid_t;
		-- Top of the stack
		top             : stack_t;
		-- Does this stage contain a bubble or valid data?
		valid           : boolean;
		-- Does this stage contain a shadow ray?
		shadow          : boolean;
		-- Was the data fetch successful?
		fetch_ok        : boolean;
		-- Does this stage contain a kd-tree node?
		kd              : boolean;
		-- Stack pointer
		sp              : thrstack_addr_t;
		-- Ray origin
		ro              : fpval3;
		-- Ray direction
		rd              : fpval3;
		-- U coordinate
		u               : fpval;
		-- V coordinate
		v               : fpval;
		-- Was there an intersection?
		hit             : boolean;
		-- Index of the current triangle
		tri_idx         : triidx_t;
		-- Index of the best triangle
		best_idx        : triidx_t;
	end record;

	type stack_mem_t is array(0 to INTERSECT_TOTAL_STACK_DEPTH-1) of stack_flat_t;
	type pipeline_t is array(1 to INTERSECT_PIPELINE_DEPTH-1) of intersection_t;
	type matrix_t is array(0 to 11) of fpval;

	-- Intersection pipeline
	signal pipe_sched_x     : intersection_t;
	signal pipeline_r       : pipeline_t;
	signal pipe_last_r      : intersection_t;
	signal triNeedsPop_r    : std_ulogic;

	-- Stack memory
	-- Written by the stack_write process and read by the stack_read process.
	shared variable stack_r : stack_mem_t := (others => (others => '0'));
	signal stack_read_r     : stack_t;
	signal stack_write_x    : stack_flat_t := (others => '0');
	signal stack_we_x       : std_ulogic := '0';
	signal stack_wraddr_r   : stack_addr_t := 0;
	signal stack_rdaddr_r   : stack_addr_t := 0;

	-- Triangle processor input
	signal tri_matrix_x     : matrix_t;

	-- Triangle processor output
	signal tri_t_r          : fpval;
	signal tri_u_r          : fpval;
	signal tri_v_r          : fpval;
	signal tri_valid_r      : std_ulogic;
	signal tri_nextaddr_r   : memptr_t;
	signal tri_idx_r        : triidx_t;

	-- KD-tree processor input
	signal kd_node_x        : std_logic_vector(4*CACHE_WORD_SIZE - 1 downto 0);
	signal kd_isnode_r      : boolean;
	signal kd_axis_r        : natural range 0 to 2;
	signal kd_left_r        : memptr_t;
	signal kd_right_r       : memptr_t;
	signal kd_split_r       : fpval := FP_ZERO;

	-- KD-tree processor output
	signal kd_near_r, kd_far_r  : memptr_t := (others => '0');
	signal kd_mint_r, kd_midt_r : fpval := FP_ZERO;
	signal kd_maxt_r        : fpval := FP_ZERO;
	signal kd_far_valid_r   : std_ulogic := '0';

	-- Performance counters
	-- 36 bits wide; only bits 35 downto 5 are exported on the p_*_o ports.
	signal perfCtr_full_r   : std_logic_vector(35 downto 0);
	signal perfCtr_empty_r  : std_logic_vector(35 downto 0);
	signal perfCtr_hit_r    : std_logic_vector(35 downto 0);
begin
	-- * -------------------------------------------------------------------- *
	--    kd-Tree traversal processor
	-- * -------------------------------------------------------------------- *
	-- Inputs are taken from stage 2; the outputs are consumed together with
	-- stage 31 in the decision / stack write logic.
	-- NOTE(review): PIPELINE_LENGTH => 29 presumably equals that 31 - 2 stage
	-- distance -- confirm against intersect_kd.

	kd_processor : intersect_kd
		generic map (
			PIPELINE_LENGTH => 29
		)
		port map (
			clk0        => clk0,
			ro          => pipeline_r(2).ro,
			rd          => pipeline_r(2).rd,
			mint        => pipeline_r(2).top.mint,
			maxt        => pipeline_r(2).top.maxt,
			axis        => kd_axis_r,
			split       => kd_split_r,
			left        => kd_left_r,
			right       => kd_right_r,
			near_o      => kd_near_r,
			far_o       => kd_far_r,
			far_valid_o => kd_far_valid_r,
			mint_o      => kd_mint_r,
			midt_o      => kd_midt_r,
			maxt_o      => kd_maxt_r
		);

	-- * -------------------------------------------------------------------- *
	--    Triangle processor unit (insanely parallel version)
	-- * -------------------------------------------------------------------- *
	-- Ray data comes from stage 1 and the 3x4 matrix straight from the cache
	-- block; the results are consumed together with stage 31.

	tri_processor : intersect_tri_insane
		port map (
			clk   => clk0,
			ro    => pipeline_r(1).ro,
			rd    => pipeline_r(1).rd,
			mint  => pipeline_r(1).top.mint,
			maxt  => pipeline_r(1).top.maxt,
			a11   => tri_matrix_x(0),
			a12   => tri_matrix_x(1),
			a13   => tri_matrix_x(2),
			a14   => tri_matrix_x(3),
			a21   => tri_matrix_x(4),
			a22   => tri_matrix_x(5),
			a23   => tri_matrix_x(6),
			a24   => tri_matrix_x(7),
			a31   => tri_matrix_x(8),
			a32   => tri_matrix_x(9),
			a33   => tri_matrix_x(10),
			a34   => tri_matrix_x(11),
			t     => tri_t_r,
			u     => tri_u_r,
			v     => tri_v_r,
			valid => tri_valid_r
		);

	-- * -------------------------------------------------------------------- *
	--    Schedules intersection requests if there is incoming data and
	--    a pipeline bubble in the last pipeline stage. Otherwise, it
	--    continues processing that request.
	-- * -------------------------------------------------------------------- *
	-- Combinational. Fields not assigned inside the "if" (id, kd) are carried
	-- over from pipe_last_r, so a new request inherits the slot's thread id.

	schedule: process(ro_i, rd_i, data_i, pipe_last_r, shadow_i, reqid_i)
	begin
		pipe_sched_x <= pipe_last_r;

		if pipe_last_r.valid = false and data_i = '1' then
			pipe_sched_x.top.mint <= fp_pack(X"350624"); -- 0.001
			if shadow_i = '1' then
				pipe_sched_x.top.maxt <= fp_pack(X"3efae1"); -- 0.99
				pipe_sched_x.shadow <= true;
			else
				pipe_sched_x.top.maxt <= fp_pack(X"602a05"); -- 10^10
				pipe_sched_x.shadow <= false;
			end if;
			pipe_sched_x.hit <= false;
			pipe_sched_x.fetch_ok <= false;
			pipe_sched_x.tri_idx <= (others => '0');
			pipe_sched_x.best_idx <= (others => '0');
			pipe_sched_x.reqid <= reqid_i;
			pipe_sched_x.ro <= ro_i;
			pipe_sched_x.rd <= rd_i;
			-- Traversal starts at address 0 with an empty stack
			pipe_sched_x.top.addr <= (others => '0');
			pipe_sched_x.sp <= 0;
			pipe_sched_x.u <= FP_ZERO;
			pipe_sched_x.v <= FP_ZERO;
			pipe_sched_x.valid <= true;
		end if;
	end process schedule;

	ready_o <= '1' when pipe_last_r.valid = false else '0';

	-- * -------------------------------------------------------------------- *
	--    Fetch stage - Request the address in top.addr if the state does
	--    not contain a bubble
	-- * -------------------------------------------------------------------- *

	cache_o.addr <= pipe_sched_x.top.addr(MEMORY_ADDRESS_WIDTH-1 downto 0);
	cache_o.read <= '1' when pipe_sched_x.valid else '0';

	-- * -------------------------------------------------------------------- *
	--    Decode stage - Select the proper word from the cache block
	--    and decode it.
	-- * -------------------------------------------------------------------- *

	-- One concurrent driver per 4-word group of the cache block. The driver
	-- whose index matches the upper offset bits of the stage-1 address drives
	-- the data, all others drive 'Z', and the resolved std_logic_vector
	-- carries the selected group.
	-- NOTE(review): this relies on internal tri-state resolution; consider an
	-- explicit multiplexer if the target toolchain does not convert it.
	select_kdnode: for j in 0 to CACHE_WORD_COUNT/4 - 1 generate
		kd_node_x <= cache_i.block_data((j+1) * CACHE_WORD_SIZE * 4 - 1 downto j * CACHE_WORD_SIZE * 4)
			when unsigned(pipeline_r(1).top.addr(CACHE_OFFSET_SIZE - 1 downto 2)) = to_unsigned(j, CACHE_OFFSET_SIZE - 2)
			else (others => 'Z');
	end generate;

	-- Matrix entry i is taken from cache word i+1 (word 0 is skipped here;
	-- its bits 23 downto 1 are used as the next-triangle address below).
	tri_matrix: for i in 0 to 11 generate
		tri_matrix_x(i) <= fp_pack(cache_i.block_data(
			(i+2) * CACHE_WORD_SIZE - 1 downto (i+1) * CACHE_WORD_SIZE)
		);
	end generate;

	-- Registers the decoded node / triangle header fields.
	-- Purely clocked: every assignment is guarded by rising_edge(clk0), so
	-- clk0 is the only signal that belongs in the sensitivity list.
	decode: process(clk0)
	begin
		if rising_edge(clk0) then
			-- Bit 0 distinguishes kd-tree nodes ('1') from triangle data
			if kd_node_x(0) = '1' then
				kd_isnode_r <= true;
			else
				kd_isnode_r <= false;
			end if;

			-- Bits 2 downto 1 select the split axis
			case kd_node_x(2 downto 1) is
				when "00" => kd_axis_r <= 0;
				when "01" => kd_axis_r <= 1;
				when others => kd_axis_r <= 2;
			end case;

			-- Word 1: split value, word 2: left child, word 3: right child
			kd_split_r <= fp_pack(kd_node_x(2 * CACHE_WORD_SIZE - 1 downto CACHE_WORD_SIZE));
			kd_left_r <= kd_node_x(3 * CACHE_WORD_SIZE - 2 downto CACHE_WORD_SIZE * 2);
			kd_right_r <= kd_node_x(4 * CACHE_WORD_SIZE - 2 downto CACHE_WORD_SIZE * 3);

			tri_nextaddr_r <= cache_i.block_data(23 downto 1);
			tri_idx_r <= cache_i.block_data(333 downto 312);
		end if;
	end process decode;

	-- * -------------------------------------------------------------------- *
	--    Stack read stage
	-- * -------------------------------------------------------------------- *
	-- The read address is derived from stage 29 and registered; the memory is
	-- looked up with the address registered on the previous clock edge, so
	-- stack_read_r lines up with stage 31 in the decision process.

	stack_read: process(clk0) is
		variable data : stack_flat_t;
	begin
		if rising_edge(clk0) then
			-- Entry below the stack pointer; with an empty stack the base
			-- address is used so that the index stays in range.
			if pipeline_r(29).sp = 0 then
				stack_rdaddr_r <= pipeline_r(29).id * INTERSECT_THREAD_STACK_DEPTH;
			else
				stack_rdaddr_r <= pipeline_r(29).id * INTERSECT_THREAD_STACK_DEPTH
					+ pipeline_r(29).sp - 1;
			end if;

			data := stack_r(stack_rdaddr_r);
			stack_read_r.mint <= fp_pack(data(23 downto 0));
			stack_read_r.maxt <= fp_pack(data(47 downto 24));
			stack_read_r.addr <= data(70 downto 48);
		end if;
	end process stack_read;

	-- * -------------------------------------------------------------------- *
	--    Decision stage - Checks kd-tree/triangle processor results and
	--    performs stack push/pop operations
	-- * -------------------------------------------------------------------- *
	-- pipe_last_r defaults to a copy of stage 31; the conditional assignments
	-- that follow override individual fields. Their order matters, because the
	-- last assignment to a field within one clock edge wins.

	decision: process(clk0)
	begin
		if rising_edge(clk0) then
			done_o <= '0';

			if rst = '1' then
				pipe_last_r.id <= 0;
				pipe_last_r.valid <= false;
			else
				pipe_last_r <= pipeline_r(31);

				-- An all-ones address at stage 30 is registered as the
				-- "needs pop" flag evaluated against stage 31 one clock later.
				if pipeline_r(30).top.addr = ONES(22 downto 0) then
					triNeedsPop_r <= '1';
				else
					triNeedsPop_r <= '0';
				end if;

				-- kd-tree node: continue with the near child on [mint, midt]
				if pipeline_r(31).valid and pipeline_r(31).fetch_ok and pipeline_r(31).kd then
					pipe_last_r.top.mint <= kd_mint_r;
					pipe_last_r.top.maxt <= kd_midt_r;
					pipe_last_r.top.addr <= kd_near_r;
				end if;

				-- A stack entry is being written this cycle: bump the pointer
				if stack_we_x = '1' then
					pipe_last_r.sp <= pipeline_r(31).sp + 1;
				end if;

				-- Triangle data
				if pipeline_r(31).valid and pipeline_r(31).fetch_ok and not pipeline_r(31).kd then
					if triNeedsPop_r = '1' then
						if pipeline_r(31).hit or pipeline_r(31).sp = 0 then
							-- Earlier hit recorded or nothing left on the stack: finished
							done_o <= '1';
							pipe_last_r.valid <= false;
						else
							-- Resume with the entry popped from the stack
							pipe_last_r.top <= stack_read_r;
							pipe_last_r.sp <= pipeline_r(31).sp - 1;
							if tri_valid_r = '1' then
								pipe_last_r.valid <= false;
								done_o <= '1';
							end if;
						end if;
					end if;

					-- Record the intersection; maxt is replaced by the hit distance
					if tri_valid_r = '1' then
						pipe_last_r.best_idx <= pipeline_r(31).tri_idx;
						pipe_last_r.top.maxt <= tri_t_r;
						pipe_last_r.u <= tri_u_r;
						pipe_last_r.v <= tri_v_r;
						pipe_last_r.hit <= true;
						-- Shadow rays finish on the first hit
						if pipeline_r(31).shadow then
							pipe_last_r.valid <= false;
							done_o <= '1';
						end if;
					end if;
				end if;
			end if;
		end if;
	end process decision;

	-- Result outputs are taken directly from pipe_last_r
	reqid_o <= pipe_last_r.reqid;
	t_o.m <= pipe_last_r.top.maxt.m;
	u_o <= pipe_last_r.u;
	v_o <= pipe_last_r.v;
	t_o.e <= pipe_last_r.top.maxt.e;
	t_o.s <= pipe_last_r.top.maxt.s;
	-- The overflow flag of t_o signals "no intersection"
	t_o.ovf <= '0' when pipe_last_r.hit else '1';
	shadow_o <= '1' when pipe_last_r.shadow else '0';
	idx_o <= pipe_last_r.best_idx;

	-- * -------------------------------------------------------------------- *
	--    Stack write stage
	-- * -------------------------------------------------------------------- *
	-- The far child is stored together with the interval [midt, maxt] when
	-- stage 31 holds a successfully fetched kd-tree node and the kd processor
	-- flags the far child as valid.

	stack_we_x <= '1' when pipeline_r(31).valid and pipeline_r(31).fetch_ok
		and pipeline_r(31).kd and kd_far_valid_r = '1' else '0';
	stack_write_x(23 downto 0) <= fp_unpack(kd_midt_r);
	stack_write_x(47 downto 24) <= fp_unpack(kd_maxt_r);
	stack_write_x(70 downto 48) <= kd_far_r;

	stack_write: process(clk0) is
	begin
		if rising_edge(clk0) then
			-- The address is registered from stage 30, so it matches stage 31
			-- when the write enable is evaluated.
			stack_wraddr_r <= pipeline_r(30).id * INTERSECT_THREAD_STACK_DEPTH + pipeline_r(30).sp;
			if stack_we_x = '1' then
				stack_r(stack_wraddr_r) := stack_write_x;
			end if;
		end if;
	end process stack_write;

	-- * -------------------------------------------------------------------- *
	--    Update stages - Move commonly used data through the pipeline
	-- * -------------------------------------------------------------------- *
	-- The shift loop assigns every stage first; the assignments to individual
	-- fields of stages 2 and 3 that follow override the shifted values.

	update: process(clk0)
		variable busy : std_ulogic;
	begin
		if rising_edge(clk0) then
			data_ack_o <= '0';

			if rst = '1' then
				for i in 1 to INTERSECT_PIPELINE_DEPTH-1 loop
					pipeline_r(i).valid <= false;
					-- Reverse order, so that the execution order is 0,1,2...
					pipeline_r(i).id <= INTERSECT_PIPELINE_DEPTH - i;
				end loop;
			else
				-- Acknowledge incoming data
				if pipe_last_r.valid = false and data_i = '1' then
					data_ack_o <= '1';
				end if;

				-- Move data through the pipeline
				pipeline_r(1) <= pipe_sched_x;
				for i in 2 to INTERSECT_PIPELINE_DEPTH-1 loop
					pipeline_r(i) <= pipeline_r(i-1);
				end loop;

				pipeline_r(3).kd <= kd_isnode_r;

				-- Latch the cache's valid flag as the fetch result
				if cache_i.valid = '1' then
					pipeline_r(2).fetch_ok <= true;
				else
					pipeline_r(2).fetch_ok <= false;
				end if;

				-- Triangle data: continue with the decoded next address
				if pipeline_r(2).valid and pipeline_r(2).fetch_ok and not kd_isnode_r then
					pipeline_r(3).top.addr <= tri_nextaddr_r;
				end if;

				pipeline_r(3).tri_idx <= tri_idx_r;
			end if;

			-- Generate the busy signal
			busy := '0';
			for i in 1 to INTERSECT_PIPELINE_DEPTH-1 loop
				if pipeline_r(i).valid then
					busy := '1';
				end if;
			end loop;
			if pipe_last_r.valid then
				busy := '1';
			end if;
			busy_o <= busy;
		end if;
	end process update;

	-- * -------------------------------------------------------------------- *
	--    Performance counters - record how often the pipeline is full or empty
	-- * -------------------------------------------------------------------- *
	-- Sampled at stage 31 on every clock: "full" counts valid slots, "hit"
	-- counts valid slots with a successful fetch, "empty" counts bubbles.

	perf: process(clk0)
	begin
		if rising_edge(clk0) then
			if rst = '1' or p_clear_i = '1' then
				perfCtr_full_r <= (others => '0');
				perfCtr_empty_r <= (others => '0');
				perfCtr_hit_r <= (others => '0');
			else
				if pipeline_r(31).valid then
					perfCtr_full_r <= std_logic_vector(unsigned(perfCtr_full_r) + 1);
					if pipeline_r(31).fetch_ok then
						perfCtr_hit_r <= std_logic_vector(unsigned(perfCtr_hit_r) + 1);
					end if;
				else
					perfCtr_empty_r <= std_logic_vector(unsigned(perfCtr_empty_r) + 1);
				end if;
			end if;
		end if;
	end process perf;

	-- Export the upper 31 counter bits (the low 5 bits are dropped); bit 31
	-- of each output is tied to '0'.
	p_full_o(30 downto 0) <= perfCtr_full_r(35 downto 5);
	p_full_o(31) <= '0';
	p_empty_o(30 downto 0) <= perfCtr_empty_r(35 downto 5);
	p_empty_o(31) <= '0';
	p_hit_o(30 downto 0) <= perfCtr_hit_r(35 downto 5);
	p_hit_o(31) <= '0';
end rtl;