/****************************************************************************
|
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
*
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
* copy of this software and associated documentation files (the "Software"),
|
* to deal in the Software without restriction, including without limitation
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
* and/or sell copies of the Software, and to permit persons to whom the
|
* Software is furnished to do so, subject to the following conditions:
|
*
|
* The above copyright notice and this permission notice (including the next
|
* paragraph) shall be included in all copies or substantial portions of the
|
* Software.
|
*
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
* IN THE SOFTWARE.
|
*
|
* @file rasterizer.cpp
|
*
|
* @brief Implementation for the rasterizer.
|
*
|
******************************************************************************/
|
|
#include <algorithm>
#include <cstring>
#include <vector>

#include "rasterizer.h"
#include "rdtsc_core.h"
#include "backend.h"
#include "utils.h"
#include "frontend.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
|
|
extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
|
|
template <uint32_t numSamples = 1>
|
void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
|
template <typename RT>
|
void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers);
|
template <typename RT>
|
void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
|
|
#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
|
static const __m256d gMaskToVecpd[] =
|
{
|
MASKTOVEC(0, 0, 0, 0),
|
MASKTOVEC(0, 0, 0, 1),
|
MASKTOVEC(0, 0, 1, 0),
|
MASKTOVEC(0, 0, 1, 1),
|
MASKTOVEC(0, 1, 0, 0),
|
MASKTOVEC(0, 1, 0, 1),
|
MASKTOVEC(0, 1, 1, 0),
|
MASKTOVEC(0, 1, 1, 1),
|
MASKTOVEC(1, 0, 0, 0),
|
MASKTOVEC(1, 0, 0, 1),
|
MASKTOVEC(1, 0, 1, 0),
|
MASKTOVEC(1, 0, 1, 1),
|
MASKTOVEC(1, 1, 0, 0),
|
MASKTOVEC(1, 1, 0, 1),
|
MASKTOVEC(1, 1, 1, 0),
|
MASKTOVEC(1, 1, 1, 1),
|
};
|
|
// Integer 2D position (aggregate; member order x, y is relied on by brace
// initialization elsewhere in this file).
struct POS
{
    int32_t x;
    int32_t y;
};
|
|
struct EDGE
|
{
|
double a, b; // a, b edge coefficients in fix8
|
double stepQuadX; // step to adjacent horizontal quad in fix16
|
double stepQuadY; // step to adjacent vertical quad in fix16
|
double stepRasterTileX; // step to adjacent horizontal raster tile in fix16
|
double stepRasterTileY; // step to adjacent vertical raster tile in fix16
|
|
__m256d vQuadOffsets; // offsets for 4 samples of a quad
|
__m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief rasterize a raster tile partially covered by the triangle
|
/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile
|
/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C)
|
/// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
|
/// Used to step between quads when sweeping over the raster tile.
|
template<uint32_t NumEdges, typename EdgeMaskT>
|
INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
|
{
|
uint64_t coverageMask = 0;
|
|
__m256d vEdges[NumEdges];
|
__m256d vStepX[NumEdges];
|
__m256d vStepY[NumEdges];
|
|
for (uint32_t e = 0; e < NumEdges; ++e)
|
{
|
// Step to the pixel sample locations of the 1st quad
|
vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
|
|
// compute step to next quad (mul by 2 in x and y direction)
|
vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
|
vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
|
}
|
|
// fast unrolled version for 8x8 tile
|
#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
|
int edgeMask[NumEdges];
|
uint64_t mask;
|
|
auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
|
auto update_lambda = [&](int e){mask &= edgeMask[e];};
|
auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
|
auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
|
auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
|
|
// evaluate which pixels in the quad are covered
|
#define EVAL \
|
UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
|
|
// update coverage mask
|
// if edge 0 is degenerate and will be skipped; init the mask
|
#define UPDATE_MASK(bit) \
|
if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
|
mask = 0xf;\
|
}\
|
else{\
|
mask = edgeMask[0]; \
|
}\
|
UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
|
coverageMask |= (mask << bit);
|
|
// step in the +x direction to the next quad
|
#define INCX \
|
UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
|
|
// step in the +y direction to the next quad
|
#define INCY \
|
UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
|
|
// step in the -x direction to the next quad
|
#define DECX \
|
UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
|
|
// sweep 2x2 quad back and forth through the raster tile,
|
// computing coverage masks for the entire tile
|
|
// raster tile
|
// 0 1 2 3 4 5 6 7
|
// x x
|
// x x ------------------>
|
// x x |
|
// <-----------------x x V
|
// ..
|
|
// row 0
|
EVAL;
|
UPDATE_MASK(0);
|
INCX;
|
EVAL;
|
UPDATE_MASK(4);
|
INCX;
|
EVAL;
|
UPDATE_MASK(8);
|
INCX;
|
EVAL;
|
UPDATE_MASK(12);
|
INCY;
|
|
//row 1
|
EVAL;
|
UPDATE_MASK(28);
|
DECX;
|
EVAL;
|
UPDATE_MASK(24);
|
DECX;
|
EVAL;
|
UPDATE_MASK(20);
|
DECX;
|
EVAL;
|
UPDATE_MASK(16);
|
INCY;
|
|
// row 2
|
EVAL;
|
UPDATE_MASK(32);
|
INCX;
|
EVAL;
|
UPDATE_MASK(36);
|
INCX;
|
EVAL;
|
UPDATE_MASK(40);
|
INCX;
|
EVAL;
|
UPDATE_MASK(44);
|
INCY;
|
|
// row 3
|
EVAL;
|
UPDATE_MASK(60);
|
DECX;
|
EVAL;
|
UPDATE_MASK(56);
|
DECX;
|
EVAL;
|
UPDATE_MASK(52);
|
DECX;
|
EVAL;
|
UPDATE_MASK(48);
|
#else
|
uint32_t bit = 0;
|
for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
|
{
|
__m256d vStartOfRowEdge[NumEdges];
|
for (uint32_t e = 0; e < NumEdges; ++e)
|
{
|
vStartOfRowEdge[e] = vEdges[e];
|
}
|
|
for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
|
{
|
int edgeMask[NumEdges];
|
for (uint32_t e = 0; e < NumEdges; ++e)
|
{
|
edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
|
}
|
|
uint64_t mask = edgeMask[0];
|
for (uint32_t e = 1; e < NumEdges; ++e)
|
{
|
mask &= edgeMask[e];
|
}
|
coverageMask |= (mask << bit);
|
|
// step to the next pixel in the x
|
for (uint32_t e = 0; e < NumEdges; ++e)
|
{
|
vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
|
}
|
bit+=4;
|
}
|
|
// step to the next row
|
for (uint32_t e = 0; e < NumEdges; ++e)
|
{
|
vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
|
}
|
}
|
#endif
|
return coverageMask;
|
|
}
|
// Top left rule:
|
// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
|
// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
|
// Top left: a sample is in if it is a top or left edge.
|
// Out: !(horizontal && above) = !horizontal && below
|
// Out: !horizontal && left = !(!horizontal && left) = horizontal and right
|
INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge)
|
{
|
// if vA < 0, vC--
|
// if vA == 0 && vB < 0, vC--
|
|
__m256d vEdgeOut = vEdge;
|
__m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
|
|
// if vA < 0 (line is not horizontal and below)
|
int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
|
|
// if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
|
__m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
|
int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
|
msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
|
|
// if either of these are true and we're on the line (edge == 0), bump it outside the line
|
vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
|
}
|
|
//////////////////////////////////////////////////////////////////////////
/// @brief Number of fractional bits by which the result of the manh
///        calculation exceeds the edge precision, derived from compile
///        time trait values.
/// @tparam RT: rasterizer traits; must provide PrecisionT, ConservativePrecisionT
///         and EdgePrecisionT, each with a BitsT::value.
/// @return (PrecisionT bits + ConservativePrecisionT bits) - EdgePrecisionT bits.
///         Compilation fails if that difference would be negative.
template <typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
                  "Inadequate precision of result of manh calculation ");

    return RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value
           - RT::EdgePrecisionT::BitsT::value;
}
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @struct adjustEdgeConservative
|
/// @brief Primary template definition used for partially specializing
|
/// the adjustEdgeConservative function. This struct should never
|
/// be instantiated.
|
/// @tparam RT: rasterizer traits
|
/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
|
template <typename RT, typename ConservativeEdgeOffsetT>
|
struct adjustEdgeConservative
|
{
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Performs calculations to adjust each edge of a triangle away
|
/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
|
/// direction.
|
///
|
/// Uncertainty regions arise from fixed point rounding, which
|
/// can snap a vertex +/- by min fixed point value.
|
/// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
|
/// This allows the rasterizer to test for coverage only at the pixel center,
|
/// instead of having to test individual pixel corners for conservative coverage
|
INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
|
{
|
// Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
|
// from the pixel center (in the direction of the edge normal A/B)
|
|
// edge = Ax + Bx + C - (manh/e)
|
// manh = manhattan distance = abs(A) + abs(B)
|
// e = absolute rounding error from snapping from float to fixed point precision
|
|
// 'fixed point' multiply (in double to be avx1 friendly)
|
// need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
|
__m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
|
__m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
|
_mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
|
|
static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
|
"Inadequate precision of result of manh calculation ");
|
|
// rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
|
// since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
|
manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
|
|
// move the edge away from the pixel center by the required conservative precision + 1/2 pixel
|
// this allows the rasterizer to do a single conservative coverage test to see if the primitive
|
// intersects the pixel at all
|
vEdge = _mm256_sub_pd(vEdge, manh);
|
};
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief adjustEdgeConservative specialization where no edge offset is needed
|
template <typename RT>
|
struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
|
{
|
INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief calculates the distance a degenerate BBox needs to be adjusted
|
/// for conservative rast based on compile time trait values
|
template<typename RT>
|
constexpr int64_t ConservativeScissorOffset()
|
{
|
static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
|
// if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
|
typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
|
// 1/2 pixel edge offset + conservative offset - degenerateTriangle
|
return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
|
}
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Performs calculations to adjust each a vector of evaluated edges out
|
/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
|
/// direction.
|
template <typename RT>
|
INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
|
{
|
int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
|
int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
|
vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Performs calculations to adjust each a scalar evaluated edge out
|
/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
|
/// direction.
|
template <typename RT, typename OffsetT>
|
INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
|
{
|
int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
|
int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
|
return (Edge - manh);
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Perform any needed adjustments to evaluated triangle edges
|
template <typename RT, typename EdgeOffsetT>
|
struct adjustEdgesFix16
|
{
|
INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
|
{
|
static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
|
"Edge equation expected to be in x.16 fixed point");
|
|
static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
|
|
// need to apply any edge offsets before applying the top-left rule
|
adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
|
|
adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
|
}
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Perform top left adjustments to evaluated triangle edges
|
template <typename RT>
|
struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
|
{
|
INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
|
{
|
adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
|
}
|
};
|
|
// max(abs(dz/dx), abs(dz,dy)
|
INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
|
{
|
/*
|
// evaluate i,j at (0,0)
|
float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
|
float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
|
|
// evaluate i,j at (1,0)
|
float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
|
float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
|
|
// compute dz/dx
|
float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
|
float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
|
float dzdx = abs(d10 - d00);
|
|
// evaluate i,j at (0,1)
|
float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
|
float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
|
|
float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
|
float dzdy = abs(d01 - d00);
|
*/
|
|
// optimized version of above
|
float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
|
float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
|
|
return std::max(dzdx, dzdy);
|
}
|
|
//////////////////////////////////////////////////////////////////////////
/// @brief Returns the factor the constant depth bias is multiplied by,
/// chosen by depth buffer format.
/// @param pState - raster state; depthFormat is read
/// @param pDesc  - triangle descriptor (not referenced in this function body)
/// @param z      - the triangle's 3 depth values; only read for R32_FLOAT
/// @return 2^-24 for R24_UNORM_X8_TYPELESS, 2^-16 for R16_UNORM, otherwise
///         (format asserted to be R32_FLOAT) 2^(exponent of max(|z|)) * 2^-23.
INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
{
    if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
    {
        return (1.0f / (1 << 24));
    }
    else if (pState->depthFormat == R16_UNORM)
    {
        return (1.0f / (1 << 16));
    }
    else
    {
        SWR_ASSERT(pState->depthFormat == R32_FLOAT);

        // for f32 depth, factor = 2^(exponent(max(abs(z))) - 23)
        float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));

        // Keep only the exponent field of zMax. The bits are moved with memcpy
        // rather than through a casted pointer, which would violate the strict
        // aliasing rules.
        static_assert(sizeof(uint32_t) == sizeof(float), "float is expected to be 32 bits wide");
        uint32_t zMaxInt;
        std::memcpy(&zMaxInt, &zMax, sizeof(zMaxInt));
        zMaxInt &= 0x7f800000;
        std::memcpy(&zMax, &zMaxInt, sizeof(zMax));

        return zMax * (1.0f / (1 << 23));
    }
}
|
|
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the depth bias for a triangle from the raster state.
/// @param pState - raster state; depthBias, slopeScaledDepthBias,
///                 depthBiasPreAdjusted and depthBiasClamp are read
/// @param pTri   - triangle descriptor, forwarded to the slope / factor helpers
/// @param z      - the triangle's depth values, forwarded to ComputeBiasFactor
/// @return 0 when both bias terms are zero; otherwise
///         depthBias (times ComputeBiasFactor unless pre-adjusted)
///         + slopeScaledDepthBias * max depth slope, limited by depthBiasClamp:
///         an upper bound when the clamp is positive, a lower bound when it
///         is negative, no clamping when it is zero.
INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
{
    // nothing to do when neither bias term is set
    const bool noConstantBias = (pState->depthBias == 0);
    const bool noSlopeBias    = (pState->slopeScaledDepthBias == 0);
    if (noConstantBias && noSlopeBias)
    {
        return 0.0f;
    }

    // slope-scaled term; the slope is only computed when it can contribute
    float slopeTerm = pState->slopeScaledDepthBias;
    if (slopeTerm != 0.0f)
    {
        slopeTerm *= ComputeMaxDepthSlope(pTri);
    }

    // constant term, scaled by the format dependent factor unless already adjusted
    float result = pState->depthBias;
    if (!pState->depthBiasPreAdjusted)
    {
        result *= ComputeBiasFactor(pState, pTri, z);
    }
    result += slopeTerm;

    // the sign of the clamp selects which side is limited
    if (pState->depthBiasClamp > 0.0f)
    {
        result = std::min(result, pState->depthBiasClamp);
    }
    else if (pState->depthBiasClamp < 0.0f)
    {
        result = std::max(result, pState->depthBiasClamp);
    }

    return result;
}
|
|
// Prevent DCE by writing coverage mask from rasterizer to volatile
|
#if KNOB_ENABLE_TOSS_POINTS
|
__declspec(thread) volatile uint64_t gToss;
|
#endif
|
|
static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
|
// try to avoid _chkstk insertions; make this thread local
|
static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
|
|
//////////////////////////////////////////////////////////////////////////
/// @brief Fills an EDGE from its a/b coefficients: stores the coefficients,
/// the scalar steps to the adjacent quad / raster tile, and the vector
/// offsets to the 4 samples of a quad and the 4 corners of a raster tile.
/// @param a, b - edge coefficients
/// @param edge - output; every member is written
INLINE
void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
{
    edge.a = a;
    edge.b = b;

    // products are formed in 64-bit integers before converting to double
    const int64_t a64 = a;
    const int64_t b64 = b;

    // constant steps to adjacent quads (a quad is 2 pixels wide/tall)
    edge.stepQuadX = static_cast<double>(a64 * static_cast<int64_t>(2 * FIXED_POINT_SCALE));
    edge.stepQuadY = static_cast<double>(b64 * static_cast<int64_t>(2 * FIXED_POINT_SCALE));

    // constant steps to adjacent raster tiles
    edge.stepRasterTileX = static_cast<double>(a64 * static_cast<int64_t>(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
    edge.stepRasterTileY = static_cast<double>(b64 * static_cast<int64_t>(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));

    const __m256d vCoefA = _mm256_set1_pd(edge.a);
    const __m256d vCoefB = _mm256_set1_pd(edge.b);

    // quad offsets; _mm256_set_pd takes lanes high-to-low, so lanes 0..3 are
    // the (x,y) pixel offsets (0,0), (1,0), (0,1), (1,1) times FIXED_POINT_SCALE
    const __m256d vQuadX = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
    const __m256d vQuadY = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
    edge.vQuadOffsets = _mm256_add_pd(_mm256_mul_pd(vCoefA, vQuadX),
                                      _mm256_mul_pd(vCoefB, vQuadY));

    // raster tile offsets; same lane layout, reaching out to the last pixel
    // of the tile in each direction
    const __m256d vTileX = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
    const __m256d vTileY = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
    edge.vRasterTileOffsets = _mm256_add_pd(_mm256_mul_pd(vCoefA, vTileX),
                                            _mm256_mul_pd(vCoefB, vTileY));
}
|
|
//////////////////////////////////////////////////////////////////////////
/// @brief Fills an EDGE for the directed segment p0 -> p1, using
/// a = p0.y - p1.y and b = p1.x - p0.x.
INLINE
void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
{
    const int32_t coefA = p0.y - p1.y;
    const int32_t coefB = p1.x - p0.x;
    ComputeEdgeData(coefA, coefB, edge);
}
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Primary template definition used for partially specializing
|
/// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
|
/// corner to sample position, and test for coverage
|
/// @tparam sampleCount: multisample count
|
template <typename NumSamplesT>
|
INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
|
int32_t &mask0, int32_t &mask1, int32_t &mask2)
|
{
|
__m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
|
// evaluate edge equations at the tile multisample bounding box
|
vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
|
vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
|
vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
|
mask0 = _mm256_movemask_pd(vSampleBboxTest0);
|
mask1 = _mm256_movemask_pd(vSampleBboxTest1);
|
mask2 = _mm256_movemask_pd(vSampleBboxTest2);
|
}
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
|
/// when only rasterizing a single coverage test point
|
template <>
|
INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
|
int32_t &mask0, int32_t &mask1, int32_t &mask2)
|
{
|
mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
|
mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
|
mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
|
}
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @struct ComputeScissorEdges
|
/// @brief Primary template definition. Allows the function to be generically
|
/// called. When paired with below specializations, will result in an empty
|
/// inlined function if scissor is not enabled
|
/// @tparam RasterScissorEdgesT: is scissor enabled?
|
/// @tparam IsConservativeT: is conservative rast enabled?
|
/// @tparam RT: rasterizer traits
|
template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
|
struct ComputeScissorEdges
|
{
|
INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
|
EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
|
/// specialization. Instantiated when conservative rast and scissor are enabled
|
template <typename RT>
|
struct ComputeScissorEdges<std::true_type, std::true_type, RT>
|
{
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
|
/// evaluate edge equations and offset them away from pixel center.
|
INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
|
EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
|
{
|
// if conservative rasterizing, triangle bbox intersected with scissor bbox is used
|
SWR_RECT scissor;
|
scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
|
scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
|
scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
|
scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
|
|
POS topLeft{scissor.xmin, scissor.ymin};
|
POS bottomLeft{scissor.xmin, scissor.ymax};
|
POS topRight{scissor.xmax, scissor.ymin};
|
POS bottomRight{scissor.xmax, scissor.ymax};
|
|
// construct 4 scissor edges in ccw direction
|
ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
|
ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
|
ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
|
ComputeEdgeData(topRight, topLeft, rastEdges[6]);
|
|
vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
|
vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
|
vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
|
vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
|
|
// if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
|
adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
|
adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
|
adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
|
adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
|
|
// Upper left rule for scissor
|
vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
|
vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
|
}
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
|
/// specialization. Instantiated when scissor is enabled and conservative rast
|
/// is disabled.
|
template <typename RT>
|
struct ComputeScissorEdges<std::true_type, std::false_type, RT>
|
{
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Compute scissor edge vectors and evaluate edge equations
|
INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
|
EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
|
{
|
const SWR_RECT &scissor = scissorBBox;
|
POS topLeft{scissor.xmin, scissor.ymin};
|
POS bottomLeft{scissor.xmin, scissor.ymax};
|
POS topRight{scissor.xmax, scissor.ymin};
|
POS bottomRight{scissor.xmax, scissor.ymax};
|
|
// construct 4 scissor edges in ccw direction
|
ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
|
ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
|
ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
|
ComputeEdgeData(topRight, topLeft, rastEdges[6]);
|
|
vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
|
vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
|
vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
|
vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
|
|
// Upper left rule for scissor
|
vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
|
vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
|
}
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Primary function template for TrivialRejectTest. Should
|
/// never be called, but TemplateUnroller instantiates a few unused values,
|
/// so it calls a runtime assert instead of a static_assert.
|
template <typename ValidEdgeMaskT>
|
INLINE bool TrivialRejectTest(const int, const int, const int)
|
{
|
SWR_INVALID("Primary templated function should never be called");
|
return false;
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
|
/// and edge 1 for trivial coverage reject
|
template <>
|
INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
|
{
|
return (!(mask0 && mask1)) ? true : false;
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
|
/// and edge 2 for trivial coverage reject
|
template <>
|
INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
|
{
|
return (!(mask0 && mask2)) ? true : false;
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
|
/// and edge 2 for trivial coverage reject
|
template <>
|
INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
|
{
|
return (!(mask1 && mask2)) ? true : false;
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
|
/// primitive edges for trivial coverage reject
|
template <>
|
INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
|
{
|
return (!(mask0 && mask1 && mask2)) ? true : false;;
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
|
/// point, so return false and rasterize against conservative BBox
|
template <>
|
INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
|
{
|
return false;
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Primary function template for TrivialAcceptTest. Always returns
|
/// false, since it will only be called for degenerate tris, and as such
|
/// will never cover the entire raster tile
|
template <typename ScissorEnableT>
|
INLINE bool TrivialAcceptTest(const int, const int, const int)
|
{
|
return false;
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
|
/// edge masks for a fully covered raster tile
|
template <>
|
INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
|
{
|
return ((mask0 & mask1 & mask2) == 0xf);
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Primary function template for GenerateSVInnerCoverage. Results
|
/// in an empty function call if SVInnerCoverage isn't requested
|
template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
|
struct GenerateSVInnerCoverage
|
{
|
INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t &){};
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Specialization of GenerateSVInnerCoverage where all edges
|
/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
|
/// edge values from OuterConservative to InnerConservative and rasterizes.
|
template <typename RT>
|
struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
|
{
|
INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges, uint64_t &innerCoverageMask)
|
{
|
SWR_CONTEXT *pContext = pDC->pContext;
|
|
double startQuadEdgesAdj[RT::NumEdgesT::value];
|
for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
|
{
|
startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
|
}
|
|
// not trivial accept or reject, must rasterize full tile
|
AR_BEGIN(BERasterizePartial, pDC->drawId);
|
innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
|
AR_END(BERasterizePartial, 0);
|
}
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
|
/// in an empty function call if SVInnerCoverage isn't requested
|
template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
|
struct UpdateEdgeMasksInnerConservative
|
{
|
INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
|
const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
|
/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
|
/// evaluated at raster tile corners to inner conservative position and
|
/// updates edge masks
|
template <typename RT>
|
struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
|
{
|
INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
|
const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
|
{
|
__m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
|
|
// instead of keeping 2 copies of evaluated edges around, just compensate for the outer
|
// conservative evaluated edge when adjusting the edge in for inner conservative tests
|
adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
|
adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
|
adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
|
|
UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
|
}
|
};
|
|
//////////////////////////////////////////////////////////////////////////
|
/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
|
/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
|
/// cover an entire raster tile, set mask0 to 0 to force it down the
|
/// rastierizePartialTile path
|
template <typename RT, typename ValidEdgeMaskT>
|
struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
|
{
|
INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
|
const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
|
{
|
// set one mask to zero to force the triangle down the rastierizePartialTile path
|
mask0 = 0;
|
}
|
};
|
|
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes one triangle against one macrotile. Performs triangle
/// setup (fixed point conversion, edge equation coefficients, barycentric,
/// 1/w and Z terms, 1/w-scaled attributes), intersects the fixed point
/// bounding box with the scissor and the macrotile, then walks every raster
/// tile in that intersection, generating per-sample coverage masks and
/// calling the pixel backend for tiles that have any covered samples.
/// @tparam RT - rasterizer traits; this function reads PrecisionT,
///              ValidEdgeMaskT, NumEdgesT, NumCoverageSamplesT,
///              ConservativeEdgeOffsetT, RasterizeScissorEdgesT,
///              IsConservativeT, InputCoverageT and MT::numSamples from it
/// @param pDC - draw context (source of API state, backend funcs and drawId)
/// @param workerId - forwarded to the backend and to GenerateSVInnerCoverage
/// @param macroTile - macrotile id, decoded with MacroTileMgr::getTileIndices
/// @param pDesc - reinterpreted as a TRIANGLE_WORK_DESC
template <typename RT>
|
void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
|
{
|
// NOTE(review): pContext has no other explicit use in this function; presumably the
// AR_*/RDTSC_* macros reference it by name - confirm before removing
SWR_CONTEXT *pContext = pDC->pContext;
|
const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
|
#if KNOB_ENABLE_TOSS_POINTS
|
// toss point: drop the triangle before any setup or profiling events are started
if (KNOB_TOSS_BIN_TRIS)
|
{
|
return;
|
}
|
#endif
|
AR_BEGIN(BERasterizeTriangle, pDC->drawId);
|
AR_BEGIN(BETriangleSetup, pDC->drawId);
|

const API_STATE &state = GetApiState(pDC);
|
const SWR_RASTSTATE &rastState = state.rastState;
|
const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
|

OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
|
triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
|

__m128 vX, vY, vZ, vRecipW;
|

// pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
|
// eg: vX = [x0 x1 x2 dc]
|
vX = _mm_load_ps(workDesc.pTriBuffer);
|
vY = _mm_load_ps(workDesc.pTriBuffer + 4);
|
vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
|
vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
|

// convert to fixed point
|
static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
|
__m128i vXi = fpToFixedPoint(vX);
|
__m128i vYi = fpToFixedPoint(vY);
|

// quantize floating point position to fixed point precision
|
// to prevent attribute creep around the triangle vertices
|
vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
|
vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
|

// triangle setup - A and B edge equation coefs
|
// (computed twice: float from the quantized positions, integer from the fixed point positions)
__m128 vA, vB;
|
triangleSetupAB(vX, vY, vA, vB);
|

__m128i vAi, vBi;
|
triangleSetupABInt(vXi, vYi, vAi, vBi);
|

// determinant
|
float det = calcDeterminantInt(vAi, vBi);
|

// Verts in Pixel Coordinate Space at this point
|
// Det > 0 = CW winding order
|
// Convert CW triangles to CCW
|
// (negates both the float and integer A/B coefs and det, so det is <= 0 afterwards)
if (det > 0.0)
|
{
|
vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
|
vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
|
vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
|
vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
|
det = -det;
|
}
|

__m128 vC;
|
// Finish triangle setup - C edge coef
|
triangleSetupC(vX, vY, vA, vB, vC);
|

if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
|
{
|
// If we have degenerate edge(s) to rasterize, set I and J coefs
|
// to 0 for constant interpolation of attributes
|
triDesc.I[0] = 0.0f;
|
triDesc.I[1] = 0.0f;
|
triDesc.I[2] = 0.0f;
|
triDesc.J[0] = 0.0f;
|
triDesc.J[1] = 0.0f;
|
triDesc.J[2] = 0.0f;
|

// Degenerate triangles have no area
|
triDesc.recipDet = 0.0f;
|
}
|
else
|
{
|
// only extract coefs for 2 of the barycentrics; the 3rd can be
|
// determined from the barycentric equation:
|
// i + j + k = 1 <=> k = 1 - j - i
|
// I takes lane 1 and J takes lane 2 of each of the A, B and C vectors
_MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
|
_MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
|
_MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
|
_MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
|
_MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
|
_MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
|

// compute recipDet, used to calculate barycentric i and j in the backend
|
triDesc.recipDet = 1.0f/det;
|
}
|

// 1/w is stored relative to vertex 2: (w0 - w2, w1 - w2, w2); Z below uses the same layout
OSALIGNSIMD(float) oneOverW[4];
|
_mm_store_ps(oneOverW, vRecipW);
|
triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
|
triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
|
triDesc.OneOverW[2] = oneOverW[2];
|

// calculate perspective correct coefs per vertex attrib
|
float* pPerspAttribs = perspAttribsTLS;
|
float* pAttribs = workDesc.pAttribs;
|
triDesc.pPerspAttribs = pPerspAttribs;
|
triDesc.pAttribs = pAttribs;
|
float *pRecipW = workDesc.pTriBuffer + 12;
|
triDesc.pRecipW = pRecipW;
|
// note: pRecipW is advanced in place by the next two broadcasts; triDesc.pRecipW was
// captured above and still points at the first 1/w entry
__m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
|
__m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
|
__m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
|
// each attribute consumes 12 input floats (three 4-float vectors, one per 1/w broadcast)
// and produces 12 output floats: every vector scaled by the matching vertex's 1/w
for(uint32_t i = 0; i < workDesc.numAttribs; i++)
|
{
|
__m128 attribA = _mm_load_ps(pAttribs);
|
__m128 attribB = _mm_load_ps(pAttribs+=4);
|
__m128 attribC = _mm_load_ps(pAttribs+=4);
|
pAttribs+=4;
|

attribA = _mm_mul_ps(attribA, vOneOverWV0);
|
attribB = _mm_mul_ps(attribB, vOneOverWV1);
|
attribC = _mm_mul_ps(attribC, vOneOverWV2);
|

_mm_store_ps(pPerspAttribs, attribA);
|
_mm_store_ps(pPerspAttribs+=4, attribB);
|
_mm_store_ps(pPerspAttribs+=4, attribC);
|
pPerspAttribs+=4;
|
}
|

// compute bary Z
|
// zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
|
// NOTE(review): the code below stores the differences relative to vertex 2
// (z0 - z2, z1 - z2, z2), which does not literally match the formula above - confirm
OSALIGNSIMD(float) a[4];
|
_mm_store_ps(a, vZ);
|
triDesc.Z[0] = a[0] - a[2];
|
triDesc.Z[1] = a[1] - a[2];
|
triDesc.Z[2] = a[2];
|

// add depth bias
|
// (pTriBuffer + 8 is the Z component group loaded into vZ above)
triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
|

// Calc bounding box of triangle
|
OSALIGNSIMD(SWR_RECT) bbox;
|
calcBoundingBoxInt(vXi, vYi, bbox);
|

const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
|

if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
|
{
|
// If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
|
bbox.xmin--; bbox.xmax++; bbox.ymin--; bbox.ymax++;
|
SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
|
"Conservative rast degenerate handling requires a valid scissor rect");
|
}
|

// Intersect with scissor/viewport
|
// NOTE(review): the "- 1" on the bbox max suggests calcBoundingBoxInt yields an exclusive
// max while intersect is inclusive - confirm against calcBoundingBoxInt
OSALIGNSIMD(SWR_RECT) intersect;
|
intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
|
intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
|
intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
|
intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
|

triDesc.triFlags = workDesc.triFlags;
|

// further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
|
// (macroBox* are inclusive extents of this macrotile, in the same fixed point units)
uint32_t macroX, macroY;
|
MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
|
int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
|
int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
|
int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
|
int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
|

intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
|
intersect.ymin = std::max(intersect.ymin, macroBoxTop);
|
intersect.xmax = std::min(intersect.xmax, macroBoxRight);
|
intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
|

SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
|

AR_END(BETriangleSetup, 0);
|

// update triangle desc
|
// convert the inclusive fixed point intersection into inclusive raster tile indices
uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
|
uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
|
uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
|
uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
|
uint32_t numTilesX = maxTileX - minTileX + 1;
|
uint32_t numTilesY = maxTileY - minTileY + 1;
|

// NOTE(review): the counts are unsigned, so they are 0 only when max == min - 1;
// any other inverted range wraps to a large non-zero count and is not caught here
if (numTilesX == 0 || numTilesY == 0)
|
{
|
RDTSC_EVENT(BEEmptyTriangle, 1, 0);
|
AR_END(BERasterizeTriangle, 1);
|
return;
|
}
|

AR_BEGIN(BEStepSetup, pDC->drawId);
|

// Step to pixel center of top-left pixel of the triangle bbox
|
// Align intersect bbox (top/left) to raster tile's (top/left).
|
int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
|
int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
|

// convenience typedef
|
typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
|

// single sample rasterization evaluates edges at pixel center,
|
// multisample evaluates edges UL pixel corner and steps to each sample position
|
if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
|
{
|
// Add 0.5, in fixed point, to offset to pixel center
|
x += (FIXED_POINT_SCALE / 2);
|
y += (FIXED_POINT_SCALE / 2);
|
}
|

__m128i vTopLeftX = _mm_set1_epi32(x);
|
__m128i vTopLeftY = _mm_set1_epi32(y);
|

// evaluate edge equations at top-left pixel using 64bit math
|
//
|
// line = Ax + By + C
|
// solving for C:
|
// C = -Ax - By
|
// we know x0 and y0 are on the line; plug them in:
|
// C = -Ax0 - By0
|
// plug C back into line equation:
|
// line = Ax - By - Ax0 - By0
|
// line = A(x - x0) + B(y - y0)
|
// dX = (x-x0), dY = (y-y0)
|
// so all this simplifies to
|
// edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
|

__m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
|
__m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
|

// evaluate A(dx) and B(dY) for all points
|
// (the 32-bit integers are widened to double before multiplying)
__m256d vAipd = _mm256_cvtepi32_pd(vAi);
|
__m256d vBipd = _mm256_cvtepi32_pd(vBi);
|
__m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
|
__m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
|

__m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
|
__m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
|
__m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
|

// apply any edge adjustments(top-left, crast, etc)
|
adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
|

// broadcast respective edge results to all lanes
|
// NOTE(review): the array holds 7 entries but only 0..2 are written here; the rest are
// presumably filled by ComputeScissorEdges when scissor edges are rasterized - confirm
double* pEdge = (double*)&vEdge;
|
__m256d vEdgeFix16[7];
|
vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
|
vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
|
vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
|

OSALIGNSIMD(int32_t) aAi[4], aBi[4];
|
_mm_store_si128((__m128i*)aAi, vAi);
|
_mm_store_si128((__m128i*)aBi, vBi);
|
EDGE rastEdges[RT::NumEdgesT::value];
|

// Compute and store triangle edge data
|
ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
|
ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
|
ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
|

// Compute and store triangle edge data if scissor needs to rasterized
|
ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
|
(bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
|

// Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
|
// used to for testing if entire raster tile is inside a triangle
|
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
|
{
|
vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
|
}
|

// at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
|
// step sample positions to the raster tile bbox of multisample points
|
// min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples)
|
// | |
|
// | |
|
// min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
|
// note: vEdgeTileBbox is only written when NumCoverageSamplesT::value > 1
__m256d vEdgeTileBbox[3];
|
if (NumCoverageSamplesT::value > 1)
|
{
|
const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
|
const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
|
const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
|

__m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
|
__m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
|

// step edge equation tests from Tile
|
// used to for testing if entire raster tile is inside a triangle
|
for (uint32_t e = 0; e < 3; ++e)
|
{
|
__m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
|
__m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
|
vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
|

// adjust for msaa tile bbox edges outward for conservative rast, if enabled
|
adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
|
}
|
}
|

AR_END(BEStepSetup, 0);
|

uint32_t tY = minTileY;
|
uint32_t tX = minTileX;
|
uint32_t maxY = maxTileY;
|
uint32_t maxX = maxTileX;
|

// renderBuffers is stepped tile by tile; currentRenderBufferRow tracks the start of the
// current tile row and is what StepRasterTileY advances
RenderOutputBuffers renderBuffers, currentRenderBufferRow;
|
GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
|
currentRenderBufferRow = renderBuffers;
|

// rasterize and generate coverage masks per sample
|
for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
|
{
|
// save the edge values at the start of this row; the Y step at the bottom of the
// loop restarts from these rather than from the X-stepped values
__m256d vStartOfRowEdge[RT::NumEdgesT::value];
|
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
|
{
|
vStartOfRowEdge[e] = vEdgeFix16[e];
|
}
|

for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
|
{
|
triDesc.anyCoveredSamples = 0;
|

// is the corner of the edge outside of the raster tile? (vEdge < 0)
|
int mask0, mask1, mask2;
|
UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
|

for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
|
{
|
// trivial reject, at least one edge has all 4 corners of raster tile outside
|
bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
|

if (!trivialReject)
|
{
|
// trivial accept mask
|
triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
|

// Update the raster tile edge masks based on inner conservative edge offsets, if enabled
|
UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
|
(vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
|

// @todo Make this a bit smarter to allow use of trivial accept when:
|
// 1) scissor/vp intersection rect is raster tile aligned
|
// 2) raster tile is entirely within scissor/vp intersection rect
|
if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
|
{
|
// trivial accept, all 4 corners of all 3 edges are negative
|
// i.e. raster tile completely inside triangle
|
triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
|
if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
|
{
|
triDesc.innerCoverageMask = 0xffffffffffffffffULL;
|
}
|
RDTSC_EVENT(BETrivialAccept, 1, 0);
|
}
|
else
|
{
|
__m256d vEdgeAtSample[RT::NumEdgesT::value];
|
if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
|
{
|
// should get optimized out for single sample case (global value numbering or copy propagation)
|
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
|
{
|
vEdgeAtSample[e] = vEdgeFix16[e];
|
}
|
}
|
else
|
{
|
const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
|
__m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
|
__m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
|
__m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
|
__m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
|

// step edge equation tests from UL tile corner to pixel sample position
|
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
|
{
|
__m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
|
__m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
|
vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
|
vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
|
}
|
}
|

// the mask enables only the lowest 64-bit lane, so exactly one double (lane 0)
// is written per edge and neighbouring array entries are not touched
double startQuadEdges[RT::NumEdgesT::value];
|
const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
|
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
|
{
|
_mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
|
}
|

// not trivial accept or reject, must rasterize full tile
|
AR_BEGIN(BERasterizePartial, pDC->drawId);
|
triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
|
AR_END(BERasterizePartial, 0);
|

triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
|

// Output SV InnerCoverage, if needed
|
GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
|
}
|
}
|
else
|
{
|
// if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
|
if(NumCoverageSamplesT::value > 1)
|
{
|
triDesc.coverageMask[sampleNum] = 0;
|
}
|
RDTSC_EVENT(BETrivialReject, 1, 0);
|
}
|
}
|

#if KNOB_ENABLE_TOSS_POINTS
|
// toss point: keep the coverage result alive via gToss but skip the backend call
if(KNOB_TOSS_RS)
|
{
|
gToss = triDesc.coverageMask[0];
|
}
|
else
|
#endif
|
if(triDesc.anyCoveredSamples)
|
{
|
// if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
|
// copy conservative coverage result to all samples
|
if(RT::IsConservativeT::value)
|
{
|
auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
|
UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
|
}
|

// the backend is handed the tile origin as tile index << tile dim shift
AR_BEGIN(BEPixelBackend, pDC->drawId);
|
backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
|
AR_END(BEPixelBackend, 0);
|
}
|

// step to the next tile in X
|
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
|
{
|
vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
|
}
|
StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
|
}
|

// step to the next tile in Y
|
// (restarts from the saved start-of-row edge values, discarding the X steps)
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
|
{
|
vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
|
}
|
StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
|
}
|

AR_END(BERasterizeTriangle, 1);
|
}
|
|
// Get pointers to hot tile memory for color RT, depth, stencil
|
template <uint32_t numSamples>
|
void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
|
{
|
const API_STATE& state = GetApiState(pDC);
|
SWR_CONTEXT *pContext = pDC->pContext;
|
|
uint32_t mx, my;
|
MacroTileMgr::getTileIndices(macroID, mx, my);
|
tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
|
tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
|
|
// compute tile offset for active hottile buffers
|
const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
|
uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
|
offset*=numSamples;
|
|
unsigned long rtSlot = 0;
|
uint32_t colorHottileEnableMask = state.colorHottileEnable;
|
while(_BitScanForward(&rtSlot, colorHottileEnableMask))
|
{
|
HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
|
numSamples, renderTargetArrayIndex);
|
pColor->state = HOTTILE_DIRTY;
|
renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
|
|
colorHottileEnableMask &= ~(1 << rtSlot);
|
}
|
if(state.depthHottileEnable)
|
{
|
const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
|
uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
|
offset*=numSamples;
|
HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
|
numSamples, renderTargetArrayIndex);
|
pDepth->state = HOTTILE_DIRTY;
|
SWR_ASSERT(pDepth->pBuffer != nullptr);
|
renderBuffers.pDepth = pDepth->pBuffer + offset;
|
}
|
if(state.stencilHottileEnable)
|
{
|
const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
|
uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
|
offset*=numSamples;
|
HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
|
numSamples, renderTargetArrayIndex);
|
pStencil->state = HOTTILE_DIRTY;
|
SWR_ASSERT(pStencil->pBuffer != nullptr);
|
renderBuffers.pStencil = pStencil->pBuffer + offset;
|
}
|
}
|
|
template <typename RT>
|
INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers)
|
{
|
DWORD rt = 0;
|
while (_BitScanForward(&rt, colorHotTileMask))
|
{
|
colorHotTileMask &= ~(1 << rt);
|
buffers.pColor[rt] += RT::colorRasterTileStep;
|
}
|
|
buffers.pDepth += RT::depthRasterTileStep;
|
buffers.pStencil += RT::stencilRasterTileStep;
|
}
|
|
template <typename RT>
|
INLINE void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
|
{
|
DWORD rt = 0;
|
while (_BitScanForward(&rt, colorHotTileMask))
|
{
|
colorHotTileMask &= ~(1 << rt);
|
startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
|
buffers.pColor[rt] = startBufferRow.pColor[rt];
|
}
|
startBufferRow.pDepth += RT::depthRasterTileRowStep;
|
buffers.pDepth = startBufferRow.pDepth;
|
|
startBufferRow.pStencil += RT::stencilRasterTileRowStep;
|
buffers.pStencil = startBufferRow.pStencil;
|
}
|