/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "vc4_qpu.h"
|
|
/* Reports a validation failure: prints the message and the disassembly of
 * the offending instruction to stderr, then aborts.  Validation failures
 * are fatal programming errors in the shader compiler, not runtime
 * conditions to recover from.
 */
static void
fail_instr(uint64_t inst, const char *msg)
{
        fprintf(stderr, "vc4_qpu_validate: %s: ", msg);
        vc4_qpu_disasm(&inst, 1);
        fputs("\n", stderr);
        abort();
}
static bool
|
writes_reg(uint64_t inst, uint32_t w)
|
{
|
return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w ||
|
QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w);
|
}
|
|
static bool
|
_reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
|
{
|
struct {
|
uint32_t mux, addr;
|
} src_regs[] = {
|
{ QPU_GET_FIELD(inst, QPU_ADD_A) },
|
{ QPU_GET_FIELD(inst, QPU_ADD_B) },
|
{ QPU_GET_FIELD(inst, QPU_MUL_A) },
|
{ QPU_GET_FIELD(inst, QPU_MUL_B) },
|
};
|
|
/* Branches only reference raddr_a (no mux), and we don't use that
|
* feature of branching.
|
*/
|
if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
|
return false;
|
|
/* Load immediates don't read any registers. */
|
if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
|
return false;
|
|
for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {
|
if (!ignore_a &&
|
src_regs[i].mux == QPU_MUX_A &&
|
(QPU_GET_FIELD(inst, QPU_RADDR_A) == r))
|
return true;
|
|
if (!ignore_b &&
|
QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
|
src_regs[i].mux == QPU_MUX_B &&
|
(QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
|
return true;
|
}
|
|
return false;
|
}
|
|
/* Returns whether the instruction reads register address r through either
 * regfile A or regfile B.
 */
static bool
reads_reg(uint64_t inst, uint32_t r)
{
        const bool ignore_a = false, ignore_b = false;

        return _reads_reg(inst, r, ignore_a, ignore_b);
}
/* Returns whether the instruction reads register address r through
 * regfile A only (regfile B reads are ignored).
 */
static bool
reads_a_reg(uint64_t inst, uint32_t r)
{
        const bool ignore_b = true;

        return _reads_reg(inst, r, false, ignore_b);
}
/* Returns whether the instruction reads register address r through
 * regfile B only (regfile A reads are ignored).
 */
static bool
reads_b_reg(uint64_t inst, uint32_t r)
{
        const bool ignore_a = true;

        return _reads_reg(inst, r, ignore_a, false);
}
static bool
|
writes_sfu(uint64_t inst)
|
{
|
return (writes_reg(inst, QPU_W_SFU_RECIP) ||
|
writes_reg(inst, QPU_W_SFU_RECIPSQRT) ||
|
writes_reg(inst, QPU_W_SFU_EXP) ||
|
writes_reg(inst, QPU_W_SFU_LOG));
|
}
|
|
/**
|
* Checks for the instruction restrictions from page 37 ("Summary of
|
* Instruction Restrictions").
|
*/
|
void
|
vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
|
{
|
bool scoreboard_locked = false;
|
bool threaded = false;
|
|
/* We don't want to do validation in release builds, but we want to
|
* keep compiling the validation code to make sure it doesn't get
|
* broken.
|
*/
|
#ifndef DEBUG
|
return;
|
#endif
|
|
for (int i = 0; i < num_inst; i++) {
|
uint64_t inst = insts[i];
|
uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
|
|
if (sig != QPU_SIG_PROG_END) {
|
if (qpu_inst_is_tlb(inst))
|
scoreboard_locked = true;
|
|
if (sig == QPU_SIG_THREAD_SWITCH ||
|
sig == QPU_SIG_LAST_THREAD_SWITCH) {
|
threaded = true;
|
}
|
|
continue;
|
}
|
|
/* "The Thread End instruction must not write to either physical
|
* regfile A or B."
|
*/
|
if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 ||
|
QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) {
|
fail_instr(inst, "write to phys reg in thread end");
|
}
|
|
/* Can't trigger an implicit wait on scoreboard in the program
|
* end instruction.
|
*/
|
if (qpu_inst_is_tlb(inst) && !scoreboard_locked)
|
fail_instr(inst, "implicit sb wait in program end");
|
|
/* Two delay slots will be executed. */
|
assert(i + 2 <= num_inst);
|
|
for (int j = i; j < i + 2; j++) {
|
/* "The last three instructions of any program
|
* (Thread End plus the following two delay-slot
|
* instructions) must not do varyings read, uniforms
|
* read or any kind of VPM, VDR, or VDW read or
|
* write."
|
*/
|
if (writes_reg(insts[j], QPU_W_VPM) ||
|
reads_reg(insts[j], QPU_R_VARY) ||
|
reads_reg(insts[j], QPU_R_UNIF) ||
|
reads_reg(insts[j], QPU_R_VPM)) {
|
fail_instr(insts[j], "last 3 instructions "
|
"using fixed functions");
|
}
|
|
/* "The Thread End instruction and the following two
|
* delay slot instructions must not write or read
|
* address 14 in either regfile A or B."
|
*/
|
if (writes_reg(insts[j], 14) ||
|
reads_reg(insts[j], 14)) {
|
fail_instr(insts[j], "last 3 instructions "
|
"must not use r14");
|
}
|
}
|
|
/* "The final program instruction (the second delay slot
|
* instruction) must not do a TLB Z write."
|
*/
|
if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) {
|
fail_instr(insts[i + 2], "final instruction doing "
|
"Z write");
|
}
|
}
|
|
/* "A scoreboard wait must not occur in the first two instructions of
|
* a fragment shader. This is either the explicit Wait for Scoreboard
|
* signal or an implicit wait with the first tile-buffer read or
|
* write instruction."
|
*/
|
for (int i = 0; i < 2; i++) {
|
uint64_t inst = insts[i];
|
|
if (qpu_inst_is_tlb(inst))
|
fail_instr(inst, "sb wait in first two insts");
|
}
|
|
/* "If TMU_NOSWAP is written, the write must be three instructions
|
* before the first TMU write instruction. For example, if
|
* TMU_NOSWAP is written in the first shader instruction, the first
|
* TMU write cannot occur before the 4th shader instruction."
|
*/
|
int last_tmu_noswap = -10;
|
for (int i = 0; i < num_inst; i++) {
|
uint64_t inst = insts[i];
|
|
if ((i - last_tmu_noswap) <= 3 &&
|
(writes_reg(inst, QPU_W_TMU0_S) ||
|
writes_reg(inst, QPU_W_TMU1_S))) {
|
fail_instr(inst, "TMU write too soon after TMU_NOSWAP");
|
}
|
|
if (writes_reg(inst, QPU_W_TMU_NOSWAP))
|
last_tmu_noswap = i;
|
}
|
|
/* "An instruction must not read from a location in physical regfile A
|
* or B that was written to by the previous instruction."
|
*/
|
for (int i = 0; i < num_inst - 1; i++) {
|
uint64_t inst = insts[i];
|
uint32_t add_waddr = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
|
uint32_t mul_waddr = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
|
uint32_t waddr_a, waddr_b;
|
|
if (inst & QPU_WS) {
|
waddr_b = add_waddr;
|
waddr_a = mul_waddr;
|
} else {
|
waddr_a = add_waddr;
|
waddr_b = mul_waddr;
|
}
|
|
if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) ||
|
(waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) {
|
fail_instr(insts[i + 1],
|
"Reads physical reg too soon after write");
|
}
|
}
|
|
/* "After an SFU lookup instruction, accumulator r4 must not be read
|
* in the following two instructions. Any other instruction that
|
* results in r4 being written (that is, TMU read, TLB read, SFU
|
* lookup) cannot occur in the two instructions following an SFU
|
* lookup."
|
*/
|
int last_sfu_inst = -10;
|
for (int i = 0; i < num_inst - 1; i++) {
|
uint64_t inst = insts[i];
|
uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
|
|
if (i - last_sfu_inst <= 2 &&
|
(writes_sfu(inst) ||
|
sig == QPU_SIG_LOAD_TMU0 ||
|
sig == QPU_SIG_LOAD_TMU1 ||
|
sig == QPU_SIG_COLOR_LOAD)) {
|
fail_instr(inst, "R4 write too soon after SFU write");
|
}
|
|
if (writes_sfu(inst))
|
last_sfu_inst = i;
|
}
|
|
for (int i = 0; i < num_inst - 1; i++) {
|
uint64_t inst = insts[i];
|
|
if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&
|
QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=
|
QPU_SMALL_IMM_MUL_ROT) {
|
uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
|
uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);
|
|
/* "The full horizontal vector rotate is only
|
* available when both of the mul ALU input arguments
|
* are taken from accumulators r0-r3."
|
*/
|
if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {
|
fail_instr(inst,
|
"MUL rotate using non-accumulator "
|
"input");
|
}
|
|
if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==
|
QPU_SMALL_IMM_MUL_ROT) {
|
/* "An instruction that does a vector rotate
|
* by r5 must not immediately follow an
|
* instruction that writes to r5."
|
*/
|
if (writes_reg(insts[i - 1], QPU_W_ACC5)) {
|
fail_instr(inst,
|
"vector rotate by r5 "
|
"immediately after r5 write");
|
}
|
}
|
|
/* "An instruction that does a vector rotate must not
|
* immediately follow an instruction that writes to the
|
* accumulator that is being rotated."
|
*/
|
if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||
|
writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) {
|
fail_instr(inst,
|
"vector rotate of value "
|
"written in previous instruction");
|
}
|
}
|
}
|
|
/* "An instruction that does a vector rotate must not immediately
|
* follow an instruction that writes to the accumulator that is being
|
* rotated.
|
*
|
* XXX: TODO.
|
*/
|
|
/* "After an instruction that does a TLB Z write, the multisample mask
|
* must not be read as an instruction input argument in the following
|
* two instruction. The TLB Z write instruction can, however, be
|
* followed immediately by a TLB color write."
|
*/
|
for (int i = 0; i < num_inst - 1; i++) {
|
uint64_t inst = insts[i];
|
if (writes_reg(inst, QPU_W_TLB_Z) &&
|
(reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) ||
|
reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS))) {
|
fail_instr(inst, "TLB Z write followed by MS mask read");
|
}
|
}
|
|
/*
|
* "A single instruction can only perform a maximum of one of the
|
* following closely coupled peripheral accesses in a single
|
* instruction: TMU write, TMU read, TLB write, TLB read, TLB
|
* combined color read and write, SFU write, Mutex read or Semaphore
|
* access."
|
*/
|
for (int i = 0; i < num_inst - 1; i++) {
|
uint64_t inst = insts[i];
|
|
if (qpu_num_sf_accesses(inst) > 1)
|
fail_instr(inst, "Single instruction writes SFU twice");
|
}
|
|
/* "The uniform base pointer can be written (from SIMD element 0) by
|
* the processor to reset the stream, there must be at least two
|
* nonuniform-accessing instructions following a pointer change
|
* before uniforms can be accessed once more."
|
*/
|
int last_unif_pointer_update = -3;
|
for (int i = 0; i < num_inst; i++) {
|
uint64_t inst = insts[i];
|
uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
|
uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
|
|
if (reads_reg(inst, QPU_R_UNIF) &&
|
i - last_unif_pointer_update <= 2) {
|
fail_instr(inst,
|
"uniform read too soon after pointer update");
|
}
|
|
if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
|
waddr_mul == QPU_W_UNIFORMS_ADDRESS)
|
last_unif_pointer_update = i;
|
}
|
|
if (threaded) {
|
bool last_thrsw_found = false;
|
bool scoreboard_locked = false;
|
int tex_samples_outstanding = 0;
|
int last_tex_samples_outstanding = 0;
|
int thrsw_ip = -1;
|
|
for (int i = 0; i < num_inst; i++) {
|
uint64_t inst = insts[i];
|
uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
|
|
if (i == thrsw_ip) {
|
/* In order to get texture results back in the
|
* correct order, before a new thrsw we have
|
* to read all the texture results from before
|
* the previous thrsw.
|
*
|
* FIXME: Is collecting the remaining results
|
* during the delay slots OK, or should we do
|
* this at THRSW signal time?
|
*/
|
if (last_tex_samples_outstanding != 0) {
|
fail_instr(inst, "THRSW with texture "
|
"results from the previous "
|
"THRSW still in the FIFO.");
|
}
|
|
last_tex_samples_outstanding =
|
tex_samples_outstanding;
|
tex_samples_outstanding = 0;
|
}
|
|
if (qpu_inst_is_tlb(inst))
|
scoreboard_locked = true;
|
|
switch (sig) {
|
case QPU_SIG_THREAD_SWITCH:
|
case QPU_SIG_LAST_THREAD_SWITCH:
|
/* No thread switching with the scoreboard
|
* locked. Doing so means we may deadlock
|
* when the other thread tries to lock
|
* scoreboard.
|
*/
|
if (scoreboard_locked) {
|
fail_instr(inst, "THRSW with the "
|
"scoreboard locked.");
|
}
|
|
/* No thread switching after lthrsw, since
|
* lthrsw means that we get delayed until the
|
* other shader is ready for us to terminate.
|
*/
|
if (last_thrsw_found) {
|
fail_instr(inst, "THRSW after a "
|
"previous LTHRSW");
|
}
|
|
if (sig == QPU_SIG_LAST_THREAD_SWITCH)
|
last_thrsw_found = true;
|
|
/* No THRSW while we already have a THRSW
|
* queued.
|
*/
|
if (i < thrsw_ip) {
|
fail_instr(inst,
|
"THRSW with a THRSW queued.");
|
}
|
|
thrsw_ip = i + 3;
|
break;
|
|
case QPU_SIG_LOAD_TMU0:
|
case QPU_SIG_LOAD_TMU1:
|
if (last_tex_samples_outstanding == 0) {
|
fail_instr(inst, "TMU load with nothing "
|
"in the results fifo from "
|
"the previous THRSW.");
|
}
|
|
last_tex_samples_outstanding--;
|
break;
|
}
|
|
uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
|
uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
|
if (waddr_add == QPU_W_TMU0_S ||
|
waddr_add == QPU_W_TMU1_S ||
|
waddr_mul == QPU_W_TMU0_S ||
|
waddr_mul == QPU_W_TMU1_S) {
|
tex_samples_outstanding++;
|
}
|
}
|
}
|
}
|