/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/ralloc.h"
#include "util/register_allocate.h"
#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

#define QPU_R(file, index) { QPU_MUX_##file, index }

static const struct qpu_reg vc4_regs[] = {
        { QPU_MUX_R0, 0},
        { QPU_MUX_R1, 0},
        { QPU_MUX_R2, 0},
        { QPU_MUX_R3, 0},
        { QPU_MUX_R4, 0},
        QPU_R(A, 0),
        QPU_R(B, 0),
        QPU_R(A, 1),
        QPU_R(B, 1),
        QPU_R(A, 2),
        QPU_R(B, 2),
        QPU_R(A, 3),
        QPU_R(B, 3),
        QPU_R(A, 4),
        QPU_R(B, 4),
        QPU_R(A, 5),
        QPU_R(B, 5),
        QPU_R(A, 6),
        QPU_R(B, 6),
        QPU_R(A, 7),
        QPU_R(B, 7),
        QPU_R(A, 8),
        QPU_R(B, 8),
        QPU_R(A, 9),
        QPU_R(B, 9),
        QPU_R(A, 10),
        QPU_R(B, 10),
        QPU_R(A, 11),
        QPU_R(B, 11),
        QPU_R(A, 12),
        QPU_R(B, 12),
        QPU_R(A, 13),
        QPU_R(B, 13),
        QPU_R(A, 14),
        QPU_R(B, 14),
        QPU_R(A, 15),
        QPU_R(B, 15),
        QPU_R(A, 16),
        QPU_R(B, 16),
        QPU_R(A, 17),
        QPU_R(B, 17),
        QPU_R(A, 18),
        QPU_R(B, 18),
        QPU_R(A, 19),
        QPU_R(B, 19),
        QPU_R(A, 20),
        QPU_R(B, 20),
        QPU_R(A, 21),
        QPU_R(B, 21),
        QPU_R(A, 22),
        QPU_R(B, 22),
        QPU_R(A, 23),
        QPU_R(B, 23),
        QPU_R(A, 24),
        QPU_R(B, 24),
        QPU_R(A, 25),
        QPU_R(B, 25),
        QPU_R(A, 26),
        QPU_R(B, 26),
        QPU_R(A, 27),
        QPU_R(B, 27),
        QPU_R(A, 28),
        QPU_R(B, 28),
        QPU_R(A, 29),
        QPU_R(B, 29),
        QPU_R(A, 30),
        QPU_R(B, 30),
        QPU_R(A, 31),
        QPU_R(B, 31),
};
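
/* vc4_regs[] layout: the five accumulators come first, then the 64 physical
 * regfile registers interleaved as A/B pairs, so A-file entries sit at even
 * offsets from AB_INDEX and B-file entries at odd ones.
 */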
#define ACC_INDEX 0
#define ACC_COUNT 5
#define AB_INDEX (ACC_INDEX + ACC_COUNT)
#define AB_COUNT 64
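
/* Builds the register set and register classes used by the allocator. This
 * is done once per context and reused by later compiles.
 */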
static void
vc4_alloc_reg_set(struct vc4_context *vc4)
{
        assert(vc4_regs[AB_INDEX].addr == 0);
        assert(vc4_regs[AB_INDEX + 1].addr == 0);
        STATIC_ASSERT(ARRAY_SIZE(vc4_regs) == AB_INDEX + 64);

        if (vc4->regs)
                return;

        vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), true);

        /* The physical regfiles split us into two classes, with [0] being the
         * whole space and [1] being the bottom half (for threaded fragment
         * shaders).
         */
        for (int i = 0; i < 2; i++) {
                vc4->reg_class_any[i] = ra_alloc_reg_class(vc4->regs);
                vc4->reg_class_a_or_b[i] = ra_alloc_reg_class(vc4->regs);
                vc4->reg_class_a_or_b_or_acc[i] = ra_alloc_reg_class(vc4->regs);
                vc4->reg_class_r4_or_a[i] = ra_alloc_reg_class(vc4->regs);
                vc4->reg_class_a[i] = ra_alloc_reg_class(vc4->regs);
        }
        vc4->reg_class_r0_r3 = ra_alloc_reg_class(vc4->regs);

        /* r0-r3 */
        for (uint32_t i = ACC_INDEX; i < ACC_INDEX + 4; i++) {
                ra_class_add_reg(vc4->regs, vc4->reg_class_r0_r3, i);
                ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i);
                ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i);
        }

        /* R4 gets a special class because it can't be written as a general
         * purpose register (it's TMU_NOSWAP as a write address).
         */
        for (int i = 0; i < 2; i++) {
                ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[i],
                                 ACC_INDEX + 4);
                ra_class_add_reg(vc4->regs, vc4->reg_class_any[i],
                                 ACC_INDEX + 4);
        }

        /* A/B */
        for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i++) {
                /* Reserve ra14/rb14 for spilling fixup_raddr_conflict() in
                 * vc4_qpu_emit.c
                 */
                if (vc4_regs[i].addr == 14)
                        continue;

                ra_class_add_reg(vc4->regs, vc4->reg_class_any[0], i);
                ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[0], i);
                ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i);

                if (vc4_regs[i].addr < 16) {
                        ra_class_add_reg(vc4->regs, vc4->reg_class_any[1], i);
                        ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[1], i);
                        ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i);
                }

                /* A only */
                if (((i - AB_INDEX) & 1) == 0) {
                        ra_class_add_reg(vc4->regs, vc4->reg_class_a[0], i);
                        ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[0], i);

                        if (vc4_regs[i].addr < 16) {
                                ra_class_add_reg(vc4->regs,
                                                 vc4->reg_class_a[1], i);
                                ra_class_add_reg(vc4->regs,
                                                 vc4->reg_class_r4_or_a[1], i);
                        }
                }
        }

        ra_set_finalize(vc4->regs, NULL);
}
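
/* Pairs a temp index with its sort priority (its live-range length). */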
struct node_to_temp_map {
        uint32_t temp;
        uint32_t priority;
};

static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
        const struct node_to_temp_map *a = in_a;
        const struct node_to_temp_map *b = in_b;

        return a->priority - b->priority;
}

#define CLASS_BIT_A (1 << 0)
#define CLASS_BIT_B (1 << 1)
#define CLASS_BIT_R4 (1 << 2)
#define CLASS_BIT_R0_R3 (1 << 4)

struct vc4_ra_select_callback_data {
        uint32_t next_acc;
        uint32_t next_ab;
};
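
/* Callback invoked by the register allocator to pick one register for a
 * node from the set of registers still legal for it.
 */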
static unsigned int
vc4_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
{
        struct vc4_ra_select_callback_data *vc4_ra = data;

        /* If r4 is available, always choose it -- few other things can go
         * there, and choosing anything else means inserting a mov.
         */
        if (BITSET_TEST(regs, ACC_INDEX + 4))
                return ACC_INDEX + 4;

        /* Choose an accumulator if possible (no delay between write and
         * read), but round-robin through them to give post-RA instruction
         * selection more options.
         */
        for (int i = 0; i < ACC_COUNT; i++) {
                int acc_off = (vc4_ra->next_acc + i) % ACC_COUNT;
                int acc = ACC_INDEX + acc_off;

                if (BITSET_TEST(regs, acc)) {
                        vc4_ra->next_acc = acc_off + 1;
                        return acc;
                }
        }
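
        /* Otherwise, fall back to round-robining through the physical A/B
         * regfile in the same way.
         */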
        for (int i = 0; i < AB_COUNT; i++) {
                int ab_off = (vc4_ra->next_ab + i) % AB_COUNT;
                int ab = AB_INDEX + ab_off;

                if (BITSET_TEST(regs, ab)) {
                        vc4_ra->next_ab = ab_off + 1;
                        return ab;
                }
        }

        unreachable("RA must pass us at least one possible reg.");
}

/**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_reg.
 *
 * The return value should be freed by the caller.
 */
struct qpu_reg *
vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct node_to_temp_map map[c->num_temps];
        uint32_t temp_to_node[c->num_temps];
        uint8_t class_bits[c->num_temps];
        struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                sizeof(*temp_registers));
        struct vc4_ra_select_callback_data callback_data = {
                .next_acc = 0,
                .next_ab = 0,
        };

        /* If things aren't ever written (undefined values), just read from
         * r0.
         */
        for (uint32_t i = 0; i < c->num_temps; i++)
                temp_registers[i] = qpu_rn(0);

        vc4_alloc_reg_set(vc4);

        struct ra_graph *g = ra_alloc_interference_graph(vc4->regs,
                                                         c->num_temps);

        /* Compute the live ranges so we can figure out interference. */
        qir_calculate_live_intervals(c);

        ra_set_select_reg_callback(g, vc4_ra_select_callback, &callback_data);
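
        /* Sort the temps by live-range length (shortest first) and use that
         * order for the RA node numbering.
         */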
        for (uint32_t i = 0; i < c->num_temps; i++) {
                map[i].temp = i;
                map[i].priority = c->temp_end[i] - c->temp_start[i];
        }
        qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
        for (uint32_t i = 0; i < c->num_temps; i++) {
                temp_to_node[map[i].temp] = i;
        }

        /* Figure out our register classes and preallocated registers. We
         * start with any temp being able to be in any file, then instructions
         * incrementally remove bits that the temp definitely can't be in.
         */
        memset(class_bits,
               CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3,
               sizeof(class_bits));

        int ip = 0;
        qir_for_each_inst_inorder(inst, c) {
                if (qir_writes_r4(inst)) {
                        /* This instruction writes r4 (and optionally moves
                         * its result to a temp), so nothing else can be
                         * stored in r4 across it.
                         */
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                                        class_bits[i] &= ~CLASS_BIT_R4;
                        }

                        /* If we're doing a conditional write of something
                         * writing r4 (math, tex results), then make sure that
                         * we store in a temp so that we actually
                         * conditionally move the result.
                         */
                        if (inst->cond != QPU_COND_ALWAYS)
                                class_bits[inst->dst.index] &= ~CLASS_BIT_R4;
                } else {
                        /* R4 can't be written as a general purpose
                         * register (it's TMU_NOSWAP as a write address).
                         */
                        if (inst->dst.file == QFILE_TEMP)
                                class_bits[inst->dst.index] &= ~CLASS_BIT_R4;
                }
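
                /* Some operations constrain registers outright: the Z and W
                 * fragment payload values arrive in fixed B- and A-file
                 * registers, and MUL rotation sources must live in r0-r3.
                 */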
                switch (inst->op) {
                case QOP_FRAG_Z:
                        ra_set_node_reg(g, temp_to_node[inst->dst.index],
                                        AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2 + 1);
                        break;

                case QOP_FRAG_W:
                        ra_set_node_reg(g, temp_to_node[inst->dst.index],
                                        AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
                        break;

                case QOP_ROT_MUL:
                        assert(inst->src[0].file == QFILE_TEMP);
                        class_bits[inst->src[0].index] &= CLASS_BIT_R0_R3;
                        break;

                case QOP_THRSW:
                        /* All accumulators are invalidated across a thread
                         * switch.
                         */
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                                        class_bits[i] &= ~(CLASS_BIT_R0_R3 |
                                                           CLASS_BIT_R4);
                        }
                        break;

                default:
                        break;
                }

                if (inst->dst.pack && !qir_is_mul(inst)) {
                        /* The non-MUL pack flags require an A-file dst
                         * register.
                         */
                        class_bits[inst->dst.index] &= CLASS_BIT_A;
                }

                /* Apply restrictions for src unpacks. The integer unpacks
                 * can only be done from regfile A, while float unpacks can be
                 * either A or R4.
                 */
                for (int i = 0; i < qir_get_nsrc(inst); i++) {
                        if (inst->src[i].file == QFILE_TEMP &&
                            inst->src[i].pack) {
                                if (qir_is_float_input(inst)) {
                                        class_bits[inst->src[i].index] &=
                                                CLASS_BIT_A | CLASS_BIT_R4;
                                } else {
                                        class_bits[inst->src[i].index] &=
                                                CLASS_BIT_A;
                                }
                        }
                }

                ip++;
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                int node = temp_to_node[i];

                switch (class_bits[i]) {
                case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_any[c->fs_threaded]);
                        break;
                case CLASS_BIT_A | CLASS_BIT_B:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_a_or_b[c->fs_threaded]);
                        break;
                case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R0_R3:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_a_or_b_or_acc[c->fs_threaded]);
                        break;
                case CLASS_BIT_A | CLASS_BIT_R4:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_r4_or_a[c->fs_threaded]);
                        break;
                case CLASS_BIT_A:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_a[c->fs_threaded]);
                        break;
                case CLASS_BIT_R0_R3:
                        ra_set_node_class(g, node, vc4->reg_class_r0_r3);
                        break;

                default:
                        /* DDX/DDY used across a thread switch might get us
                         * here.
                         */
                        if (c->fs_threaded) {
                                c->failed = true;
                                free(temp_registers);
                                return NULL;
                        }

                        fprintf(stderr, "temp %d: bad class bits: 0x%x\n",
                                i, class_bits[i]);
                        abort();
                        break;
                }
        }
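
        /* Any two temps whose live ranges overlap interfere and must get
         * different registers.
         */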
        for (uint32_t i = 0; i < c->num_temps; i++) {
                for (uint32_t j = i + 1; j < c->num_temps; j++) {
                        if (!(c->temp_start[i] >= c->temp_end[j] ||
                              c->temp_start[j] >= c->temp_end[i])) {
                                ra_add_node_interference(g,
                                                         temp_to_node[i],
                                                         temp_to_node[j]);
                        }
                }
        }
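
        /* A threaded fragment shader that fails to allocate returns quietly;
         * the caller is expected to retry the compile single-threaded.
         */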
        bool ok = ra_allocate(g);
        if (!ok) {
                if (!c->fs_threaded) {
                        fprintf(stderr, "Failed to register allocate:\n");
                        qir_dump(c);
                }

                c->failed = true;
                free(temp_registers);
                return NULL;
        }
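
        /* Copy the chosen registers out, indexed by QFILE_TEMP index. */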
        for (uint32_t i = 0; i < c->num_temps; i++) {
                temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])];

                /* If the value's never used, just write to the NOP register
                 * for clarity in debug output.
                 */
                if (c->temp_start[i] == c->temp_end[i])
                        temp_registers[i] = qpu_ra(QPU_W_NOP);
        }

        ralloc_free(g);

        return temp_registers;
}