/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Implements convolution operations with other kernels baked into the
// processing, to optimize latency and memory usage:
//  - Conv2D + BiasAdd + <Activation>
//  - Conv2D + FusedBatchNorm + <Activation>
//
// Activation: Relu, Relu6, Elu, etc...
//
// Kernels for convolutions fused with image transformations (resize and mirror
// padding) are defined in `conv_ops_fused_image_transform.cc`.
//
// For the CPU device we implement fusion with an Eigen tensor contraction
// output kernel. For the GPU device we rely on cuDNN primitives.
//
// NOTE: GPU only supports fusion of Conv2D + BiasAdd + <optional Relu>.

#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_
#define TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_

#define USE_EIGEN_TENSOR
#define EIGEN_USE_THREADS

#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA

#include <string>
#include <vector>

#include "absl/strings/str_cat.h"
|
#include "absl/strings/str_join.h"
|
#include "absl/strings/substitute.h"
|
#include "tensorflow/core/framework/bounds_check.h"
|
#include "tensorflow/core/framework/op_kernel.h"
|
#include "tensorflow/core/framework/register_types.h"
|
#include "tensorflow/core/framework/tensor.h"
|
#include "tensorflow/core/framework/tensor_shape.h"
|
#include "tensorflow/core/kernels/conv_2d.h"
|
#include "tensorflow/core/kernels/conv_ops.h"
|
#include "tensorflow/core/kernels/ops_util.h"
|
#include "tensorflow/core/util/tensor_format.h"
|
#include "tensorflow/core/util/use_cudnn.h"
|
|
#if GOOGLE_CUDA
#include "cuda/include/cudnn.h"
#include "tensorflow/core/kernels/conv_ops_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/util/proto/proto_utils.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

class AutotuneResult;

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

// Supported Conv2D fusions. Not all of them are supported on all types of
// devices.
enum class FusedComputationType {
  // NOTE(ezhulenev): cuDNN `cudnnConvolutionBiasActivationForward` supports an
  // identity activation function, which in theory should allow fusing a
  // convolution with just BiasAdd. In practice it doesn't work: cuDNN ignores
  // this parameter and always applies Relu activation.
  kBiasAdd,                // CPU
  kBiasAddWithRelu,        // CPU and GPU
  kFusedBatchNorm,         // CPU only
  kFusedBatchNormWithRelu  // CPU only
};

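// For example, a subgraph that computes
//   y = Relu(BiasAdd(Conv2D(input, filter), bias))
// is rewritten by the Grappler Remapper into a single _FusedConv2D node with
// fused_ops = ["BiasAdd", "Relu"], which maps to kBiasAddWithRelu below.
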
// We have to pass around additional arguments for all possible fusion types.
struct FusedComputationArgs {
  float epsilon = 0.0;  // Used by `FusedBatchNorm` fusion only
};

template <typename Device, typename T>
struct LaunchFusedConv2DOp {
  void operator()(OpKernelContext* context, bool use_cudnn,
                  bool cudnn_use_autotune, const Tensor& input,
                  const Tensor& filter, FusedComputationType fusion,
                  const FusedComputationArgs& fusion_args,
                  const Conv2DParameters& params,
                  const Conv2DDimensions& dimensions, Tensor* output);
};

// Type aliases for the unaligned tensors (tensor maps) used in output kernels.
template <typename T>
struct Unaligned {
  // There is no guarantee that the output block passed to the output kernel
  // will be aligned.

  using Tensor =
      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>,
                       Eigen::Unaligned>;

  using ConstTensor = Eigen::TensorMap<
      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>,
      Eigen::Unaligned>;
};

// Type alias for the tensor contraction output mapper.
template <typename Scalar, typename Index>
using ContractionOutputMapper =
    Eigen::internal::blas_data_mapper<Scalar, Index, Eigen::ColMajor>;

// Returns the input expression without any transformations.
struct Identity {
  template <typename XprType>
  static auto apply(XprType expr) -> XprType {
    return expr;
  }
};

// Applies `Relu` to the passed input expression.
struct Relu {
  template <typename XprType>
  static auto apply(XprType expr)
      -> decltype(expr.cwiseMax(std::declval<typename XprType::Scalar>())) {
    return expr.cwiseMax(static_cast<typename XprType::Scalar>(0));
  }
};

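// Other activations listed in the header comment could be added analogously;
// e.g. a hypothetical Relu6 functor (not defined in this file) would clip with
// expr.cwiseMax(Scalar(0)).cwiseMin(Scalar(6)).
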
// TensorContraction swaps lhs with rhs, changes the layout from RowMajor
// (default in TensorFlow) to ColMajor (preferred in Eigen), and computes the
// matmul using these tensors.
//
// The TensorContraction output matrix (before reshape) has a ColMajor layout,
// and has dimensions:
//   - rows: output_channels
//   - cols: all other dimensions
//
// The first element in every column is:
//   [batch ??, height ??, width ??, out_channel = i]
//
// We do not know the values of 'batch', 'height', and 'width' here (if the
// original dimensions are known, they can be computed from 'j').
//
// Each column of an output block is a contiguous slice along the output
// channel dimension, so we can use it to efficiently compute any
// transformation that depends only on a channel value (e.g. add a channel
// bias).

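// For example, for an NHWC output of shape [N, H, W, C] the contraction output
// (before reshape) is a [C, N * H * W] ColMajor matrix: column 'j' holds all
// output channels for a single (batch, row, col) location, and row 'i' is the
// i-th output channel.
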
// Output kernel that fuses the BiasAdd operation into the output of a tensor
// contraction + the activation function defined by Activation.
template <typename T, typename Activation = Identity>
struct BiasAddOutputKernel {
  explicit BiasAddOutputKernel(const T* bias_data) : bias_data(bias_data) {}

  template <typename Index, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const ContractionOutputMapper<Scalar, Index>& output_mapper,
      const Eigen::TensorContractionParams& params, Index i, Index j,
      Index num_rows, Index num_cols) const {
    DCHECK(params.swapped_arguments);

    // Row index 'i' is the offset of this output block along the output
    // channel dimension, so the bias slice starts at 'bias_data + i'.
    const T* bias_base = bias_data + i;
    typename Unaligned<T>::ConstTensor bias(bias_base, num_rows);

    for (int col = 0; col < num_cols; ++col) {
      T* output_base = &output_mapper(0, col);
      typename Unaligned<T>::Tensor output(output_base, num_rows);
      const auto expr = output + bias;
      output = Activation::template apply<decltype(expr)>(expr);
    }
  }

 private:
  const T* bias_data;
};

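// In inference mode FusedBatchNorm is an affine transformation per channel:
//   output = (conv_out - estimated_mean) * scaling_factor + offset
// where scaling_factor = (estimated_variance + epsilon).rsqrt() * scale is
// precomputed once in InitFusedBatchNormArgs below, so the kernel itself only
// does a subtract, a multiply, and an add per element.
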
// Output kernel that fuses the FusedBatchNorm operation into the output of a
// tensor contraction + the activation function defined by Activation.
template <typename T, typename Activation = Identity>
struct FusedBatchNormOutputKernel {
  FusedBatchNormOutputKernel(T epsilon, const T* scaling_factor_data,
                             const T* offset_data, const T* estimated_mean_data)
      : epsilon(epsilon),
        scaling_factor_data(scaling_factor_data),
        offset_data(offset_data),
        estimated_mean_data(estimated_mean_data) {}

  template <typename Index, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const ContractionOutputMapper<Scalar, Index>& output_mapper,
      const Eigen::TensorContractionParams& params, Index i, Index j,
      Index num_rows, Index num_cols) const {
    DCHECK(params.swapped_arguments);

    const T* scaling_factor_base = scaling_factor_data + i;
    const T* offset_base = offset_data + i;
    const T* mean_base = estimated_mean_data + i;

    typename Unaligned<T>::ConstTensor scaling_factor(scaling_factor_base,
                                                      num_rows);
    typename Unaligned<T>::ConstTensor offset(offset_base, num_rows);
    typename Unaligned<T>::ConstTensor mean(mean_base, num_rows);

    for (int col = 0; col < num_cols; ++col) {
      T* output_base = &output_mapper(0, col);
      typename Unaligned<T>::Tensor output(output_base, num_rows);

      // NOTE: `epsilon` is not used here; it is already folded into
      // `scaling_factor` (see InitFusedBatchNormArgs below).
      auto scaled = (output - mean) * scaling_factor;
      auto shifted = scaled + offset;

      output = Activation::template apply<decltype(shifted)>(shifted);
    }
  }

 private:
  T epsilon;
  const T* scaling_factor_data;
  const T* offset_data;
  const T* estimated_mean_data;
};

// Type aliases for the output kernels, purely for the sake of better launch
// dispatching code readability.
template <typename T>
using WithBiasAdd = BiasAddOutputKernel<T>;
template <typename T>
using WithBiasAddAndRelu = BiasAddOutputKernel<T, Relu>;
template <typename T>
using WithFusedBatchNorm = FusedBatchNormOutputKernel<T>;
template <typename T>
using WithFusedBatchNormAndRelu = FusedBatchNormOutputKernel<T, Relu>;

// This is a CPU-only implementation that uses Eigen contraction output
// kernels.
//
// Dispatches the 2D convolution to the appropriate primitive operation:
//   (1) MatMul for the case of 1x1 convolution.
//   (2) MatMul for the case when the filter size equals the input size.
//   (3) General spatial 2D convolution for all other cases.
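//
// For example, case (1): a 1x1 convolution with stride 1 over an NHWC input
// [N, H, W, C_in] with filter [1, 1, C_in, C_out] is exactly the matrix
// product [N * H * W, C_in] x [C_in, C_out], which is what the first branch
// below computes.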
template <typename T>
class LaunchFusedConv2DWithOutputKernel {
 public:
  LaunchFusedConv2DWithOutputKernel(int row_stride, int col_stride,      //
                                    int row_dilation, int col_dilation,  //
                                    Padding padding)
      : row_stride_(row_stride),
        col_stride_(col_stride),
        row_dilation_(row_dilation),
        col_dilation_(col_dilation),
        padding_(padding) {}

  template <typename OutputKernel>
  void operator()(const OutputKernel& output_kernel, OpKernelContext* ctx,
                  const Tensor& input, const Tensor& filter, Tensor* output) {
    if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 &&
        row_stride_ == 1 && col_stride_ == 1) {
      int conv_width = 1;  // Width for the convolution step.
      for (int i = 0; i < 3; ++i) {
        conv_width *= output->dim_size(i);
      }

      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
      functor::MatMulConvFunctor<CPUDevice, T, OutputKernel>()(
          ctx->eigen_device<CPUDevice>(),
          output->shaped<T, 2>({conv_width, filter.dim_size(3)}),
          input.shaped<T, 2>({conv_width, filter.dim_size(2)}),
          filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)}),
          dim_pair, output_kernel);

    } else if (filter.dim_size(0) == input.dim_size(1) &&
               filter.dim_size(1) == input.dim_size(2) && row_dilation_ == 1 &&
               col_dilation_ == 1 && padding_ == VALID) {
      // If the input data and filter have the same height/width,
      // reduce the 2D convolution to matrix multiplication.
      const auto k =  // Length of reduction dimension.
          filter.dim_size(0) * filter.dim_size(1) * filter.dim_size(2);

      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
      functor::MatMulConvFunctor<CPUDevice, T, OutputKernel>()(
          ctx->eigen_device<CPUDevice>(),
          output->shaped<T, 2>({input.dim_size(0), filter.dim_size(3)}),
          input.shaped<T, 2>({input.dim_size(0), k}),
          filter.shaped<T, 2>({k, filter.dim_size(3)}), dim_pair,
          output_kernel);

    } else {
      functor::SpatialConvolution<CPUDevice, T, OutputKernel>()(
          ctx->eigen_device<CPUDevice>(), output->tensor<T, 4>(),
          input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride_, col_stride_,
          row_dilation_, col_dilation_, BrainPadding2EigenPadding(padding_),
          output_kernel);
    }
  }

 private:
  int row_stride_;
  int col_stride_;
  int row_dilation_;
  int col_dilation_;
  const Padding padding_;
};

template <typename T>
struct LaunchFusedConv2DOp<CPUDevice, T> {
  void operator()(OpKernelContext* context, bool use_cudnn,
                  bool cudnn_use_autotune, const Tensor& input,
                  const Tensor& filter, const FusedComputationType fusion,
                  const FusedComputationArgs& fusion_args,
                  const Conv2DParameters& params,
                  const Conv2DDimensions& dimensions, Tensor* output) {
    OP_REQUIRES(context, dimensions.in_depth == filter.dim_size(2),
                errors::Unimplemented("Fused conv implementation does not "
                                      "support grouped convolutions for now."));
    OP_REQUIRES(context, params.data_format == FORMAT_NHWC,
                errors::Unimplemented("Fused conv implementation only supports "
                                      "NHWC tensor format for now."));

    BiasAddArgs bias_add;
    FusedBatchNormArgs fused_batch_norm;

    LaunchFusedConv2DWithOutputKernel<T> conv2d(
        dimensions.stride_rows, dimensions.stride_cols,
        dimensions.dilation_rows, dimensions.dilation_cols, params.padding);

    switch (fusion) {
      case FusedComputationType::kBiasAdd:
        OP_REQUIRES_OK(context, InitBiasAddArgs(context, &bias_add));
        conv2d(WithBiasAdd<T>(bias_add.bias_add_data), context, input, filter,
               output);
        break;

      case FusedComputationType::kBiasAddWithRelu:
        OP_REQUIRES_OK(context, InitBiasAddArgs(context, &bias_add));
        conv2d(WithBiasAddAndRelu<T>(bias_add.bias_add_data), context, input,
               filter, output);
        break;

      case FusedComputationType::kFusedBatchNorm:
        OP_REQUIRES_OK(context,
                       InitFusedBatchNormArgs(context, fusion_args.epsilon,
                                              &fused_batch_norm));
        conv2d(WithFusedBatchNorm<T>(fusion_args.epsilon,
                                     fused_batch_norm.scaling_factor.data(),
                                     fused_batch_norm.offset_data,
                                     fused_batch_norm.estimated_mean_data),
               context, input, filter, output);
        break;

      case FusedComputationType::kFusedBatchNormWithRelu:
        OP_REQUIRES_OK(context,
                       InitFusedBatchNormArgs(context, fusion_args.epsilon,
                                              &fused_batch_norm));
        conv2d(WithFusedBatchNormAndRelu<T>(
                   fusion_args.epsilon, fused_batch_norm.scaling_factor.data(),
                   fused_batch_norm.offset_data,
                   fused_batch_norm.estimated_mean_data),
               context, input, filter, output);
        break;
    }
  }

 private:
  struct BiasAddArgs {
    const T* bias_add_data = nullptr;
  };

  struct FusedBatchNormArgs {
    const T* scale_data = nullptr;
    const T* offset_data = nullptr;
    const T* estimated_mean_data = nullptr;
    const T* estimated_variance_data = nullptr;

    // Precomputed expression:
    //   scaling_factor = (estimated_variance + epsilon).rsqrt() * scale
    Eigen::Tensor<T, 1, Eigen::RowMajor> scaling_factor;
  };

#define TF_REQUIRES(EXP, STATUS) \
  if (!TF_PREDICT_TRUE(EXP)) return (STATUS)

  void InitDataPtr(const Tensor& tensor, const T** ptr) const {
    *ptr = reinterpret_cast<const T*>(tensor.tensor_data().data());
  }

  Status InitBiasAddArgs(OpKernelContext* context, BiasAddArgs* args) const {
    // Bias of the following dimensions: [ output_depth ]
    const Tensor& bias = context->input(2);

    TF_REQUIRES(bias.dims() == 1,
                errors::InvalidArgument("bias must be 1-dimensional",
                                        bias.shape().DebugString()));

    InitDataPtr(bias, &args->bias_add_data);

    return Status::OK();
  }

  Status InitFusedBatchNormArgs(OpKernelContext* context, float epsilon,
                                FusedBatchNormArgs* args) const {
    const Tensor& scale = context->input(2);
    const Tensor& offset = context->input(3);
    const Tensor& estimated_mean = context->input(4);
    const Tensor& estimated_variance = context->input(5);

    TF_REQUIRES(scale.dims() == 1,
                errors::InvalidArgument("scale must be 1-dimensional",
                                        scale.shape().DebugString()));
    TF_REQUIRES(offset.dims() == 1,
                errors::InvalidArgument("offset must be 1-dimensional",
                                        offset.shape().DebugString()));
    TF_REQUIRES(estimated_mean.dims() == 1,
                errors::InvalidArgument("estimated_mean must be 1-dimensional",
                                        estimated_mean.shape().DebugString()));
    TF_REQUIRES(
        estimated_variance.dims() == 1,
        errors::InvalidArgument("estimated_variance must be 1-dimensional",
                                estimated_variance.shape().DebugString()));

    InitDataPtr(scale, &args->scale_data);
    InitDataPtr(offset, &args->offset_data);
    InitDataPtr(estimated_mean, &args->estimated_mean_data);
    InitDataPtr(estimated_variance, &args->estimated_variance_data);

    // Precompute scaling factor once for all output blocks (kernels).
    args->scaling_factor =
        (estimated_variance.flat<T>() + static_cast<T>(epsilon)).rsqrt() *
        scale.flat<T>();

    return Status::OK();
  }

#undef TF_REQUIRES
};

#if GOOGLE_CUDA

// Encapsulates the default shape information that is used by the convolution
// operation, and adds an activation mode for the fusion.
class FusedConvParameters : public ConvParameters {
 public:
  FusedConvParameters(const ConvParameters& base,
                      const se::dnn::ActivationMode activation_mode)
      : ConvParameters(base), activation_mode_(activation_mode) {}

  string ToString() const {
    return absl::StrCat(ConvParameters::ToString(), ", ", activation_mode_);
  }

 private:
  friend bool operator==(const FusedConvParameters& lhs,
                         const FusedConvParameters& rhs);

  using ParameterDataType =
      std::tuple<ConvParameters::ParameterDataType, se::dnn::ActivationMode>;

  ParameterDataType get_data_as_tuple() const {
    return std::make_tuple(ConvParameters::get_data_as_tuple(),
                           activation_mode_);
  }

  se::dnn::ActivationMode activation_mode_;
};

inline bool operator==(const FusedConvParameters& lhs,
                       const FusedConvParameters& rhs) {
  return lhs.get_data_as_tuple() == rhs.get_data_as_tuple();
}

inline bool operator!=(const FusedConvParameters& lhs,
                       const FusedConvParameters& rhs) {
  return !(lhs == rhs);
}

// A dummy type to group forward convolution autotune results together.
struct FusedConvAutoTuneGroup {
  static string name() { return "FusedConv"; }
};

using AutoTuneFusedConv =
    AutoTuneSingleton<FusedConvAutoTuneGroup, FusedConvParameters,
                      se::dnn::AlgorithmConfig>;

inline int64 ConvolveScratchSize() {
  static int64 convolve_scratch_size = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
  );
  return convolve_scratch_size;
}

// Finds the best convolution algorithm for the given ConvLaunch (CUDA
// convolution on the stream) and parameters, by running all possible
// algorithms and measuring execution time.
// TODO(ezhulenev): Move it to conv_ops_gpu.h and share with conv_ops.cc.
template <typename T, typename ConvLaunch, typename LogFunc>
Status FindBestConvolveAlgorithm(const FusedConvParameters& params,
                                 const ConvLaunch launch,
                                 OpKernelContext* context, se::Stream* stream,
                                 const LogFunc& log,
                                 se::dnn::AlgorithmConfig* algorithm_config) {
  // Check if we already have an algorithm selected for the given parameters.
  if (AutoTuneFusedConv::GetInstance()->Find(params, algorithm_config)) {
    return Status::OK();
  }

  // Find all candidate algorithms.
  std::vector<se::dnn::AlgorithmDesc> algorithms;
  if (!stream->parent()->GetConvolveAlgorithms(
          params.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()),
          &algorithms)) {
    return errors::Unknown(
        "Failed to get convolution algorithm. This is probably "
        "because cuDNN failed to initialize, so try looking to "
        "see if a warning log message was printed above.");
  }

  std::vector<tensorflow::AutotuneResult> results;
  for (auto profile_algorithm : algorithms) {
    DnnScratchAllocator scratch_allocator(ConvolveScratchSize(), context);
    se::dnn::ProfileResult profile_result;

    bool cudnn_launch_status =
        launch(se::dnn::AlgorithmConfig(profile_algorithm), &scratch_allocator,
               &profile_result);

    if (cudnn_launch_status && profile_result.is_valid()) {
      results.emplace_back();
      auto& result = results.back();
      result.mutable_conv()->set_algorithm(profile_algorithm.algo_id());
      result.mutable_conv()->set_tensor_ops_enabled(
          profile_algorithm.tensor_ops_enabled());
      result.mutable_success()->set_scratch_bytes(
          scratch_allocator.TotalByteSize());
      *result.mutable_success()->mutable_run_time() =
          proto_utils::ToDurationProto(
              absl::Milliseconds(profile_result.elapsed_time_in_ms()));
    }
  }
  // Only log on an AutoTuneFusedConv cache miss.
  log(results);
  TF_RETURN_IF_ERROR(BestCudnnConvAlgorithm(results, algorithm_config));
  AutoTuneFusedConv::GetInstance()->Insert(params, *algorithm_config);
  return Status::OK();
}

template <typename T>
struct LaunchFusedConv2DOp<GPUDevice, T> {
  void operator()(OpKernelContext* context, bool use_cudnn,
                  bool cudnn_use_autotune, const Tensor& input_param,
                  const Tensor& filter, FusedComputationType fusion,
                  const FusedComputationArgs& fusion_args,
                  const Conv2DParameters& params,
                  const Conv2DDimensions& dimensions, Tensor* output) {
    OP_REQUIRES(
        context,
        params.data_format == FORMAT_NHWC || params.data_format == FORMAT_NCHW,
        errors::Unimplemented("Fused conv implementation only supports "
                              "NHWC and NCHW tensor formats for now."));

    auto* stream = context->op_device_context()->stream();
    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
    OP_REQUIRES(
        context, use_cudnn,
        errors::Unimplemented("FusedConv2D for GPU is not currently supported "
                              "without cudnn"));

    OP_REQUIRES(
        context, fusion == FusedComputationType::kBiasAddWithRelu,
        errors::Unimplemented("FusedConv2D implementation only supports "
                              "fusing with `BiasAdd + Relu` for now."));

    Tensor input = input_param;

    const int64 in_batch = GetTensorDim(input, params.data_format, 'N');
    int64 in_rows = GetTensorDim(input, params.data_format, 'H');
    int64 in_cols = GetTensorDim(input, params.data_format, 'W');
    const int64 in_depths = GetTensorDim(input, params.data_format, 'C');

    const int64 patch_rows = filter.dim_size(0);
    const int64 patch_cols = filter.dim_size(1);
    const int64 patch_depths = filter.dim_size(2);

    int64 padding_rows = 0;
    int64 padding_cols = 0;
    const int64 out_batch = GetTensorDim(*output, params.data_format, 'N');
    const int64 out_rows = GetTensorDim(*output, params.data_format, 'H');
    const int64 out_cols = GetTensorDim(*output, params.data_format, 'W');
    const int64 out_depths = GetTensorDim(*output, params.data_format, 'C');

    // Bias of the following dimensions: [ output_depth ]
    const Tensor& bias = context->input(2);
    OP_REQUIRES(context, bias.dims() == 1,
                errors::InvalidArgument("bias must be 1-dimensional",
                                        bias.shape().DebugString()));
    OP_REQUIRES(context, bias.dim_size(0) == out_depths,
                errors::InvalidArgument("bias depth must be equal to out depth",
                                        bias.shape().DebugString()));

    if (params.padding == SAME) {
      // Total padding on rows and cols is
      //   Pr = (R' - 1) * S + (Kr - 1) * Dr + 1 - R
      //   Pc = (C' - 1) * S + (Kc - 1) * Dc + 1 - C
      // where (R', C') are output dimensions, (R, C) are input dimensions, S
      // is stride, (Dr, Dc) are dilations, (Kr, Kc) are filter dimensions.
      // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
      // and Pc - Pc/2 on the bottom. When Pr or Pc is odd, this means
      // we pad more on the right and bottom than on the top and left.
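      //
      // For example, with R = 5, Kr = 3, S = 2 and Dr = 1: R' = 3, so
      // Pr = (3 - 1) * 2 + (3 - 1) * 1 + 1 - 5 = 2, i.e. one padded row on
      // each side.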
      padding_rows = std::max<int>(
          0, (out_rows - 1) * dimensions.stride_rows +
                 (patch_rows - 1) * dimensions.dilation_rows + 1 - in_rows);
      padding_cols = std::max<int>(
          0, (out_cols - 1) * dimensions.stride_cols +
                 (patch_cols - 1) * dimensions.dilation_cols + 1 - in_cols);
      const bool rows_odd = (padding_rows % 2 != 0);
      const bool cols_odd = (padding_cols % 2 != 0);
      if (rows_odd || cols_odd) {
        // The convolution descriptor below only supports symmetric padding, so
        // pad the input by one extra row/column on the bottom/right instead.
        Tensor transformed_input;
        int64 new_in_rows = in_rows + rows_odd;
        int64 new_in_cols = in_cols + cols_odd;
        OP_REQUIRES_OK(context,
                       context->allocate_temp(
                           DataTypeToEnum<T>::value,
                           ShapeFromFormat(params.data_format, in_batch,
                                           new_in_rows, new_in_cols, in_depths),
                           &transformed_input));

        functor::PadInput<GPUDevice, T, int, 4>()(
            context->eigen_device<GPUDevice>(),
            To32Bit(input_param.tensor<T, 4>()), {{0, 0}},
            {{rows_odd, cols_odd}}, To32Bit(transformed_input.tensor<T, 4>()),
            params.data_format);

        input = transformed_input;
        in_rows = new_in_rows;
        in_cols = new_in_cols;
      }
    }

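    // The cuDNN fused convolution call below is configured for the NCHW layout
    // (kBatchDepthYX), so for NHWC data we transpose the input into a
    // temporary NCHW tensor here and transpose the result back after the
    // launch.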
    if (params.data_format == FORMAT_NHWC) {
      // Convert the input tensor from NHWC to NCHW.
      TensorShape nchw_shape =
          ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths);
      if (in_depths > 1) {
        Tensor transformed_input;
        OP_REQUIRES_OK(context,
                       context->allocate_temp(DataTypeToEnum<T>::value,
                                              nchw_shape, &transformed_input));
        functor::NHWCToNCHW<GPUDevice, T, 4>()(
            context->eigen_device<GPUDevice>(),
            const_cast<const Tensor&>(input).tensor<T, 4>(),
            transformed_input.tensor<T, 4>());
        input = transformed_input;
      } else {
        // If depth <= 1, then just reshape.
        CHECK(input.CopyFrom(input, nchw_shape));  // Crash OK
      }
    }

    CHECK(padding_rows >= 0) << "Negative padding rows";  // Crash OK
    CHECK(padding_cols >= 0) << "Negative padding cols";  // Crash OK

    se::dnn::ActivationMode dnn_activation_mode;
    switch (fusion) {
      case FusedComputationType::kBiasAddWithRelu:
        dnn_activation_mode = se::dnn::ActivationMode::kRelu;
        break;
      default:
        LOG(FATAL) << "Unsupported fusion type";  // Crash OK
    }

    se::dnn::BatchDescriptor input_desc;
    input_desc.set_count(in_batch)
        .set_feature_map_count(in_depths)
        .set_height(in_rows)
        .set_width(in_cols)
        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
    se::dnn::FilterDescriptor filter_desc;
    filter_desc.set_input_filter_height(patch_rows)
        .set_input_filter_width(patch_cols)
        .set_input_feature_map_count(patch_depths)
        .set_output_feature_map_count(filter.dim_size(3));
    se::dnn::BatchDescriptor bias_desc;
    bias_desc.set_count(1)
        .set_height(1)
        .set_width(1)
        .set_feature_map_count(out_depths)
        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
    se::dnn::ConvolutionDescriptor conv_desc;
    conv_desc.set_vertical_dilation_rate(dimensions.dilation_rows)
        .set_horizontal_dilation_rate(dimensions.dilation_cols)
        .set_vertical_filter_stride(dimensions.stride_rows)
        .set_horizontal_filter_stride(dimensions.stride_cols)
        .set_zero_padding_height(padding_rows / 2)
        .set_zero_padding_width(padding_cols / 2)
        .set_group_count(in_depths / patch_depths);
    se::dnn::BatchDescriptor output_desc;
    output_desc.set_count(out_batch)
        .set_height(out_rows)
        .set_width(out_cols)
        .set_feature_map_count(out_depths)
        .set_layout(se::dnn::DataLayout::kBatchDepthYX);

    // The filter descriptor above is OIHW, so transpose the TensorFlow HWIO
    // filter into a temporary OIHW tensor.
    Tensor transformed_filter;
    OP_REQUIRES_OK(context,
                   context->allocate_temp(
                       DataTypeToEnum<T>::value,
                       TensorShape({filter.dim_size(3), filter.dim_size(2),
                                    filter.dim_size(0), filter.dim_size(1)}),
                       &transformed_filter));
    functor::TransformFilter<GPUDevice, T, int, 4>()(
        context->eigen_device<GPUDevice>(), FORMAT_OIHW,
        To32Bit(filter.tensor<T, 4>()),
        To32Bit(transformed_filter.tensor<T, 4>()));

    Tensor transformed_output;
    if (params.data_format == FORMAT_NHWC) {
      // Only allocate temporary memory when a layout transformation is needed.
      OP_REQUIRES_OK(context,
                     context->allocate_temp(
                         DataTypeToEnum<T>::value,
                         ShapeFromFormat(FORMAT_NCHW, out_batch, out_rows,
                                         out_cols, out_depths),
                         &transformed_output));
    } else {
      transformed_output = *output;
    }

    const auto tensor_on_device = [](const Tensor& t) -> se::DeviceMemory<T> {
      return AsDeviceMemory(t.template flat<T>().data(),
                            t.template flat<T>().size());
    };

    se::DeviceMemory<T> input_ptr = tensor_on_device(input);
    se::DeviceMemory<T> filter_ptr = tensor_on_device(transformed_filter);
    se::DeviceMemory<T> bias_ptr = tensor_on_device(bias);
    se::DeviceMemory<T> output_ptr = tensor_on_device(transformed_output);

    // We do not use side inputs, so we can safely pass nullptr.
    se::DeviceMemory<T> side_input_ptr =
        AsDeviceMemory(static_cast<T*>(nullptr), 0);

    int device_id = stream->parent()->device_ordinal();
    DataType dtype = input.dtype();
    FusedConvParameters conv_parameters = {
        {
            in_batch,                      // batch
            in_depths,                     // in_depths
            {{in_rows,                     // in_rows
              in_cols}},                   // in_cols
            FORMAT_NCHW,                   // compute_data_format
            out_depths,                    // out_depths
            {{patch_rows,                  // filter_rows
              patch_cols,                  // filter_cols
              patch_depths}},              // filter_depths
            {{dimensions.dilation_rows,    // dilation_rows
              dimensions.dilation_cols}},  // dilation_cols
            {{dimensions.stride_rows,      // stride_rows
              dimensions.stride_cols}},    // stride_cols
            {{padding_rows,                // padding_rows
              padding_cols}},              // padding_cols
            dtype,                         // tensor datatype
            device_id,                     // device_id
        },
        dnn_activation_mode  // activation_mode
    };

    // Launches the fused convolution with the given parameters and scratch
    // allocator. Records the profile result into `profile_result` if it's not
    // nullptr. The same callable is used both to profile candidate algorithms
    // during autotuning and to run the final convolution below.
    const auto launch = [&](se::dnn::AlgorithmConfig algorithm_config,
                            DnnScratchAllocator* scratch_allocator,
                            se::dnn::ProfileResult* profile_result) -> bool {
      return stream
          ->ThenFusedConvolveWithAlgorithm(
              input_desc, input_ptr,                     // input
              /*conv_input_scale=*/1.0,                  // input_scale
              filter_desc, filter_ptr,                   // filter
              conv_desc,                                 // conv
              side_input_ptr, /*side_input_scale=*/0.0,  // side_input
              bias_desc, bias_ptr,                       // bias
              dnn_activation_mode,                       // activation
              output_desc, &output_ptr,                  // output
              scratch_allocator, algorithm_config, profile_result)
          .ok();
    };

    se::dnn::AlgorithmConfig algorithm_config;
    if (cudnn_use_autotune) {
      auto status = FindBestConvolveAlgorithm<T>(
          conv_parameters, launch, context, stream,
          [&](absl::Span<const tensorflow::AutotuneResult> results) {
            LogFusedConvAutotuneResults(
                context->op_kernel().def(), input, transformed_filter,
                transformed_output, bias, nullptr, stream->parent(), results);
          },
          &algorithm_config);
      OP_REQUIRES_OK(context, status);
    }

    DnnScratchAllocator scratch_allocator(ConvolveScratchSize(), context);
    bool cudnn_launch_status = launch(algorithm_config, &scratch_allocator,
                                      /*profile_result=*/nullptr);
    OP_REQUIRES(
        context, cudnn_launch_status,
        errors::Internal(absl::Substitute(
            "cuDNN launch failure: input shape($0) filter shape($1)",
            input.shape().DebugString(), filter.shape().DebugString())));

    // Convert the output tensor back from NCHW to NHWC.
    if (params.data_format == FORMAT_NHWC) {
      functor::NCHWToNHWC<GPUDevice, T, 4>()(
          context->eigen_device<GPUDevice>(),
          const_cast<const Tensor&>(transformed_output).tensor<T, 4>(),
          output->tensor<T, 4>());
    }
  }
};

#endif  // GOOGLE_CUDA

template <typename Device, typename T>
class FusedConv2DOp : public OpKernel {
 public:
  explicit FusedConv2DOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, InitConv2DParameters(context, &params_));

    OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
    use_cudnn_ &= CanUseCudnn();
    cudnn_use_autotune_ = CudnnUseAutotune();

    // 'fused_ops' and 'num_args' attributes are specified by the Grappler
    // Remapper optimizer (see grappler/optimizers/remapper.cc).

    std::vector<string> fused_ops;
    OP_REQUIRES_OK(context, context->GetAttr("fused_ops", &fused_ops));
    OP_REQUIRES(context, !fused_ops.empty(),
                errors::InvalidArgument(
                    "Fused Conv2D must have at least one fused op."));

    int num_args;
    OP_REQUIRES_OK(context, context->GetAttr("num_args", &num_args));

    // TODO(ezhulenev): Add support for fusing element-wise op chains defined
    // at runtime, e.g. Relu+Sqrt+Tanh+etc.

    // Match the combination of fused ops to one of the supported fusions.
    if (FusedOpsMatchAndSupportedOnDevice(fused_ops, {"BiasAdd"},
                                          /*cpu_only=*/true)) {
      fused_computation_ = FusedComputationType::kBiasAdd;
    } else if (FusedOpsMatchAndSupportedOnDevice(fused_ops, {"BiasAdd", "Relu"},
                                                 /*cpu_only=*/false)) {
      fused_computation_ = FusedComputationType::kBiasAddWithRelu;
    } else if (FusedOpsMatchAndSupportedOnDevice(fused_ops, {"FusedBatchNorm"},
                                                 /*cpu_only=*/true)) {
      fused_computation_ = FusedComputationType::kFusedBatchNorm;
    } else if (FusedOpsMatchAndSupportedOnDevice(fused_ops,
                                                 {"FusedBatchNorm", "Relu"},
                                                 /*cpu_only=*/true)) {
      fused_computation_ = FusedComputationType::kFusedBatchNormWithRelu;
    } else {
      OP_REQUIRES(context, false,
                  errors::Unimplemented("Fusion is not implemented: [",
                                        absl::StrJoin(fused_ops, ","), "]"));
    }

    // Depending on the picked fusion type, validate fusion-specific arguments.

    if (fused_computation_ == FusedComputationType::kBiasAdd ||
        fused_computation_ == FusedComputationType::kBiasAddWithRelu) {
      OP_REQUIRES(context, num_args == 1,
                  errors::InvalidArgument(
                      "Fused Conv2D must have one extra argument: bias."));
    }

    if (fused_computation_ == FusedComputationType::kFusedBatchNorm ||
        fused_computation_ == FusedComputationType::kFusedBatchNormWithRelu) {
      OP_REQUIRES(
          context, num_args == 4,
          errors::InvalidArgument("Fused FusedBatchNorm must have four extra "
                                  "arguments: scale, offset, mean, variance."));
      OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon_));
    }
  }

  void Compute(OpKernelContext* context) override {
    // Input tensor is of the following dimensions:
    // [ batch, in_rows, in_cols, in_depth ]
    const Tensor& input = context->input(0);

    // Input filter is of the following dimensions:
    // [ filter_rows, filter_cols, in_depth, out_depth ]
    const Tensor& filter = context->input(1);

    Conv2DDimensions dimensions;
    OP_REQUIRES_OK(context,
                   ComputeConv2DDimension(params_, input, filter, &dimensions));

    TensorShape out_shape = ShapeFromFormat(
        params_.data_format, dimensions.batch, dimensions.out_rows,
        dimensions.out_cols, dimensions.out_depth);

    // Output tensor is of the following dimensions:
    // [ in_batch, out_rows, out_cols, out_depth ]
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    VLOG(2) << "FusedConv2D: in_depth = " << dimensions.in_depth
            << ", patch_depth = " << dimensions.patch_depth
            << ", input_cols = " << dimensions.input_cols
            << ", filter_cols = " << dimensions.filter_cols
            << ", input_rows = " << dimensions.input_rows
            << ", filter_rows = " << dimensions.filter_rows
            << ", stride_rows = " << dimensions.stride_rows
            << ", stride_cols = " << dimensions.stride_cols
            << ", dilation_rows = " << dimensions.dilation_rows
            << ", dilation_cols = " << dimensions.dilation_cols
            << ", out_depth = " << dimensions.out_depth;

    // If there is nothing to compute, return.
    if (out_shape.num_elements() == 0) {
      return;
    }

    FusedComputationArgs args;
    args.epsilon = epsilon_;

    LaunchFusedConv2DOp<Device, T>()(context, use_cudnn_, cudnn_use_autotune_,
                                     input, filter, fused_computation_, args,
                                     params_, dimensions, output);
  }

 private:
  bool FusedOpsMatchAndSupportedOnDevice(const std::vector<string>& fused_ops,
                                         const std::vector<string>& expected,
                                         bool cpu_only) const {
    if (std::is_same<Device, GPUDevice>::value && cpu_only) {
      return false;
    }
    return fused_ops == expected;
  }

  Conv2DParameters params_;
  bool use_cudnn_;
  bool cudnn_use_autotune_;

  FusedComputationType fused_computation_;

  float epsilon_;  // Used only in FusedBatchNorm fusion

  TF_DISALLOW_COPY_AND_ASSIGN(FusedConv2DOp);
};

// Registration of the CPU implementations.
#define REGISTER_FUSED_CPU_CONV2D(T)                                  \
  REGISTER_KERNEL_BUILDER(                                            \
      Name("_FusedConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      FusedConv2DOp<CPUDevice, T>);

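// NOTE: The registration macros are only defined here; they are expected to be
// instantiated with concrete types from the .cc files that include this
// header (not shown in this file).
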
#if GOOGLE_CUDA

// Forward declarations of the functor specializations defined in the
// corresponding GPU (.cu.cc) translation units.
#define DECLARE_FUNCTOR_GPU_SPEC(T)                                      \
  template <>                                                            \
  void TransformFilter<GPUDevice, T, int, 4>::operator()(                \
      const GPUDevice& d, FilterTensorFormat dst_filter_format,          \
      typename TTypes<T, 4, int>::ConstTensor in,                        \
      typename TTypes<T, 4, int>::Tensor out);                           \
  extern template struct TransformFilter<GPUDevice, T, int, 4>;          \
  template <>                                                            \
  void PadInput<GPUDevice, T, int, 4>::operator()(                       \
      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in,    \
      const std::array<int, 2>& padding_left,                            \
      const std::array<int, 2>& padding_right,                           \
      typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \
  extern template struct PadInput<GPUDevice, T, int, 4>

// Registration of the GPU implementations.
#define REGISTER_FUSED_GPU_CONV2D(T)                                  \
  REGISTER_KERNEL_BUILDER(                                            \
      Name("_FusedConv2D").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
      FusedConv2DOp<GPUDevice, T>);

#endif  // GOOGLE_CUDA

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_