// ~lzh/A133.git

/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
 
    http://www.apache.org/licenses/LICENSE-2.0
 
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
 
// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
 
#include <functional>
#include <map>
#include <memory>
#include <set>
#include <unordered_map>
#include <utility>

#include "absl/strings/string_view.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
 
namespace stream_executor {
namespace gpu {
 
// CUDA-platform implementation of the platform-agnostic
// StreamExecutorInferface.
class GpuExecutor : public internal::StreamExecutorInterface {
 public:
  // sub_platform indicates the subplatform used in this executor; it must
  // be a CUDA type.
  explicit GpuExecutor(const PluginConfig& plugin_config)
      : device_(0),
        context_(nullptr),
        device_ordinal_(0),
        cc_major_(0),
        cc_minor_(0),
        version_(0),
        plugin_config_(plugin_config) {}
 
  // See the corresponding StreamExecutor methods for method comments on the
  // following overrides.
 
  ~GpuExecutor() override;
 
  port::Status Init(int device_ordinal, DeviceOptions device_options) override;
 
  bool GetKernel(const MultiKernelLoaderSpec& spec,
                 KernelBase* kernel) override;
  // (supported on CUDA only)
  void UnloadKernel(const KernelBase* kernel) override;
  bool LoadModule(const MultiModuleLoaderSpec& spec,
                  ModuleHandle* module_handle) override;
  bool UnloadModule(ModuleHandle module_handle) override;
 
  bool Launch(Stream* stream, const ThreadDim& thread_dims,
              const BlockDim& block_dims, const KernelBase& k,
              const KernelArgsArrayBase& args) override;
 
  // (supported on CUDA only)
  int CalculateOccupancy(const DeviceDescription& device_description,
                         uint64 registers_per_thread,
                         uint64 shared_memory_per_block,
                         const ThreadDim& thread_dims, GpuFunctionHandle func);
 
  // (supported on CUDA only)
  int CompareOccupancy(int* initial_blocks,
                       const DeviceDescription& device_description,
                       uint64 registers_per_thread,
                       uint64 shared_memory_per_block,
                       const ThreadDim& thread_dims, GpuFunctionHandle func);
 
  void* Allocate(uint64 size) override;
 
  void* AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
                          uint64 size_bytes) override;
 
  void Deallocate(DeviceMemoryBase* mem) override;
 
  void* UnifiedMemoryAllocate(uint64 size) override {
    return GpuDriver::UnifiedMemoryAllocate(context_, size);
  }
 
  void UnifiedMemoryDeallocate(void* location) override {
    return GpuDriver::UnifiedMemoryDeallocate(context_, location);
  }
 
  // CUDA allocation/registration functions are necessary because the driver
  // internally sets up buffers for DMA operations (and page locks them).
  // There's no external interface for us to otherwise control these DMA
  // settings.
  void* HostMemoryAllocate(uint64 size) override {
    return GpuDriver::HostAllocate(context_, size);
  }
 
  void HostMemoryDeallocate(void* location) override {
    return GpuDriver::HostDeallocate(context_, location);
  }
 
  bool HostMemoryRegister(void* location, uint64 size) override;
 
  bool HostMemoryUnregister(void* location) override;
 
  bool SynchronizeAllActivity() override;
 
  bool SynchronousMemZero(DeviceMemoryBase* location, uint64 size) override;
 
  bool SynchronousMemSet(DeviceMemoryBase* location, int value,
                         uint64 size) override;
 
  port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                 const void* host_src, uint64 size) override;
 
  port::Status SynchronousMemcpy(void* host_dst,
                                 const DeviceMemoryBase& gpu_src,
                                 uint64 size) override;
 
  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
                                               const DeviceMemoryBase& gpu_src,
                                               uint64 size) override;
 
  bool MemZero(Stream* stream, DeviceMemoryBase* location,
               uint64 size) override;
  bool Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
              uint64 size) override;
  bool Memset32(Stream* stream, DeviceMemoryBase* location, uint32 pattern,
                uint64 size) override;
 
  bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
              uint64 size) override;
 
  bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
              uint64 size) override;
 
  bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
                            const DeviceMemoryBase& gpu_src,
                            uint64 size) override;
 
  bool HostCallback(Stream* stream,
                    std::function<port::Status()> callback) override;
 
  bool AllocateStream(Stream* stream) override;
 
  void DeallocateStream(Stream* stream) override;
 
  bool CreateStreamDependency(Stream* dependent, Stream* other) override;
 
  bool AllocateTimer(Timer* timer) override;
 
  void DeallocateTimer(Timer* timer) override;
 
  bool StartTimer(Stream* stream, Timer* timer) override;
 
  bool StopTimer(Stream* stream, Timer* timer) override;
 
  port::Status AllocateEvent(Event* event) override;
 
  port::Status DeallocateEvent(Event* event) override;
 
  port::Status RecordEvent(Stream* stream, Event* event) override;
 
  port::Status WaitForEvent(Stream* stream, Event* event) override;
 
  Event::Status PollForEventStatus(Event* event) override;
 
  port::Status BlockHostUntilDone(Stream* stream) override;
 
  int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }
 
  port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;
 
  bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;
 
  SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
 
  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
 
  bool DeviceMemoryUsage(int64* free, int64* total) const override;
 
  // Search for the symbol and returns a device pointer and size.
  // Returns false if symbol does not exist.
  bool GetSymbol(const string& symbol_name, ModuleHandle module_handle,
                 void** mem, size_t* bytes) override;
 
  DeviceDescription* PopulateDeviceDescription() const override;
 
  // Populates the block_dim_limit by querying the device driver API. If an
  // error occurs at any point while asking the driver for block dim limits, it
  // will be only partially populated as a result, and an error will be logged.
  bool FillBlockDimLimit(BlockDim* block_dim_limit) const;
 
  bool SupportsBlas() const override;
 
  blas::BlasSupport* CreateBlas() override;
 
  bool SupportsFft() const override;
 
  fft::FftSupport* CreateFft() override;
 
  bool SupportsRng() const override;
 
  rng::RngSupport* CreateRng() override;
 
  bool SupportsDnn() const override;
 
  dnn::DnnSupport* CreateDnn() override;
 
  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
      override;
 
  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
      override;
 
  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
 
  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;
 
  void* GpuContextHack() override;
 
  GpuContext* gpu_context();
 
 private:
  // Attempts to find a more specific version of the file indicated by
  // filename by looking for compute-capability-specific suffixed versions; i.e.
  // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
  // we're on a compute capability 3.0 machine.
  // (supported on CUDA only)
  bool FindOnDiskForComputeCapability(absl::string_view filename,
                                      absl::string_view canonical_suffix,
                                      string* found_filename) const;
 
  // Attempts to find a more specific version of the file indicated by
  // filename by looking for AMDGPU ISA-specific suffixed versions.
  // (supported on ROCm only)
 
  bool FindOnDiskForISAVersion(absl::string_view filename,
                               absl::string_view canonical_suffix,
                               string* found_filename) const;
 
  // Host callback landing routine invoked by CUDA.
  // data: User-provided callback provided to HostCallback() above, captured
  //       as a std::function<void()>. Allocated/initialized inside
  //       HostCallback() and owned and deleted by this call.
  static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
                                   void* data);
 
  // Collects metadata for the specified kernel.
  bool GetKernelMetadata(GpuKernel* cuda_kernel,
                         KernelMetadata* kernel_metadata);
 
  // Prints to VLOG(2) information about the kernel's occupancy and how it might
  // be improved.
  void VlogOccupancyInfo(const KernelBase& kernel, const ThreadDim& thread_dims,
                         const BlockDim& block_dims);
 
  // (supported on CUDA only)
  bool LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
 
  // Loads the PTX text `ptx` as a CUDA module.  `ptx` must be null terminated.
  // (supported on CUDA only)
  bool LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
 
  // (supported on ROCm only)
  bool LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
 
  bool UnloadGpuBinary(const void* gpu_binary)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
 
  // Guards the on-disk-module mapping.
  mutex disk_modules_mu_;
 
  // Mapping from filename to GPUModuleHandle, if it was already retrieved.
  // Multiple GPUFunctionHandle are usually obtained from a single
  // GPUModuleHandle so we attempt to hit in this mapping first, before
  // retrieving it.
  std::map<string, GpuModuleHandle> disk_modules_ GUARDED_BY(disk_modules_mu_);
 
  // Guards the in-memory-module mapping.
  mutex in_memory_modules_mu_;
 
  std::map<const char*, GpuModuleHandle> in_memory_modules_
      GUARDED_BY(in_memory_modules_mu_);
 
  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
  std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
      GUARDED_BY(in_memory_modules_mu_);
  // GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
  std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64>>
      gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
 
  // Guards the launched kernel set.
  mutex launched_kernels_mu_;
 
  // Keeps track of the set of launched kernels. Currently used to suppress the
  // occupancy check on subsequent launches.
  std::set<GpuFunctionHandle> launched_kernels_
      GUARDED_BY(launched_kernels_mu_);
 
  // Handle for the CUDA device being operated on. Immutable
  // post-initialization.
  GpuDeviceHandle device_;
 
  // Handle for session with the library/driver. Immutable post-initialization.
  GpuContext* context_;
 
  // The device ordinal value that this executor was initialized with; recorded
  // for use in getting device metadata. Immutable post-initialization.
  int device_ordinal_;
 
  // The major verion of the compute capability for device_.
  int cc_major_;
 
  // The minor verion of the compute capability for device_.
  int cc_minor_;
 
  // GPU ISA version for device_.
  int version_;
 
  // The plugin configuration associated with this instance.
  PluginConfig plugin_config_;
 
  SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
};
 
}  // namespace gpu
}  // namespace stream_executor
 
#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_