! 
!     Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
!
! NVIDIA CORPORATION and its licensors retain all intellectual property
! and proprietary rights in and to this software, related documentation
! and any modifications thereto.  Any use, reproduction, disclosure or
! distribution of this software and related documentation without an express
! license agreement from NVIDIA CORPORATION is strictly prohibited.
! 

!          THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT
!   WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT
!   NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR
!   FITNESS FOR A PARTICULAR PURPOSE.
!
! Integer kind parameters for CUDA event, count and stream values.
! All three are defined as the kind returned by the INT_PTR_KIND()
! compiler-extension intrinsic.
integer, parameter :: CUDA_EVENT_KIND  = int_ptr_kind(), &
                      CUDA_COUNT_KIND  = int_ptr_kind(), &
                      CUDA_STREAM_KIND = int_ptr_kind()

! Wrapper type whose only component, aptr, is a C pointer.
type :: cudaArrayPtr
  type(c_ptr) :: aptr
end type cudaArrayPtr

! Device properties record, used for CUDA 11.x and 12.x.
!
! The component order and sizes are meant to track the CUDA runtime's C
! struct cudaDeviceProp member for member: C size_t members are declared
! with INTEGER(KIND=INT_PTR_KIND()), C int / unsigned int members with
! INTEGER(KIND=4), and C arrays with the matching extents.
! NOTE(review): presumably this record is filled in directly by the C
! runtime, so any extra, missing or reordered component silently shifts
! every later component -- confirm against the wrapper that passes it.
!
! Note, in 11.0, persistingL2CacheMaxSize was added in the middle of the
! struct (directly after l2CacheSize).

TYPE cudaDeviceProp
  CHARACTER*256                :: name
  CHARACTER*16                 :: uuid
  CHARACTER*8                  :: luid
  INTEGER(KIND=4)              :: luidDeviceNodeMask
  INTEGER(KIND=INT_PTR_KIND()) :: totalGlobalMem
  INTEGER(KIND=INT_PTR_KIND()) :: sharedMemPerBlock
  INTEGER(KIND=4)              :: regsPerBlock
  INTEGER(KIND=4)              :: warpSize
  INTEGER(KIND=INT_PTR_KIND()) :: memPitch
  INTEGER(KIND=4)              :: maxThreadsPerBlock
  INTEGER(KIND=4)              :: maxThreadsDim(3)
  INTEGER(KIND=4)              :: maxGridSize(3)
  INTEGER(KIND=4)              :: clockRate
  INTEGER(KIND=INT_PTR_KIND()) :: totalConstMem
  INTEGER(KIND=4)              :: major
  INTEGER(KIND=4)              :: minor
  INTEGER(KIND=INT_PTR_KIND()) :: textureAlignment
  INTEGER(KIND=INT_PTR_KIND()) :: texturePitchAlignment
  INTEGER(KIND=4)              :: deviceOverlap
  INTEGER(KIND=4)              :: multiProcessorCount
  INTEGER(KIND=4)              :: kernelExecTimeoutEnabled
  INTEGER(KIND=4)              :: integrated
  INTEGER(KIND=4)              :: canMapHostMemory
  INTEGER(KIND=4)              :: computeMode
  ! Texture limits
  INTEGER(KIND=4)              :: maxTexture1D
  INTEGER(KIND=4)              :: maxTexture1DMipmap
  INTEGER(KIND=4)              :: maxTexture1DLinear
  INTEGER(KIND=4)              :: maxTexture2D(2)
  INTEGER(KIND=4)              :: maxTexture2DMipmap(2)
  INTEGER(KIND=4)              :: maxTexture2DLinear(3)
  INTEGER(KIND=4)              :: maxTexture2DGather(2)
  INTEGER(KIND=4)              :: maxTexture3D(3)
  INTEGER(KIND=4)              :: maxTexture3DAlt(3)
  INTEGER(KIND=4)              :: maxTextureCubemap
  INTEGER(KIND=4)              :: maxTexture1DLayered(2)
  INTEGER(KIND=4)              :: maxTexture2DLayered(3)
  INTEGER(KIND=4)              :: maxTextureCubemapLayered(2)
  ! Surface limits
  INTEGER(KIND=4)              :: maxSurface1D
  INTEGER(KIND=4)              :: maxSurface2D(2)
  INTEGER(KIND=4)              :: maxSurface3D(3)
  INTEGER(KIND=4)              :: maxSurface1DLayered(2)
  INTEGER(KIND=4)              :: maxSurface2DLayered(3)
  INTEGER(KIND=4)              :: maxSurfaceCubemap
  INTEGER(KIND=4)              :: maxSurfaceCubemapLayered(2)
  INTEGER(KIND=INT_PTR_KIND()) :: surfaceAlignment
  INTEGER(KIND=4)              :: concurrentKernels
  INTEGER(KIND=4)              :: ECCEnabled
  INTEGER(KIND=4)              :: pciBusID
  INTEGER(KIND=4)              :: pciDeviceID
  INTEGER(KIND=4)              :: pciDomainID
  INTEGER(KIND=4)              :: tccDriver
  INTEGER(KIND=4)              :: asyncEngineCount
  INTEGER(KIND=4)              :: unifiedAddressing
  INTEGER(KIND=4)              :: memoryClockRate
  INTEGER(KIND=4)              :: memoryBusWidth
  INTEGER(KIND=4)              :: l2CacheSize
  INTEGER(KIND=4)              :: persistingL2CacheMaxSize
  INTEGER(KIND=4)              :: maxThreadsPerMultiProcessor
  INTEGER(KIND=4)              :: streamPrioritiesSupported
  INTEGER(KIND=4)              :: globalL1CacheSupported
  INTEGER(KIND=4)              :: localL1CacheSupported
  INTEGER(KIND=INT_PTR_KIND()) :: sharedMemPerMultiprocessor
  INTEGER(KIND=4)              :: regsPerMultiprocessor
  INTEGER(KIND=4)              :: managedMemory
  INTEGER(KIND=4)              :: isMultiGpuBoard
  INTEGER(KIND=4)              :: multiGpuBoardGroupID
  INTEGER(KIND=4)              :: hostNativeAtomicSupported
  INTEGER(KIND=4)              :: singleToDoublePrecisionPerfRatio
  INTEGER(KIND=4)              :: pageableMemoryAccess
  INTEGER(KIND=4)              :: concurrentManagedAccess
  INTEGER(KIND=4)              :: computePreemptionSupported
  INTEGER(KIND=4)              :: canUseHostPointerForRegisteredMem
  INTEGER(KIND=4)              :: cooperativeLaunch
  INTEGER(KIND=4)              :: cooperativeMultiDeviceLaunch
  INTEGER(KIND=INT_PTR_KIND()) :: sharedMemPerBlockOptin
  INTEGER(KIND=4)              :: pageableMemoryAccessUsesHostPageTables
  INTEGER(KIND=4)              :: directManagedMemAccessFromHost
  INTEGER(KIND=4)              :: maxBlocksPerMultiProcessor
  INTEGER(KIND=4)              :: accessPolicyMaxWindowSize
  INTEGER(KIND=INT_PTR_KIND()) :: reservedSharedMemPerBlock
  INTEGER(KIND=4)              :: hostRegisterSupported
  INTEGER(KIND=4)              :: sparseCudaArraySupported
  INTEGER(KIND=4)              :: hostRegisterReadOnlySupported
  INTEGER(KIND=4)              :: timelineSemaphoreInteropSupported
  INTEGER(KIND=4)              :: memoryPoolsSupported
  INTEGER(KIND=4)              :: gpuDirectRDMASupported
  ! In the C struct gpuDirectRDMAFlushWritesOptions is followed directly by
  ! gpuDirectRDMAWritesOrdering.  There is no "gpuDirectRDMAWritesOptions"
  ! member; declaring one here would push every component below it 4 bytes
  ! past its C offset.
  INTEGER(KIND=4)              :: gpuDirectRDMAFlushWritesOptions
  INTEGER(KIND=4)              :: gpuDirectRDMAWritesOrdering
  INTEGER(KIND=4)              :: memoryPoolSupportedHandleTypes
  INTEGER(KIND=4)              :: deferredMappingCudaArraySupported
  INTEGER(KIND=4)              :: ipcEventSupported
  INTEGER(KIND=4)              :: clusterLaunch
  INTEGER(KIND=4)              :: unifiedFunctionPointers
  ! Trailing reserved words; covers the C header's reserved arrays
  ! (63 ints in total in CUDA 12).
  INTEGER(KIND=4)              :: reserved(63)
END TYPE cudaDeviceProp

! Wrapper type holding one INT_PTR_KIND() integer component, evt.
type :: cudaEvent
  integer(int_ptr_kind()) :: evt
end type cudaEvent

! Wrapper type whose only component, handle, is a C pointer.
type :: cudaIpcMemHandle
  type(c_ptr) :: handle
end type cudaIpcMemHandle

! Wrapper type whose only component, handle, is a C pointer.
type :: cudaIpcEventHandle
  type(c_ptr) :: handle
end type cudaIpcEventHandle

! A device pointer (ptr) followed by three INT_PTR_KIND() integers:
! pitch, xsize and ysize, in that order.
type :: cudaPitchedPtr
  type(c_devptr)          :: ptr
  integer(int_ptr_kind()) :: pitch, xsize, ysize
end type cudaPitchedPtr

! Four default-kind integers x, y, z, w followed by a C int, f.
! NOTE(review): presumably mirrors the C struct cudaChannelFormatDesc, with
! f carrying the channel format enumerator -- confirm against callers.
type :: cudaChannelFormatDesc
  integer             :: x, y, z, w
  integer(kind=c_int) :: f
end type cudaChannelFormatDesc

! Three INT_PTR_KIND() integers: width, height, depth (in that order).
type :: cudaExtent
  integer(int_ptr_kind()) :: width, height, depth
end type cudaExtent

! Three INT_PTR_KIND() integers: x, y, z (in that order).
type :: cudaPos
  integer(int_ptr_kind()) :: x, y, z
end type cudaPos

! Parameter record made of a source triple (array, position, pitched
! pointer), the matching destination triple, an extent and a C int kind.
type :: cudaMemcpy3DParms
  ! source
  type(cudaArrayPtr)   :: srcArray
  type(cudaPos)        :: srcPos
  type(cudaPitchedPtr) :: srcPtr
  ! destination
  type(cudaArrayPtr)   :: dstArray
  type(cudaPos)        :: dstPos
  type(cudaPitchedPtr) :: dstPtr
  ! extent and kind
  type(cudaExtent)     :: extent
  integer(kind=c_int)  :: kind
end type cudaMemcpy3DParms

! Parameter record like cudaMemcpy3DParms, except that each of the source
! and destination groups ends with a default-integer device component and
! there is no kind component.
type :: cudaMemcpy3DPeerParms
  ! source
  type(cudaArrayPtr)   :: srcArray
  type(cudaPos)        :: srcPos
  type(cudaPitchedPtr) :: srcPtr
  integer              :: srcDevice
  ! destination
  type(cudaArrayPtr)   :: dstArray
  type(cudaPos)        :: dstPos
  type(cudaPitchedPtr) :: dstPtr
  integer              :: dstDevice
  ! extent
  type(cudaExtent)     :: extent
end type cudaMemcpy3DPeerParms

! A C int (type), a default integer (device), then a device pointer
! (devptr) and a C host pointer (hostptr).
type :: cudaPointerAttributes
  integer(kind=c_int) :: type
  integer             :: device
  type(c_devptr)      :: devptr
  type(c_ptr)         :: hostptr
end type cudaPointerAttributes

! Function attribute record: three INT_PTR_KIND() byte counts, thirteen
! default-kind integer scalars, and a 16-element reserved array.
! Component order is unchanged from the one-declaration-per-line form.
type :: cudaFuncAttributes
  integer(int_ptr_kind()) :: sharedSizeBytes, constSizeBytes, localSizeBytes
  integer :: maxThreadsPerBlock, numRegs, ptxVersion, binaryVersion
  integer :: cacheModeCA, maxDynamicSharedSizeBytes, preferredShmemCarveout
  integer :: clusterDimMustBeSet
  integer :: requiredClusterWidth, requiredClusterHeight, requiredClusterDepth
  integer :: clusterSchedulingPolicyPreference, nonPortableClusterSizeAllowed
  integer :: reserved(16)
end type cudaFuncAttributes

! Texture reference record: a default-integer flag (normalized), a C int
! filter mode, three C int address modes, an embedded channel descriptor,
! a default-integer sRGB component and 15 reserved default integers.
type :: cudaTextureReference
  integer                     :: normalized
  integer(kind=c_int)         :: filterMode
  integer(kind=c_int)         :: addressMode(3)
  type(cudaChannelFormatDesc) :: channelDesc
  integer                     :: sRGB
  integer                     :: reserved(15)
end type cudaTextureReference

! Wrapper type whose only component, graph, is a C pointer.
type :: cudaGraph
  type(c_ptr) :: graph
end type cudaGraph

! Wrapper type whose only component, exec, is a C pointer.
type :: cudaGraphExec
  type(c_ptr) :: exec
end type cudaGraphExec

! Wrapper type whose only component, node, is a C pointer.
type :: cudaGraphNode
  type(c_ptr) :: node
end type cudaGraphNode

! Wrapper type whose only component, pool, is a C pointer.
type :: cudaMemPool
  type(c_ptr) :: pool
end type cudaMemPool

! Interoperable (bind(c)) pair of 4-byte integers: type, then id.
type, bind(c) :: cudaMemLocation
  integer(kind=4) :: type, id
end type cudaMemLocation

! Memory pool properties record.  Declared bind(c) in the hope that member
! alignment then comes out right (the companion C compiler's layout rules
! apply to interoperable types).  Components, in order: two 4-byte
! integers, an embedded cudaMemLocation, two 8-byte integers, one 2-byte
! integer and 54 reserved bytes.
type, bind(c) :: cudaMemPoolProps
  integer(kind=4)       :: allocType, handleTypes
  type(cudaMemLocation) :: location
  integer(kind=8)       :: win32SecurityAttributes, maxSize
  integer(kind=2)       :: usage
  integer(kind=1)       :: reserved(54)
end type cudaMemPoolProps

! Launch configuration record: grid and block dimensions, two 8-byte
! integers (dynamicSmemBytes, stream), a default-integer attribute count,
! a 4-byte id and the cluster dimensions.
type :: cudaLaunchConfig
  type(dim3)      :: gridDim, blockDim
  integer(kind=8) :: dynamicSmemBytes, stream
  integer         :: numAttrs
  integer(kind=4) :: id
  type(dim3)      :: clusterDim
end type cudaLaunchConfig
