PLSSVM - Parallel Least Squares Support Vector Machine 2.0.0
A Least Squares Support Vector Machine implementation using different backends.
svm_kernel.hip.hpp
#ifndef PLSSVM_BACKENDS_HIP_SVM_KERNEL_HPP_
#define PLSSVM_BACKENDS_HIP_SVM_KERNEL_HPP_
#pragma once

#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"

#include "plssvm/constants.hpp"  // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE (compile-time caching constants; may be changed during the CMake configuration)

namespace plssvm::hip {
/// Calculates the C-SVM kernel using the linear kernel function.
template <typename real_type>
__global__ void device_kernel_linear(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) {
    kernel_index_type i = blockIdx.x * blockDim.x * INTERNAL_BLOCK_SIZE;
    kernel_index_type j = blockIdx.y * blockDim.y * INTERNAL_BLOCK_SIZE;

    __shared__ real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
    __shared__ real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
    real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
    real_type data_j[INTERNAL_BLOCK_SIZE];

    if (i >= j) {
        i += threadIdx.x * INTERNAL_BLOCK_SIZE;
        const kernel_index_type ji = j + threadIdx.x * INTERNAL_BLOCK_SIZE;
        j += threadIdx.y * INTERNAL_BLOCK_SIZE;
        // cache data
        for (kernel_index_type vec_index = 0; vec_index < feature_range * num_rows; vec_index += num_rows) {
            __syncthreads();
            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
                const kernel_index_type idx = block_id % THREAD_BLOCK_SIZE;
                if (threadIdx.y == idx) {
                    data_intern_i[threadIdx.x][block_id] = data_d[block_id + vec_index + i];
                }
                const kernel_index_type idx_2 = block_id + INTERNAL_BLOCK_SIZE % THREAD_BLOCK_SIZE;
                if (threadIdx.y == idx_2) {
                    data_intern_j[threadIdx.x][block_id] = data_d[block_id + vec_index + ji];
                }
            }
            __syncthreads();

            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
                data_j[data_index] = data_intern_j[threadIdx.y][data_index];
            }

            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
                const real_type data_i = data_intern_i[threadIdx.x][l];
                #pragma unroll INTERNAL_BLOCK_SIZE
                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
                    matr[k][l] += data_i * data_j[k];
                }
            }
        }

        #pragma unroll INTERNAL_BLOCK_SIZE
        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
            real_type ret_jx = 0.0;
            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
                real_type temp;
                if (id == 0) {
                    temp = (matr[x][y] + QA_cost - q[i + y] - q[j + x]) * add;
                } else {
                    temp = matr[x][y] * add;
                }
                if (i + x > j + y) {
                    // upper triangular matrix; for double precision, PLSSVM provides its own __device__ atomicAdd overload (see atomics.cuh)
                    atomicAdd(&ret[i + y], temp * d[j + x]);
                    ret_jx += temp * d[i + y];
                } else if (i + x == j + y) {
                    // diagonal
                    if (id == 0) {
                        ret_jx += (temp + cost * add) * d[i + y];
                    } else {
                        ret_jx += temp * d[i + y];
                    }
                }
            }
            atomicAdd(&ret[j + x], ret_jx);
        }
    }
}
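For orientation, here is a minimal host-side launch sketch for this kernel. The helper name launch_linear, the buffer names, and the include path are illustrative assumptions, not part of PLSSVM; it only shows how the grid must be sized: each block of THREAD_BLOCK_SIZE x THREAD_BLOCK_SIZE threads computes a square tile of THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE rows and columns of the implicit kernel matrix.

// Hypothetical launch sketch; names and header path are assumptions.
#include <cmath>

#include "hip/hip_runtime.h"

#include "plssvm/backends/HIP/svm_kernel.hip.hpp"  // assumed path of this header
#include "plssvm/constants.hpp"

template <typename real_type>
void launch_linear(const real_type *q_d, real_type *ret_d, const real_type *d_d, const real_type *data_d,
                   const real_type QA_cost, const real_type cost, const plssvm::kernel_index_type num_rows,
                   const plssvm::kernel_index_type feature_range, const real_type add, const plssvm::kernel_index_type device_id) {
    // one block covers THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE rows/columns of the kernel matrix
    const dim3 block(plssvm::THREAD_BLOCK_SIZE, plssvm::THREAD_BLOCK_SIZE);
    const auto tiles = static_cast<unsigned int>(std::ceil(
        static_cast<double>(num_rows) / (plssvm::THREAD_BLOCK_SIZE * plssvm::INTERNAL_BLOCK_SIZE)));
    const dim3 grid(tiles, tiles);
    // hipcc supports the triple-chevron launch syntax; hipLaunchKernelGGL works as well
    plssvm::hip::device_kernel_linear<<<grid, block>>>(q_d, ret_d, d_d, data_d, QA_cost, cost,
                                                       num_rows, feature_range, add, device_id);
}

Note how the id parameter is used in the kernel body: only the device with id == 0 applies the QA_cost and q corrections, while all other devices merely accumulate the partial dot products of their feature_range feature slice.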

/// Calculates the C-SVM kernel using the polynomial kernel function.
template <typename real_type>
__global__ void device_kernel_polynomial(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) {
    kernel_index_type i = blockIdx.x * blockDim.x * INTERNAL_BLOCK_SIZE;
    kernel_index_type j = blockIdx.y * blockDim.y * INTERNAL_BLOCK_SIZE;

    __shared__ real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
    __shared__ real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
    real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
    real_type data_j[INTERNAL_BLOCK_SIZE];

    if (i >= j) {
        i += threadIdx.x * INTERNAL_BLOCK_SIZE;
        const kernel_index_type ji = j + threadIdx.x * INTERNAL_BLOCK_SIZE;
        j += threadIdx.y * INTERNAL_BLOCK_SIZE;
        // cache data
        for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
            __syncthreads();
            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
                const kernel_index_type idx = block_id % THREAD_BLOCK_SIZE;
                if (threadIdx.y == idx) {
                    data_intern_i[threadIdx.x][block_id] = data_d[block_id + vec_index + i];
                }
                const kernel_index_type idx_2 = block_id + INTERNAL_BLOCK_SIZE % THREAD_BLOCK_SIZE;
                if (threadIdx.y == idx_2) {
                    data_intern_j[threadIdx.x][block_id] = data_d[block_id + vec_index + ji];
                }
            }
            __syncthreads();

            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
                data_j[data_index] = data_intern_j[threadIdx.y][data_index];
            }

            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
                const real_type data_i = data_intern_i[threadIdx.x][l];
                #pragma unroll INTERNAL_BLOCK_SIZE
                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
                    matr[k][l] += data_i * data_j[k];
                }
            }
        }

        #pragma unroll INTERNAL_BLOCK_SIZE
        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
            real_type ret_jx = 0.0;
            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
                const real_type temp = (pow(gamma * matr[x][y] + coef0, degree) + QA_cost - q[i + y] - q[j + x]) * add;
                if (i + x > j + y) {
                    // upper triangular matrix
                    atomicAdd(&ret[i + y], temp * d[j + x]);
                    ret_jx += temp * d[i + y];
                } else if (i + x == j + y) {
                    // diagonal
                    ret_jx += (temp + cost * add) * d[i + y];
                }
            }
            atomicAdd(&ret[j + x], ret_jx);
        }
    }
}
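In the accumulation loop above, matr[x][y] collects the dot product of two data points over all num_cols features, so the temp expression (up to the QA_cost and q corrections) evaluates the polynomial kernel function:

    k(\vec{x}, \vec{y}) = (\gamma \cdot \langle \vec{x}, \vec{y} \rangle + coef0)^{degree}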

/// Calculates the C-SVM kernel using the radial basis function (RBF) kernel.
template <typename real_type>
__global__ void device_kernel_rbf(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) {
    kernel_index_type i = blockIdx.x * blockDim.x * INTERNAL_BLOCK_SIZE;
    kernel_index_type j = blockIdx.y * blockDim.y * INTERNAL_BLOCK_SIZE;

    __shared__ real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
    __shared__ real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
    real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
    real_type data_j[INTERNAL_BLOCK_SIZE];

    if (i >= j) {
        i += threadIdx.x * INTERNAL_BLOCK_SIZE;
        const kernel_index_type ji = j + threadIdx.x * INTERNAL_BLOCK_SIZE;
        j += threadIdx.y * INTERNAL_BLOCK_SIZE;
        // cache data
        for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
            __syncthreads();
            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
                const kernel_index_type idx = block_id % THREAD_BLOCK_SIZE;
                if (threadIdx.y == idx) {
                    data_intern_i[threadIdx.x][block_id] = data_d[block_id + vec_index + i];
                }
                const kernel_index_type idx2 = block_id + INTERNAL_BLOCK_SIZE % THREAD_BLOCK_SIZE;
                if (threadIdx.y == idx2) {
                    data_intern_j[threadIdx.x][block_id] = data_d[block_id + vec_index + ji];
                }
            }
            __syncthreads();

            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
                data_j[data_index] = data_intern_j[threadIdx.y][data_index];
            }

            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
                const real_type data_i = data_intern_i[threadIdx.x][l];
                #pragma unroll INTERNAL_BLOCK_SIZE
                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
                    matr[k][l] += (data_i - data_j[k]) * (data_i - data_j[k]);
                }
            }
        }

        #pragma unroll INTERNAL_BLOCK_SIZE
        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
            real_type ret_jx = 0.0;
            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
                const real_type temp = (exp(-gamma * matr[x][y]) + QA_cost - q[i + y] - q[j + x]) * add;
                if (i + x > j + y) {
                    // upper triangular matrix
                    atomicAdd(&ret[i + y], temp * d[j + x]);
                    ret_jx += temp * d[i + y];
                } else if (i + x == j + y) {
                    // diagonal
                    ret_jx += (temp + cost * add) * d[i + y];
                }
            }
            atomicAdd(&ret[j + x], ret_jx);
        }
    }
}
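Here matr[x][y] instead accumulates the squared Euclidean distance between two data points, so temp (again up to the QA_cost and q corrections) evaluates the radial basis function kernel:

    k(\vec{x}, \vec{y}) = \exp(-\gamma \cdot \|\vec{x} - \vec{y}\|^2)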

} // namespace plssvm::hip

#endif // PLSSVM_BACKENDS_HIP_SVM_KERNEL_HPP_