A neat C++ custom Matrix class to perform super-fast GPU (or CPU) powered Matrix/Vector computations with minimal code, leveraging the power of cuBLAS where applicable (Python Interface in the works).
Tested on: Windows 10, NVIDIA GTX 1070 (notebook), CUDA 8.0 (with cuDNN v5.0), C++11, Intel Core i7-6700HQ 2.6 GHz; compiled with Visual Studio 2015.
Relevant DLL files (etc.) are contained in the "package" folder for the respective platforms. More details coming soon...
This example only shows the += operation; a similar pattern can be used for the +, *, and - operations between SHMatrix objects or floats (in any combination) on CPU or GPU.
#include <vector>
#include <iostream>
#include "SHMatrix.h"
using namespace std;
int main() {
cublasHandle_t cublasHandle;
CublasSafeCall(cublasCreate_v2(&cublasHandle));
SHMatrix a(cublasHandle, std::vector<int> { 3, 5 }, GPU);
a.GaussianInit(); //Initializing a 3x5 matrix with random numbers from gaussian distribution.
a.Print();
SHMatrix b(cublasHandle, std::vector<int> { 3, 5 }, GPU);
b.UniformInit(); //Initializing a 3x5 matrix with random numbers from uniform distribution.
b.Print();
b.T(); //Performing in-place lazy-transpose to change dimensions to 5x3.
SHMatrix c(cublasHandle, std::vector<int> { 3, 3 }, GPU); //SHMatrix to store dot-product results.
SHMatrix::Dot(cublasHandle, a, b, c); //Performing dot-product on GPU.
c.Print();
c.Move2CPU();
SHMatrix::Dot(cublasHandle, a, b, c); //Performing dot-product on CPU.
c.Print();
b.T(); //Changing dimensions to 3x5 for element-wise operations with a.
a += b; //In-place matrix-matrix add operation (b is added to a) on GPU.
a.Print();
a.Move2CPU();
a += b; //In-place matrix-matrix add operation (b is added to a) on CPU.
a.Print();
}
1.1.1
SHMatrix(const cublasHandle_t &cublas_handle_arg,
float *mat_data, std::vector<int> &dims,
mem_location = GPU);
1.1.2
SHMatrix(const cublasHandle_t &cublas_handle_arg,
std::vector<int> &dims, mem_location = GPU,
bool default_init = false, float init_val = 0.0f);
1.1.3
SHMatrix(const cublasHandle_t &cublas_handle_arg,
SHMatrix &src_shmatrix, mem_location = GPU);
1.2.1
void Equate(SHMatrix &src_shmatrix);
1.2.2
void Reallocate(std::vector<int> &dims, mem_location mem_loc = GPU,
bool copy_original = false, bool default_init = false,
float init_val = 0.0f);
1.2.3
void Print(bool print_elem = true);
1.2.4
void Move2GPU();
1.2.5
void Move2CPU();
1.2.6
Clear();
1.2.7
void GaussianInit(float mean = 0.0f, float stddev = 0.1f);
1.2.8
void UniformInit(float lower = -0.5f, float higher = 0.5f);
1.2.9
SHMatrix& T();
1.2.10
SHMatrix& Scale(float scale_arg);
1.2.11
void CommitUnaryOps();
1.2.12
void CommitTranspose();
1.2.13
void CommitScale();
1.3.1
void Dot(cublasHandle_t cublas_handle, SHMatrix &A, SHMatrix &B, SHMatrix &C);
1.3.2
float* DataPointerAtLoc(SHMatrix& arg, mem_location desired_loc);
1.3.3
float GetGaussianNum(float mean, float stddev);
1.3.4
float GetUniformNum(float lower, float higher);
1.4.1
float *data;
1.4.2
float scalar;
1.4.3
mem_location data_loc;
1.4.4
std::vector<int> data_dims;
1.4.5
std::string name;
1.4.6
float mean;
1.4.7
float mini;
1.4.8
float maxi;
1.4.9
bool allocated;
1.4.10
int rows;
1.4.11
int cols;
1.4.12
int num_elems;
1.4.13
float mini_idx;
1.4.14
float maxi_idx;