Lab 02 - Basit Matriks Çarpımı
Amaç
Basit yoğun matriks çarpımı kodu yazmak
Aşamalar
Cihaz üzerinde bellek ayırmak
Host ile ciahz arasında veri kopyalamak
İş parçacığı bloklarını ve şebeke boyularını oluşturmak
CUDA çekirdeğini çalıştırmak
Sonucu cihazdan hosta kopyalamak
Ayrılan belleği temizlemek
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <vector>
#include <ios>
#include <sstream>
// Compute C = A * B
// Sgemm stands for single precision general matrix-matrix multiply
__global__ void sgemm(float* A, float* B, float* C, int numARows,
int numAColumns, int numBRows, int numBColumns) {
//@@ Insert code to implement matrix multiplication here
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < numARows && col < numBColumns) {
float sum = 0;
for (int ii = 0; ii < numAColumns; ii++) {
sum += A[row * numAColumns + ii] * B[ii * numBColumns + col];
}
C[row * numBColumns + col] = sum;
}
}
std::vector<float> readFile(const std::string &fileName) {
std::fstream fs;
fs.open(fileName, std::ios::in);
std::vector<float> floatVec;
std::string strFloat;
float fNum;
while (getline(fs, strFloat))
{
std::stringstream linestream(strFloat);
while (linestream >> fNum)
{
floatVec.push_back(fNum);
}
}
return floatVec;
}
int main(int argc, char** argv) {
float* hostA; // The A matrix
float* hostB; // The B matrix
float* hostC; // The output C matrix
float* deviceA;
float* deviceB;
float* deviceC;
int numARows; // number of rows in the matrix A
int numAColumns; // number of columns in the matrix A
int numBRows; // number of rows in the matrix B
int numBColumns; // number of columns in the matrix B
int numCRows;
int numCColumns;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
numARows = atoi(argv[1]);
numAColumns = atoi(argv[1]);
numBRows = atoi(argv[2]);
numBColumns = atoi(argv[2]);
cudaEventRecord(start);
std::cout << "Importing data and creating memory on host \n";
hostA = (float*)&readFile(std::string(argv[3]))[0];
hostB = (float*)&readFile(std::string(argv[4]))[0];
//@@ Allocate the hostC matrix
hostC = (float*)malloc(numARows * numBColumns * sizeof(float));
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
std::cout << "It took: " << milliseconds << " ms\n"; //TODO time
numCRows = numARows;
numCColumns = numBColumns;
std::cout << "The dimensions of A are " << numARows << " x " << numAColumns << "\n";
std::cout << "The dimensions of B are " << numBRows << " x " << numBColumns << "\n";
std::cout << "The dimensions of C are " << numCRows << " x " << numCColumns << "\n";
cudaEventRecord(start);
std::cout << "Allocationg GPU memory\n";
//@@ Allocate GPU memory here
(cudaMalloc((void**)&deviceA,
numARows * numAColumns * sizeof(float)));
(cudaMalloc((void**)&deviceB,
numBRows * numBColumns * sizeof(float)));
(cudaMalloc((void**)&deviceC,
numARows * numBColumns * sizeof(float)));
cudaEventRecord(stop);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
std::cout << "It took: " << milliseconds << " ms\n"; //TODO time
cudaEventRecord(start);
std::cout << "Copying input memory to the GPU.\n";
//@@ Copy memory to the GPU here
(cudaMemcpy(deviceA, hostA,
numARows * numAColumns * sizeof(float),
cudaMemcpyHostToDevice));
(cudaMemcpy(deviceB, hostB,
numBRows * numBColumns * sizeof(float),
cudaMemcpyHostToDevice));
cudaEventRecord(stop);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
std::cout << "It took: " << milliseconds << " ms\n"; //TODO time
//@@Initialize the grid and block
dim3 blockDim(16, 16);
// changed to BColumns and ARows from Acolumns and BRows
dim3 gridDim(ceil(((float)numBColumns) / blockDim.x),
ceil(((float)numARows) / blockDim.y));
std::cout << "The block dimensions are " << blockDim.x << " x " << blockDim.y << "\n";
std::cout << "The grid dimensions are " << gridDim.x << " x " << gridDim.y << "\n";
cudaEventRecord(start);
std::cout << "Performing CUDA computation\n";
//@@ Launch the GPU Kernel here
(cudaMemset(deviceC, 0, numARows * numBColumns * sizeof(float)));
sgemm << <gridDim, blockDim >> > (deviceA, deviceB, deviceC, numARows,
numAColumns, numBRows, numBColumns);
cudaDeviceSynchronize();
cudaEventRecord(stop);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
std::cout << "It took: " << milliseconds << " ms\n";
//@@ Copy the GPU memory back to the CPU here
cudaEventRecord(start);
std::cout << "Copying output memory to the CPU\n";
(cudaMemcpy(hostC, deviceC,
numARows * numBColumns * sizeof(float),
cudaMemcpyDeviceToHost));
cudaEventRecord(stop);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
std::cout << "It took: " << milliseconds << " ms\n";
cudaEventRecord(start);
std::cout << "Freeing GPU Memory\n";
//@@ Free the GPU memory here
cudaFree(deviceA);
cudaFree(deviceB);
cudaFree(deviceC);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
std::cout << "It took: " << milliseconds << " ms\n";
for (int i = 0; i < numCRows * numCColumns; i++) {
std::cout << hostC[i] << " ";
}
std::cout << "\n";
return 0;
}
Çıktı