Lab 03 - Bölmeli (Tiled) Matris Çarpımı
Amaç
Bölmeli yoğun matris çarpımı kodu yazmak
Aşamalar
Cihaz üzerinde bellek ayırmak
Host ile cihaz arasında veri kopyalamak
İş parçacığı bloklarını ve şebeke boyutlarını oluşturmak
CUDA çekirdeğini çalıştırmak
Sonucu cihazdan hosta kopyalamak
Ayrılan belleği temizlemek
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <vector>
#include <ios>
#include <sstream>
#define TILE_WIDTH 16

// Tiled matrix product C = A * B for row-major matrices.
//
// Each thread produces at most one element of C. The indexing below derives
// the output row/column from blockIdx and threadIdx using TILE_WIDTH, so the
// kernel is written for a TILE_WIDTH x TILE_WIDTH thread block.
// Per phase, the block stages one tile of A and one tile of B in shared
// memory; elements that fall outside either matrix are staged as zero so
// that partial tiles at the edges contribute nothing to the sum.
//
// A: numARows x numAColumns, B: numBRows x numBColumns,
// C: numCRows x numCColumns.
__global__ void matrixMultiply(float* A, float* B, float* C, int numARows,
                               int numAColumns, int numBRows,
                               int numBColumns, int numCRows,
                               int numCColumns) {
  //@@ Insert code to implement matrix multiplication here
  __shared__ float tileA[TILE_WIDTH][TILE_WIDTH];
  __shared__ float tileB[TILE_WIDTH][TILE_WIDTH];

  const int localCol = threadIdx.x;
  const int localRow = threadIdx.y;
  const int outRow = blockIdx.y * TILE_WIDTH + localRow;
  const int outCol = blockIdx.x * TILE_WIDTH + localCol;

  // Number of tiles needed to cover the shared dimension (ceiling division).
  const int phaseCount = (numAColumns - 1) / TILE_WIDTH + 1;

  float acc = 0;
  for (int phase = 0; phase < phaseCount; ++phase) {
    const int aCol = phase * TILE_WIDTH + localCol;
    const int bRow = phase * TILE_WIDTH + localRow;

    const bool aInside = outRow < numARows && aCol < numAColumns;
    const bool bInside = outCol < numBColumns && bRow < numBRows;

    tileA[localRow][localCol] = aInside ? A[outRow * numAColumns + aCol] : 0.0f;
    tileB[localRow][localCol] = bInside ? B[bRow * numBColumns + outCol] : 0.0f;

    // Both tiles must be fully written before any thread reads them.
    __syncthreads();

    for (int k = 0; k < TILE_WIDTH; ++k) {
      acc += tileA[localRow][k] * tileB[k][localCol];
    }

    // Nobody may overwrite the tiles while another thread is still reading.
    __syncthreads();
  }

  // Threads mapped outside C took part in the loads and barriers above but
  // have nothing to store.
  if (outRow < numCRows && outCol < numCColumns) {
    C[outRow * numCColumns + outCol] = acc;
  }
}
// Reads a text file and returns every float found in it, in file order.
//
// The file is processed one line at a time; within a line, values are
// extracted with operator>> until the first token that does not parse as a
// float, after which the rest of that line is skipped. A file that cannot be
// opened yields an empty vector.
std::vector<float> readFile(const std::string &fileName) {
  std::fstream input(fileName, std::ios::in);
  std::vector<float> values;

  for (std::string line; std::getline(input, line);) {
    std::stringstream tokens(line);
    for (float value; tokens >> value;) {
      values.push_back(value);
    }
  }

  return values;
}
// Terminates the program with a diagnostic when a CUDA runtime call fails.
// `what` names the failing operation in the error message.
static void checkCuda(cudaError_t status, const char* what) {
  if (status != cudaSuccess) {
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << "\n";
    std::exit(EXIT_FAILURE);
  }
}

// Records `stop`, waits for it, and prints the time elapsed since `start`.
static void reportElapsed(cudaEvent_t start, cudaEvent_t stop) {
  checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
  checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
  float milliseconds = 0;
  checkCuda(cudaEventElapsedTime(&milliseconds, start, stop),
            "cudaEventElapsedTime");
  std::cout << "It took: " << milliseconds << " ms\n";
}

// Usage: <program> <A size> <B size> <A data file> <B data file>
//
// Both matrices are square: argv[1] is the side length of A and argv[2] the
// side length of B. The data files hold the matrix elements as text floats
// in row-major order. The program multiplies A by B on the GPU, prints the
// duration of every stage and finally prints all elements of C.
//
// Returns EXIT_FAILURE on bad arguments or input files; CUDA failures abort
// through checkCuda.
int main(int argc, char** argv) {
  if (argc < 5) {
    std::cerr << "Usage: " << (argc > 0 ? argv[0] : "program")
              << " <A size> <B size> <A data file> <B data file>\n";
    return EXIT_FAILURE;
  }

  const int numARows = std::atoi(argv[1]);     // number of rows in the matrix A
  const int numAColumns = numARows;            // number of columns in the matrix A
  const int numBRows = std::atoi(argv[2]);     // number of rows in the matrix B
  const int numBColumns = numBRows;            // number of columns in the matrix B
  if (numARows <= 0 || numBRows <= 0) {
    std::cerr << "Matrix sizes must be positive integers\n";
    return EXIT_FAILURE;
  }
  if (numAColumns != numBRows) {
    std::cerr << "Cannot multiply: A has " << numAColumns
              << " columns but B has " << numBRows << " rows\n";
    return EXIT_FAILURE;
  }
  const int numCRows = numARows;
  const int numCColumns = numBColumns;

  // Element and byte counts are computed in size_t so that large matrices
  // do not overflow int arithmetic.
  const std::size_t countA = static_cast<std::size_t>(numARows) * numAColumns;
  const std::size_t countB = static_cast<std::size_t>(numBRows) * numBColumns;
  const std::size_t countC = static_cast<std::size_t>(numCRows) * numCColumns;
  const std::size_t bytesA = countA * sizeof(float);
  const std::size_t bytesB = countB * sizeof(float);
  const std::size_t bytesC = countC * sizeof(float);

  cudaEvent_t start, stop;
  checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
  checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

  checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
  std::cout << "Importing data and creating memory on host \n";
  // The vectors own the host data and must stay alive until the
  // host-to-device copies below have finished.
  const std::vector<float> hostA = readFile(std::string(argv[3]));  // The A matrix
  const std::vector<float> hostB = readFile(std::string(argv[4]));  // The B matrix
  if (hostA.size() < countA) {
    std::cerr << "File " << argv[3] << " holds " << hostA.size()
              << " values, expected at least " << countA << "\n";
    return EXIT_FAILURE;
  }
  if (hostB.size() < countB) {
    std::cerr << "File " << argv[4] << " holds " << hostB.size()
              << " values, expected at least " << countB << "\n";
    return EXIT_FAILURE;
  }
  //@@ Allocate the hostC matrix
  std::vector<float> hostC(countC);  // The output C matrix
  reportElapsed(start, stop);

  std::cout << "The dimensions of A are " << numARows << " x " << numAColumns << "\n";
  std::cout << "The dimensions of B are " << numBRows << " x " << numBColumns << "\n";
  std::cout << "The dimensions of C are " << numCRows << " x " << numCColumns << "\n";

  checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
  std::cout << "Allocationg GPU memory\n";
  //@@ Allocate GPU memory here
  float* deviceA = nullptr;
  float* deviceB = nullptr;
  float* deviceC = nullptr;
  checkCuda(cudaMalloc((void**)&deviceA, bytesA), "cudaMalloc(deviceA)");
  checkCuda(cudaMalloc((void**)&deviceB, bytesB), "cudaMalloc(deviceB)");
  checkCuda(cudaMalloc((void**)&deviceC, bytesC), "cudaMalloc(deviceC)");
  reportElapsed(start, stop);

  checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
  std::cout << "Copying input memory to the GPU.\n";
  //@@ Copy memory to the GPU here
  checkCuda(cudaMemcpy(deviceA, hostA.data(), bytesA, cudaMemcpyHostToDevice),
            "cudaMemcpy(A, host to device)");
  checkCuda(cudaMemcpy(deviceB, hostB.data(), bytesB, cudaMemcpyHostToDevice),
            "cudaMemcpy(B, host to device)");
  reportElapsed(start, stop);

  //@@ Initialize the grid and block dimensions here
  // One thread per element of C; the grid is rounded up to whole tiles.
  dim3 dimGrids((numCColumns - 1) / TILE_WIDTH + 1,
                (numCRows - 1) / TILE_WIDTH + 1, 1);
  dim3 dimBlocks(TILE_WIDTH, TILE_WIDTH, 1);

  checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
  std::cout << "Performing CUDA computation\n";
  //@@ Launch the GPU Kernel here
  checkCuda(cudaMemset(deviceC, 0, bytesC), "cudaMemset(deviceC)");
  matrixMultiply<<<dimGrids, dimBlocks>>>(
      deviceA, deviceB, deviceC, numARows, numAColumns, numBRows,
      numBColumns, numCRows, numCColumns);
  // A launch reports configuration errors only through cudaGetLastError();
  // faults raised while the kernel runs surface at the synchronisation.
  checkCuda(cudaGetLastError(), "matrixMultiply launch");
  checkCuda(cudaDeviceSynchronize(), "matrixMultiply execution");
  reportElapsed(start, stop);

  //@@ Copy the GPU memory back to the CPU here
  checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
  std::cout << "Copying output memory to the CPU\n";
  checkCuda(cudaMemcpy(hostC.data(), deviceC, bytesC, cudaMemcpyDeviceToHost),
            "cudaMemcpy(C, device to host)");
  reportElapsed(start, stop);

  checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
  std::cout << "Freeing GPU Memory\n";
  //@@ Free the GPU memory here
  checkCuda(cudaFree(deviceA), "cudaFree(deviceA)");
  checkCuda(cudaFree(deviceB), "cudaFree(deviceB)");
  checkCuda(cudaFree(deviceC), "cudaFree(deviceC)");
  reportElapsed(start, stop);

  checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
  checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

  for (std::size_t i = 0; i < countC; i++) {
    std::cout << hostC[i] << " ";
  }
  std::cout << "\n";
  return 0;
}
Çıktı