Lab 03 - Bölmeli (Tiled) Matriks Çarpımı

Amaç

  • Bölmeli yoğun matriks çarpımı kodu yazmak

Aşamalar

  • Cihaz üzerinde bellek ayırmak

  • Host ile ciahz arasında veri kopyalamak

  • İş parçacığı bloklarını ve şebeke boyularını oluşturmak

  • CUDA çekirdeğini çalıştırmak

  • Sonucu cihazdan hosta kopyalamak

  • Ayrılan belleği temizlemek

#include <cstdlib>
#include <iostream>
#include <fstream>
#include <vector>
#include <ios>
#include <sstream>

#define TILE_WIDTH 16

// Compute C = A * B
__global__ void matrixMultiply(float* A, float* B, float* C, int numARows,
    int numAColumns, int numBRows,
    int numBColumns, int numCRows,
    int numCColumns) {
    //@@ Insert code to implement matrix multiplication here
    __shared__ float ds_M[TILE_WIDTH][TILE_WIDTH];
    __shared__ float ds_N[TILE_WIDTH][TILE_WIDTH];
    int bx = blockIdx.x, by = blockIdx.y, tx = threadIdx.x, ty = threadIdx.y,
        Row = by * TILE_WIDTH + ty, Col = bx * TILE_WIDTH + tx;
    float Pvalue = 0;

    for (int m = 0; m < (numAColumns - 1) / TILE_WIDTH + 1; ++m) {
        if (Row < numARows && m * TILE_WIDTH + tx < numAColumns)
            ds_M[ty][tx] = A[Row * numAColumns + m * TILE_WIDTH + tx];
        else
            ds_M[ty][tx] = 0;
        if (Col < numBColumns && m * TILE_WIDTH + ty < numBRows)
            ds_N[ty][tx] = B[(m * TILE_WIDTH + ty) * numBColumns + Col];
        else
            ds_N[ty][tx] = 0;

        __syncthreads();
        for (int k = 0; k < TILE_WIDTH; ++k)
            Pvalue += ds_M[ty][k] * ds_N[k][tx];
        __syncthreads();
    }
    if (Row < numCRows && Col < numCColumns)
        C[Row * numCColumns + Col] = Pvalue;
}

std::vector<float> readFile(const std::string &fileName) {
    std::fstream fs;
    fs.open(fileName, std::ios::in);
    std::vector<float> floatVec;
    std::string strFloat;
    float fNum;
    while (getline(fs, strFloat))
    {
        std::stringstream  linestream(strFloat);
        while (linestream >> fNum)
    {
            floatVec.push_back(fNum);
    }
    }

    return floatVec;
}

int main(int argc, char** argv) {
    float* hostA; // The A matrix
    float* hostB; // The B matrix
    float* hostC; // The output C matrix
    float* deviceA;
    float* deviceB;
    float* deviceC;
    int numARows; // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows; // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows;
    int numCColumns;


    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);


    numARows = atoi(argv[1]);
    numAColumns = atoi(argv[1]);

    numBRows = atoi(argv[2]);
    numBColumns = atoi(argv[2]);

    cudaEventRecord(start);
    std::cout << "Importing data and creating memory on host \n";
    hostA = (float*)&readFile(std::string(argv[3]))[0];
    hostB = (float*)&readFile(std::string(argv[4]))[0];
    //@@ Allocate the hostC matrix
    hostC = (float*)malloc(numARows * numBColumns * sizeof(float));
    cudaEventRecord(stop);

    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "It took: " << milliseconds << " ms\n"; //TODO time


    numCRows = numARows;
    numCColumns = numBColumns;

    std::cout << "The dimensions of A are " << numARows << " x " << numAColumns << "\n";
    std::cout << "The dimensions of B are " << numBRows << " x " << numBColumns << "\n";
    std::cout << "The dimensions of C are " << numCRows << " x " << numCColumns << "\n";

    cudaEventRecord(start);
    std::cout << "Allocationg GPU memory\n";

    //@@ Allocate GPU memory here
    (cudaMalloc((void**)&deviceA,
        numARows * numAColumns * sizeof(float)));
    (cudaMalloc((void**)&deviceB,
        numBRows * numBColumns * sizeof(float)));
    (cudaMalloc((void**)&deviceC,
        numARows * numBColumns * sizeof(float)));
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "It took: " << milliseconds << " ms\n"; //TODO time



    cudaEventRecord(start);
    std::cout << "Copying input memory to the GPU.\n";

    //@@ Copy memory to the GPU here
    (cudaMemcpy(deviceA, hostA,
        numARows * numAColumns * sizeof(float),

        cudaMemcpyHostToDevice));
    (cudaMemcpy(deviceB, hostB,
        numBRows * numBColumns * sizeof(float),
        cudaMemcpyHostToDevice));
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "It took: " << milliseconds << " ms\n"; //TODO time

        //@@ Initialize the grid and block dimensions here
    dim3 dimGrids((numCColumns - 1) / TILE_WIDTH + 1,
        (numCRows - 1) / TILE_WIDTH + 1, 1);
    dim3 dimBlocks(TILE_WIDTH, TILE_WIDTH, 1);




    cudaEventRecord(start);
    std::cout << "Performing CUDA computation\n";
    //@@ Launch the GPU Kernel here
    (cudaMemset(deviceC, 0, numARows * numBColumns * sizeof(float)));
    matrixMultiply << <dimGrids, dimBlocks >> > (
        deviceA, deviceB, deviceC, numARows, numAColumns, numBRows,
        numBColumns, numCRows, numCColumns);
    cudaDeviceSynchronize();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "It took: " << milliseconds << " ms\n";



    //@@ Copy the GPU memory back to the CPU here
    cudaEventRecord(start);
    std::cout << "Copying output memory to the CPU\n";
    (cudaMemcpy(hostC, deviceC,
        numARows * numBColumns * sizeof(float),
        cudaMemcpyDeviceToHost));
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "It took: " << milliseconds << " ms\n";

    cudaEventRecord(start);
    std::cout << "Freeing GPU Memory\n";
    //@@ Free the GPU memory here
    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "It took: " << milliseconds << " ms\n";

    for (int i = 0; i < numCRows * numCColumns; i++) {
        std::cout << hostC[i] << " ";
    }
    std::cout << "\n";


    return 0;
}