Lab 02 - Basit Matris Çarpımı

Amaç

  • Basit yoğun matris çarpımı kodu yazmak

Aşamalar

  • Cihaz üzerinde bellek ayırmak

  • Host ile cihaz arasında veri kopyalamak

  • İş parçacığı bloklarını ve şebeke (grid) boyutlarını oluşturmak

  • CUDA çekirdeğini çalıştırmak

  • Sonucu cihazdan hosta kopyalamak

  • Ayrılan belleği temizlemek

#include <cstdlib>
#include <iostream>
#include <fstream>
#include <vector>
#include <ios>
#include <sstream>
// Compute C = A * B
// Sgemm stands for single precision general matrix-matrix multiply.
//
// Expects a 2D launch where each thread produces exactly one element of C:
// gridDim/blockDim must cover at least numARows in y and numBColumns in x
// (the grid is typically rounded up, so out-of-range threads exit via the
// bounds check below). Matrices are row-major. The caller must ensure
// numAColumns == numBRows; C must have room for numARows * numBColumns
// floats. Inputs are marked const __restrict__ so the compiler may route
// the loads through the read-only data cache.
__global__ void sgemm(const float* __restrict__ A, const float* __restrict__ B,
    float* C, int numARows,
    int numAColumns, int numBRows, int numBColumns) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard against the rounded-up grid: threads past the edge of C do nothing.
    if (row < numARows && col < numBColumns) {
        float sum = 0.0f;  // accumulate in a register, write C exactly once
        for (int ii = 0; ii < numAColumns; ii++) {
            sum += A[row * numAColumns + ii] * B[ii * numBColumns + col];
        }
        C[row * numBColumns + col] = sum;
    }
}


// Read a text file of whitespace-separated floats into a flat vector.
// Parsing is per line: on each line, floats are consumed until the first
// token that does not parse as a float, then the next line is tried.
// An unreadable/missing file simply yields an empty vector.
std::vector<float> readFile(const std::string &fileName) {
    std::fstream input(fileName, std::ios::in);
    std::vector<float> values;

    std::string line;
    while (getline(input, line)) {
        std::stringstream tokens(line);
        float value;
        while (tokens >> value) {
            values.push_back(value);
        }
    }

    return values;
}

// Abort with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(err) << "\n";
        std::exit(EXIT_FAILURE);
    }
}

// Usage: prog <dimA> <dimB> <fileA> <fileB>
//   A is dimA x dimA read from fileA; B is dimB x dimB read from fileB.
//   Computes C = A * B on the GPU, timing each stage with CUDA events,
//   and prints C to stdout.
int main(int argc, char** argv) {
    // Guard argv reads — the original indexed argv[1..4] unconditionally.
    if (argc < 5) {
        std::cerr << "Usage: " << argv[0] << " <dimA> <dimB> <fileA> <fileB>\n";
        return EXIT_FAILURE;
    }

    // Both inputs are square matrices sized by the command line.
    int numARows = atoi(argv[1]);    // number of rows in the matrix A
    int numAColumns = atoi(argv[1]); // number of columns in the matrix A
    int numBRows = atoi(argv[2]);    // number of rows in the matrix B
    int numBColumns = atoi(argv[2]); // number of columns in the matrix B

    // A*B is only defined when the inner dimensions agree.
    if (numARows <= 0 || numBRows <= 0 || numAColumns != numBRows) {
        std::cerr << "Invalid dimensions: A is " << numARows << " x "
                  << numAColumns << ", B is " << numBRows << " x "
                  << numBColumns << "\n";
        return EXIT_FAILURE;
    }

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    cudaEventRecord(start);
    std::cout << "Importing data and creating memory on host \n";
    // Keep the vectors themselves alive for the whole run. The original code
    // took &readFile(...)[0] — the address of element 0 of a TEMPORARY vector
    // destroyed at the end of that statement — leaving dangling pointers.
    std::vector<float> hostA = readFile(std::string(argv[3])); // The A matrix
    std::vector<float> hostB = readFile(std::string(argv[4])); // The B matrix
    // The output C matrix, zero-initialized by the vector constructor.
    std::vector<float> hostC((size_t)numARows * numBColumns);
    cudaEventRecord(stop);

    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "It took: " << milliseconds << " ms\n";

    // Fail early if the files did not supply enough values.
    if (hostA.size() < (size_t)numARows * numAColumns ||
        hostB.size() < (size_t)numBRows * numBColumns) {
        std::cerr << "Input files do not contain enough floats\n";
        return EXIT_FAILURE;
    }

    int numCRows = numARows;
    int numCColumns = numBColumns;

    std::cout << "The dimensions of A are " << numARows << " x " << numAColumns << "\n";
    std::cout << "The dimensions of B are " << numBRows << " x " << numBColumns << "\n";
    std::cout << "The dimensions of C are " << numCRows << " x " << numCColumns << "\n";

    cudaEventRecord(start);
    std::cout << "Allocating GPU memory\n";

    size_t bytesA = (size_t)numARows * numAColumns * sizeof(float);
    size_t bytesB = (size_t)numBRows * numBColumns * sizeof(float);
    size_t bytesC = (size_t)numCRows * numCColumns * sizeof(float);

    float* deviceA = nullptr;
    float* deviceB = nullptr;
    float* deviceC = nullptr;
    checkCuda(cudaMalloc((void**)&deviceA, bytesA), "cudaMalloc(deviceA)");
    checkCuda(cudaMalloc((void**)&deviceB, bytesB), "cudaMalloc(deviceB)");
    checkCuda(cudaMalloc((void**)&deviceC, bytesC), "cudaMalloc(deviceC)");
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "It took: " << milliseconds << " ms\n";

    cudaEventRecord(start);
    std::cout << "Copying input memory to the GPU.\n";

    checkCuda(cudaMemcpy(deviceA, hostA.data(), bytesA,
                         cudaMemcpyHostToDevice), "cudaMemcpy(A, H2D)");
    checkCuda(cudaMemcpy(deviceB, hostB.data(), bytesB,
                         cudaMemcpyHostToDevice), "cudaMemcpy(B, H2D)");
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "It took: " << milliseconds << " ms\n";

    // One thread per element of C; integer ceil-div rounds the grid up so
    // the whole output is covered (the kernel bounds-checks the overhang).
    dim3 blockDim(16, 16);
    dim3 gridDim((numBColumns + blockDim.x - 1) / blockDim.x,
                 (numARows + blockDim.y - 1) / blockDim.y);

    std::cout << "The block dimensions are " << blockDim.x << " x " << blockDim.y << "\n";
    std::cout << "The grid dimensions are " << gridDim.x << " x " << gridDim.y << "\n";

    cudaEventRecord(start);
    std::cout << "Performing CUDA computation\n";
    checkCuda(cudaMemset(deviceC, 0, bytesC), "cudaMemset(deviceC)");
    sgemm<<<gridDim, blockDim>>>(deviceA, deviceB, deviceC, numARows,
        numAColumns, numBRows, numBColumns);
    // A launch reports config errors via cudaGetLastError and execution
    // errors at the next synchronizing call — check both.
    checkCuda(cudaGetLastError(), "sgemm launch");
    checkCuda(cudaDeviceSynchronize(), "sgemm execution");
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "It took: " << milliseconds << " ms\n";

    cudaEventRecord(start);
    std::cout << "Copying output memory to the CPU\n";
    checkCuda(cudaMemcpy(hostC.data(), deviceC, bytesC,
                         cudaMemcpyDeviceToHost), "cudaMemcpy(C, D2H)");
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "It took: " << milliseconds << " ms\n";

    cudaEventRecord(start);
    std::cout << "Freeing GPU Memory\n";
    checkCuda(cudaFree(deviceA), "cudaFree(deviceA)");
    checkCuda(cudaFree(deviceB), "cudaFree(deviceB)");
    checkCuda(cudaFree(deviceC), "cudaFree(deviceC)");
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "It took: " << milliseconds << " ms\n";

    // The events are no longer needed; host vectors free themselves (the
    // original leaked the malloc'd hostC and never destroyed the events).
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    for (int i = 0; i < numCRows * numCColumns; i++) {
        std::cout << hostC[i] << " ";
    }
    std::cout << "\n";

    return 0;
}