/* * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */ // ---------------------------------------------------------------------------------------- // Transpose // // This file contains both device and host code for transposing a floating-point // matrix. It performs several transpose kernels, which incrementally improve performance // through coalescing, removing shared memory bank conflicts, and eliminating partition // camping. Several of the kernels perform a copy, used to represent the best case // performance that a transpose can achieve. // // Please see the whitepaper in the docs folder of the transpose project for a detailed // description of this performance study. // ---------------------------------------------------------------------------------------- // Utilities and system includes #include #include #include const char *sSDKsample = "Transpose"; // Each block transposes/copies a tile of TILE_DIM x TILE_DIM elements // using TILE_DIM x BLOCK_ROWS threads, so that each thread transposes // TILE_DIM/BLOCK_ROWS elements. TILE_DIM must be an integral multiple of BLOCK_ROWS #define TILE_DIM 16 #define BLOCK_ROWS 16 // Modified the amount of workload, so during device emulation it runs in a reasonable amount of time. // This sample assumes that MATRIX_SIZE_X = MATRIX_SIZE_Y #ifdef __DEVICE_EMULATION__ int MATRIX_SIZE_X = 128; int MATRIX_SIZE_Y = 128; int MUL_FACTOR = 4; #else int MATRIX_SIZE_X = 1024; int MATRIX_SIZE_Y = 1024; int MUL_FACTOR = TILE_DIM; #endif #define FLOOR(a,b) (a-(a%b)) // Compute the tile size necessary to illustrate performance cases for SM12+ hardware int MAX_TILES_SM12 = (FLOOR(MATRIX_SIZE_X,512) * FLOOR(MATRIX_SIZE_Y,512)) / (TILE_DIM*TILE_DIM); // Compute the tile size necessary to illustrate performance cases for SM10,SM11 hardware int MAX_TILES_SM10 = (FLOOR(MATRIX_SIZE_X,384) * FLOOR(MATRIX_SIZE_Y,384)) / (TILE_DIM*TILE_DIM); // Number of repetitions used for timing. Two sets of repetitions are performed: // 1) over kernel launches and 2) inside the kernel over just the loads and stores #define NUM_REPS 100 // ------------------------------------------------------- // Copies // width and height must be integral multiples of TILE_DIM // ------------------------------------------------------- __global__ void copy(float *odata, float* idata, int width, int height, int nreps) { int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; int index = xIndex + width*yIndex; for (int r=0; r < nreps; r++) { for (int i=0; i max_tile_dim) { shrLog("> MatrixSize X = %d is greater than the recommended size = %d\n", size_x, max_tile_dim); } else { shrLog("> MatrixSize X = %d\n", size_x); } } else { size_x = max_tile_dim; // If this is SM12 hardware, we want to round down to a multiple of 512 if ((deviceProp.major == 1 && deviceProp.minor >= 2) || deviceProp.major > 1 ) { size_x = FLOOR(size_x, 512); } else { // else for SM10,SM11 we round down to a multiple of 384 size_x = FLOOR(size_x, 384); } } if ( cutGetCmdLineArgumenti(argc, (const char **) argv, "dimy", &size_y) || cutGetCmdLineArgumenti(argc, (const char **) argv, "dimY", &size_y) ) { if (size_y > max_tile_dim) { shrLog("> MatrixSize Y = %d is greater than the recommended size = %d\n", size_y, max_tile_dim); } else { shrLog("> MatrixSize Y = %d\n", size_y); } } else { size_y = max_tile_dim; // If this is SM12 hardware, we want to round down to a multiple of 512 if ((deviceProp.major == 1 && deviceProp.minor >= 2) || deviceProp.major > 1) { size_y = FLOOR(size_y, 512); } else { // else for SM10,SM11 we round down to a multiple of 384 size_y = FLOOR(size_y, 384); } } } void showHelp() { shrLog("\n> Command line options\n", sSDKsample); shrLog("\t-device=n (where n=0,1,2.... for the GPU device)\n\n"); shrLog("> The default matrix size can be overridden with these parameters\n"); shrLog("\t-dimX=row_dim_size (matrix row dimensions)\n"); shrLog("\t-dimY=col_dim_size (matrix column dimensions)\n"); } // ---- // main // ---- int main( int argc, char** argv) { shrQAStart(argc, argv); // Start logs shrSetLogFileName ("transpose.txt"); shrLog("%s Starting...\n\n", argv[0]); if( cutCheckCmdLineFlag(argc, (const char**)argv, "help") ) { showHelp(); return 0; } // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cutilSafeCall( cudaSetDevice( cutGetMaxGflopsDeviceId() ) ); int devID; cudaDeviceProp deviceProp; // get number of SMs on this GPU cutilSafeCall(cudaGetDevice(&devID)); cutilSafeCall(cudaGetDeviceProperties(&deviceProp, devID)); // compute the scaling factor (for GPUs with fewer MPs) float scale_factor, total_tiles; scale_factor = max((192.0f / (ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount) ), 1.0f); shrLog("> Device %d: \"%s\"\n", devID, deviceProp.name); shrLog("> SM Capability %d.%d detected:\n", deviceProp.major, deviceProp.minor); // Calculate number of tiles we will run for the Matrix Transpose performance tests int size_x, size_y, max_matrix_dim, matrix_size_test; if ((deviceProp.major == 1 && deviceProp.minor >= 2) || deviceProp.major > 1) { matrix_size_test = 512; // we round down max_matrix_dim for this perf test total_tiles = (float)MAX_TILES_SM12 / scale_factor; } else { matrix_size_test = 384; // we round down max_matrix_dim for this perf test total_tiles = (float)MAX_TILES_SM10 / scale_factor; } max_matrix_dim = FLOOR((int)(floor(sqrt(total_tiles))* TILE_DIM), matrix_size_test); // This is the minimum size allowed if (max_matrix_dim == 0) max_matrix_dim = matrix_size_test; shrLog("> [%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name, deviceProp.multiProcessorCount, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); shrLog("> Compute performance scaling factor = %4.2f\n", scale_factor); // Extract parameters if there are any, command line -dimx and -dimy can override // any of these settings getParams(argc, argv, deviceProp, size_x, size_y, max_matrix_dim); if (size_x != size_y) { shrLog("\n[%s] does not support non-square matrices (row_dim_size(%d) != col_dim_size(%d))\nExiting...\n\n", sSDKsample, size_x, size_y); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } if (size_x%TILE_DIM != 0 || size_y%TILE_DIM != 0) { shrLog("[%s] Matrix size must be integral multiple of tile size\nExiting...\n\n", sSDKsample); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } // kernel pointer and descriptor void (*kernel)(float *, float *, int, int, int); char *kernelName; // execution configuration parameters dim3 grid(size_x/TILE_DIM, size_y/TILE_DIM), threads(TILE_DIM,BLOCK_ROWS); if (grid.x < 1 || grid.y < 1) { shrLog("[%s] grid size computation incorrect in test \nExiting...\n\n", sSDKsample); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } // CUDA events cudaEvent_t start, stop; // size of memory required to store the matrix const int mem_size = sizeof(float) * size_x*size_y; // allocate host memory float *h_idata = (float*) malloc(mem_size); float *h_odata = (float*) malloc(mem_size); float *transposeGold = (float *) malloc(mem_size); float *gold; // allocate device memory float *d_idata, *d_odata; cutilSafeCall( cudaMalloc( (void**) &d_idata, mem_size) ); cutilSafeCall( cudaMalloc( (void**) &d_odata, mem_size) ); // initalize host data for( int i = 0; i < (size_x*size_y); ++i) h_idata[i] = (float) i; // copy host data to device cutilSafeCall( cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice) ); // Compute reference transpose solution computeTransposeGold(transposeGold, h_idata, size_x, size_y); // print out common data for all kernels shrLog("\nMatrix size: %dx%d (%dx%d tiles), tile size: %dx%d, block size: %dx%d\n\n", size_x, size_y, size_x/TILE_DIM, size_y/TILE_DIM, TILE_DIM, TILE_DIM, TILE_DIM, BLOCK_ROWS); // initialize events cutilSafeCall( cudaEventCreate(&start) ); cutilSafeCall( cudaEventCreate(&stop) ); // // loop over different kernels // CUTBoolean success = CUTTrue; for (int k = 0; k<8; k++) { // set kernel pointer switch (k) { case 0: kernel = © kernelName = "simple copy "; break; case 1: kernel = ©SharedMem; kernelName = "shared memory copy"; break; case 2: kernel = &transposeNaive; kernelName = "naive "; break; case 3: kernel = &transposeCoalesced; kernelName = "coalesced "; break; case 4: kernel = &transposeNoBankConflicts; kernelName = "optimized "; break; case 5: kernel = &transposeCoarseGrained; kernelName = "coarse-grained "; break; case 6: kernel = &transposeFineGrained; kernelName = "fine-grained "; break; case 7: kernel = &transposeDiagonal; kernelName = "diagonal "; break; } // set reference solution if (kernel == © || kernel == ©SharedMem) { gold = h_idata; } else if (kernel == &transposeCoarseGrained || kernel == &transposeFineGrained) { gold = h_odata; // fine- and coarse-grained kernels are not full transposes, so bypass check } else { gold = transposeGold; } // Clear error status cutilSafeCall( cudaGetLastError() ); // warmup to avoid timing startup kernel<<>>(d_odata, d_idata, size_x, size_y, 1); // take measurements for loop over kernel launches cutilSafeCall( cudaEventRecord(start, 0) ); for (int i=0; i < NUM_REPS; i++) { kernel<<>>(d_odata, d_idata, size_x, size_y, 1); // Ensure no launch failure cutilSafeCall( cudaGetLastError() ); } cutilSafeCall( cudaEventRecord(stop, 0) ); cutilSafeCall( cudaEventSynchronize(stop) ); float outerTime; cutilSafeCall( cudaEventElapsedTime(&outerTime, start, stop) ); cutilSafeCall( cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost) ); CUTBoolean res = cutComparef(gold, h_odata, size_x*size_y); if (res == CUTFalse) { shrLog("*** %s kernel FAILED ***\n", kernelName); success = CUTFalse; } // take measurements for loop inside kernel cutilSafeCall( cudaEventRecord(start, 0) ); kernel<<>>(d_odata, d_idata, size_x, size_y, NUM_REPS); cutilSafeCall( cudaEventRecord(stop, 0) ); cutilSafeCall( cudaEventSynchronize(stop) ); float innerTime; cutilSafeCall( cudaEventElapsedTime(&innerTime, start, stop) ); cutilSafeCall( cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost) ); res = cutComparef(gold, h_odata, size_x*size_y); if (res == CUTFalse) { shrLog("*** %s kernel FAILED ***\n", kernelName); success = CUTFalse; } // report effective bandwidths float outerBandwidth = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(outerTime/NUM_REPS); float innerBandwidth = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(innerTime/NUM_REPS); shrLog("\n"); shrLogEx(LOGBOTH | MASTER, 0, "transpose-Outer-%s, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", kernelName, outerBandwidth, outerTime/NUM_REPS, (size_x * size_y), 1, TILE_DIM * BLOCK_ROWS); shrLogEx(LOGBOTH | MASTER, 0, "transpose-Inner-%s, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", kernelName, innerBandwidth, innerTime/NUM_REPS, (size_x * size_y), 1, TILE_DIM * BLOCK_ROWS); } // cleanup free(h_idata); free(h_odata); free(transposeGold); cutilSafeCall( cudaFree(d_idata) ); cutilSafeCall( cudaFree(d_odata) ); cutilSafeCall( cudaEventDestroy(start) ); cutilSafeCall( cudaEventDestroy(stop) ); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, (success == CUTTrue) ? QA_PASSED : QA_FAILED); return 0; }