Re: [問題] 使用CUDA來擷取矩陣中部分值

看板C_and_CPP (C/C++)作者時間16年前 (2010/02/23 10:09), 編輯推噓1(1011)
留言12則, 1人參與, 最新討論串6/10 (看更多)
我執行的結果正常啊 取上下左右的部份都沒問題

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cutil.h>

#define NNx 4            /* matrix width  (number of columns) */
#define NNy 4            /* matrix height (number of rows)    */
#define BLOCK_SIZE 16    /* threads per block along each axis (16*16 = 256) */

/* Abort with file/line and the CUDA error string when a runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Drops the top row of a row-major matrix.
 *
 * input1  : NNx columns x NNy rows       (device pointer)
 * output1 : NNx columns x (NNy-1) rows   (device pointer)
 *
 * Launch with a 2D grid covering at least NNx x (NNy-1) threads; threads
 * that fall outside that range return without touching memory, so the grid
 * does not have to divide the matrix evenly.
 */
__global__ void movaIM5(float *input1, float *output1)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    /* Guard the grid tail: without it a rounded-up grid reads/writes OOB. */
    if (col >= NNx || row >= NNy - 1)
        return;

    output1[row * NNx + col] = input1[(row + 1) * NNx + col];   /* take the lower sub-matrix */
    /* output1[row * NNx + col] = input1[row * NNx + col]; */   /* take the upper sub-matrix */
}

/*
 * Drops the leftmost column of a row-major matrix.
 *
 * input1  : NNx columns x NNy rows       (device pointer)
 * output1 : (NNx-1) columns x NNy rows   (device pointer)
 *
 * Launch with a 2D grid covering at least (NNx-1) x NNy threads; threads
 * outside that range return immediately.
 */
__global__ void movaIM6(float *input1, float *output1)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col >= NNx - 1 || row >= NNy)
        return;

    output1[row * (NNx - 1) + col] = input1[row * NNx + col + 1];   /* take the right sub-matrix */
    /* output1[row * (NNx - 1) + col] = input1[row * NNx + col]; */ /* take the left sub-matrix  */
}

/*
 * Fills an NNx x NNy matrix with 1..NNx*NNy, prints it, extracts the lower
 * and right sub-matrices on the GPU, and prints both results.
 */
int main(int argc, char* argv[])
{
    int i;
    const size_t inBytes  = sizeof(float) * NNx * NNy;
    const size_t im5Bytes = sizeof(float) * NNx * (NNy - 1);
    const size_t im6Bytes = sizeof(float) * (NNx - 1) * NNy;

    /* Heap allocation so large sizes (e.g. 512x512) do not overflow the stack. */
    float *input = (float*) malloc(inBytes);
    float *IM5   = (float*) malloc(im5Bytes);
    float *IM6   = (float*) malloc(im6Bytes);
    if (input == NULL || IM5 == NULL || IM6 == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(input);
        free(IM5);
        free(IM6);
        return EXIT_FAILURE;
    }

    for (i = 0; i < NNx * NNy; i++) {
        input[i] = i + 1;
        if (i % NNx == 0)
            printf("\n");
        printf("%3.0f ", input[i]);
    }
    printf("\n");

    float *d_input, *d_IM5, *d_IM6;
    CUDA_CHECK(cudaMalloc((void**)&d_input, inBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_IM5, im5Bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_IM6, im6Bytes));
    CUDA_CHECK(cudaMemcpy(d_input, input, inBytes, cudaMemcpyHostToDevice));

    /*
     * Fixed-size blocks with a ceil-div grid: works for any NNx/NNy, unlike
     * a single block of NNx x NNy threads, which exceeds the per-block
     * thread limit once the matrix grows. The kernels' bounds guards make
     * the surplus threads harmless.
     */
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 blocks((NNx + BLOCK_SIZE - 1) / BLOCK_SIZE,
                (NNy + BLOCK_SIZE - 1) / BLOCK_SIZE);

    movaIM5<<<blocks, threads>>>(d_input, d_IM5);
    CUDA_CHECK(cudaGetLastError());   /* launch-configuration errors */
    movaIM6<<<blocks, threads>>>(d_input, d_IM6);
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copies: they also surface any fault raised inside the kernels. */
    CUDA_CHECK(cudaMemcpy(IM5, d_IM5, im5Bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(IM6, d_IM6, im6Bytes, cudaMemcpyDeviceToHost));

    for (i = 0; i < NNx * (NNy - 1); i++) {
        if (i % NNx == 0)
            printf("\n");
        printf("%3.0f ", IM5[i]);
    }
    printf("\n");

    for (i = 0; i < (NNx - 1) * NNy; i++) {
        if (i % (NNx - 1) == 0)
            printf("\n");
        printf("%3.0f ", IM6[i]);
    }
    printf("\n");

    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_IM5));
    CUDA_CHECK(cudaFree(d_IM6));

    /* The original never released the host buffers. */
    free(input);
    free(IM5);
    free(IM6);

    system("pause");
    return 0;
}

-- ※ 發信站: 批踢踢實業坊(ptt.cc) ◆ From: 122.120.40.234

02/23 12:08, , 1F
如果我今天矩陣是比較大的話,如512*512
02/23 12:08, 1F

02/23 12:10, , 2F
dim3 blocks, threadsIM5好像就不可以這樣寫了,是嗎
02/23 12:10, 2F

02/23 12:20, , 3F
int bx = (NNx + BLOCK_SIZE - 1) / BLOCK_SIZE;
02/23 12:20, 3F

02/23 12:20, , 4F
dim3 blocks(bx, bx);
02/23 12:20, 4F

02/23 12:21, , 5F
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
02/23 12:21, 5F

02/23 12:21, , 6F
BLOCK_SIZE 我設為16
02/23 12:21, 6F

02/23 22:22, , 7F
我IM5大致上試出來了
02/23 22:22, 7F

02/23 22:23, , 8F
dim3 blocksIM5( 16, 73);
02/23 22:23, 8F

02/23 22:23, , 9F
dim3 threadsIM5( 32, 7 );
02/23 22:23, 9F

02/23 22:23, , 10F
movaIM5<<<blocksIM5, threadsIM5>>>( d_input, d_IM5 );
02/23 22:23, 10F

02/23 22:25, , 11F
這樣剛好處理(16*32=512, 73*7=511) --> 512*511的矩陣
02/23 22:25, 11F

02/23 22:25, , 12F
我這樣寫的方式OK嗎,有什麼地方可以改進的
02/23 22:25, 12F
文章代碼(AID): #1BWpZJXb (C_and_CPP)
討論串 (同標題文章)
文章代碼(AID): #1BWpZJXb (C_and_CPP)