[問題]關於CUDA 想請問一下 為什麼我的memory뜠…

看板C_and_CPP (C/C++)作者 (小小小寶貝)時間16年前 (2010/03/03 22:14), 編輯推噓0(000)
留言0則, 0人參與, 最新討論串1/2 (看更多)
( *[1m *[m 為色碼,可以按 Ctrl+V 預覽會顯示的顏色 ) ( 未必需要依照此格式,文章條理清楚即可 ) 遇到的問題: (題意請描述清楚) 我在test3 要用share做運算時 傳到GPU的值 會跑掉 希望得到的正確結果: 理論上在test3那邊 我的SHGA 應該還是我所宣告的100 可是值全都跑掉了 是某一次TEST2 OR TEST 1所做出來的結果 應該是TEST2 程式跑出來的錯誤結果: 開發平台: (例: VC++ or gcc/g++ or Dev-C++, Windows or Linux) C CODE 如下 http://ppt.cc/P(KJ #include<stdio.h> #include<time.h> #include<cuda.h> #define BLOCK 512 void smooth_host(float* b, float* a, int n){ for(int k=1; k<n-1; k++){ b[k]=a[k]+(a[k-1]-2*a[k]+a[k+1])*1; } b[0]=300.0; b[n-1]=300.0; } __global__ void smooth_global(float* b, float* a, int n, float* temp){ int k = blockIdx.x*blockDim.x+threadIdx.x; if(k==0){ b[k]=300.0; } else if(k==n-1){ b[k]=300.0; } else if(k<n){ b[k]=a[k]+(a[k-1]-2*a[k]+a[k+1])*1; } temp[k]=b[k]; a[k]=temp[k]; } __global__ void smooth_shared(float* b, float* a, int n, float* tempgs){ //---------------------------------------- int base = blockIdx.x*blockDim.x; int t = threadIdx.x; //---------------------------------------- __shared__ float s[BLOCK+2]; if(base+t<n){ s[t+1]=a[base+t]; } if(t==0){ //左邊界. if(base==0){ s[0]=300.0; } else{ s[0]=a[base-1]; } } if(t==32){ //右邊界. if(base+BLOCK>=n){ s[n-base+1]=300.0; } else{ s[BLOCK+1] = a[base+BLOCK]; } } __syncthreads(); if(base+t<n){ b[base+t]=s[t+1]+(s[t]-2*s[t+1]+s[t+2])*1; } tempgs[base+t]=b[base+t]; a[base+t]=tempgs[base+t]; }; int main(){ //-------------------------------------------------- int num=21; int loop=20; float* a=new float[num]; float* b=new float[num]; float* bg=new float[num]; float* bs=new float[num]; float* temp=new float[num]; float* tempgs=new float[num]; float* gg=new float[num]; float *GA, *GB, *SHGA, *SHGB, *tt, *tgss; cudaMalloc((void**) &GA, sizeof(float)*num); cudaMalloc((void**) &GB, sizeof(float)*num); cudaMalloc((void**) &SHGA, sizeof(float)*num); cudaMalloc((void**) &SHGB, sizeof(float)*num); cudaMalloc((void**) &tt, sizeof(float)*num); cudaMalloc((void**) &tgss, sizeof(float)*num); FILE *out; out=fopen("intital.txt","a"); //-------------------------------------------------- for(int k=0; k<num; k++){ a[k]=100.0; b[k]=bg[k]=bs[k]=0; fprintf(out,"i=%d, a=%f \n",k,a[k]); } cudaMemcpy(GA, a, sizeof(float)*num, cudaMemcpyHostToDevice); cudaMemcpy(SHGA, a, sizeof(float)*num, cudaMemcpyHostToDevice); //Test(1) //-------------------------------------------------- FILE *gosttt; gosttt=fopen("host.txt","a"); double t_host=(double)clock()/CLOCKS_PER_SEC; for(int k=0; k<loop; k++){ smooth_host(b,a,num); for(int j=0; j<num; j++){ temp[j]=b[j]; a[j]=temp[j]; } for(int j=0; j<num; j++){ fprintf(gosttt,"i=%d, a=%f,b=%f \n",j,a[j],b[j]); } } t_host=((double)clock()/CLOCKS_PER_SEC-t_host)/loop; //Test(2) //-------------------------------------------------- double t_global=(double)clock()/CLOCKS_PER_SEC; cudaThreadSynchronize(); for(int k=0; k<loop; k++){ smooth_global<<<num/BLOCK+1,BLOCK>>>(GB,GA,num,tt); } cudaThreadSynchronize(); t_global=((double)clock()/CLOCKS_PER_SEC-t_global)/loop; cudaMemcpy(bg, GB, sizeof(float)*num, cudaMemcpyDeviceToHost); //bg目的地 GB指向來原位置 FILE *go; go=fopen("global.txt","a"); for(int j=0; j<num; j++){ fprintf(go,"i=%d,b=%f \n",j,bg[j]); } //------------------------------------------------------------- FILE *sss; sss=fopen("sssstext.txt","a"); cudaThreadSynchronize(); cudaMemcpy(gg, SHGA, sizeof(float)*num, cudaMemcpyDeviceToHost); for(int j=0; j<num; j++){ fprintf(sss,"i=%d, a=%f \n",j,gg[j]); } //------------------------------------------------------------- cudaFree(GA); cudaFree(GB); FILE *ss; ss=fopen("sstext.txt","a"); cudaThreadSynchronize(); cudaMemcpy(gg, SHGA, sizeof(float)*num, cudaMemcpyDeviceToHost); for(int j=0; j<num; j++){ fprintf(ss,"i=%d, a=%f \n",j,gg[j]); } //Test(3) //-------------------------------------------------- double t_shared=(double)clock()/CLOCKS_PER_SEC; for(int k=0; k<loop; k++){ smooth_shared<<<num/BLOCK+1,BLOCK>>>(SHGB,SHGA,num,tgss); } cudaThreadSynchronize(); t_shared=((double)clock()/CLOCKS_PER_SEC-t_shared)/loop; cudaMemcpy(bs, SHGB, sizeof(float)*num, cudaMemcpyDeviceToHost); FILE *sh; sh=fopen("share.txt","a"); for(int k=0; k<num; k++){ fprintf(sh,"i=%d, a=%f \n",k,bs[k]); } //-------------------------------------------------- double sum_dg2=0, sum_ds2=0, sum_b2=0; for(int k=0; k<num; k++){ double dg=bg[k]-b[k]; double ds=bs[k]-b[k]; sum_b2+=b[k]*b[k]; sum_dg2+=dg*dg; sum_ds2+=ds*ds; } //output //-------------------------------------------------- printf("Smooth_Host: %g ms\n", t_host*1000); printf("Smooth_Global: %g ms\n", t_global*1000); printf("Smooth_Shared: %g ms\n", t_shared*1000); printf("\n"); //相對誤差. printf("Diff(Smooth_Global): %g \n", sqrt(sum_dg2/sum_b2)); printf("Diff(Smooth_Shared): %g \n", sqrt(sum_ds2/sum_b2)); printf("\n"); cudaFree(GA); cudaFree(GB); cudaFree(tt); cudaFree(tgss); cudaFree(SHGA); cudaFree(SHGB); delete [] a; delete [] b; delete [] bg; delete [] bs; delete [] temp; delete [] tempgs; delete [] gg; return 0; } -- ※ 發信站: 批踢踢實業坊(ptt.cc) ◆ From: 140.113.134.115 ※ 編輯: cl36260 來自: 140.113.180.60 (03/04 02:22)
文章代碼(AID): #1BZcxOtP (C_and_CPP)
文章代碼(AID): #1BZcxOtP (C_and_CPP)