[問題]關於CUDA 想請問一下 為什麼我的memory뜠…
( *[1m *[m 為色碼,可以按 Ctrl+V 預覽會顯示的顏色 )
( 未必需要依照此格式,文章條理清楚即可 )
遇到的問題: (題意請描述清楚)
我在test3 要用share做運算時 傳到GPU的值 會跑掉
希望得到的正確結果:
理論上在test3那邊 我的SHGA 應該還是我所宣告的100 可是值全都跑掉了
是某一次TEST2 OR TEST 1所做出來的結果 應該是TEST2
程式跑出來的錯誤結果:
開發平台: (例: VC++ or gcc/g++ or Dev-C++, Windows or Linux)
C
CODE 如下
http://ppt.cc/P(KJ
#include<stdio.h>
#include<time.h>
#include<cuda.h>
#define BLOCK 512
void smooth_host(float* b, float* a, int n){
for(int k=1; k<n-1; k++){
b[k]=a[k]+(a[k-1]-2*a[k]+a[k+1])*1;
}
b[0]=300.0;
b[n-1]=300.0;
}
__global__ void smooth_global(float* b, float* a, int n, float* temp){
int k = blockIdx.x*blockDim.x+threadIdx.x;
if(k==0){
b[k]=300.0;
}
else if(k==n-1){
b[k]=300.0;
}
else if(k<n){
b[k]=a[k]+(a[k-1]-2*a[k]+a[k+1])*1;
}
temp[k]=b[k];
a[k]=temp[k];
}
__global__ void smooth_shared(float* b, float* a, int n, float* tempgs){
//----------------------------------------
int base = blockIdx.x*blockDim.x;
int t = threadIdx.x;
//----------------------------------------
__shared__ float s[BLOCK+2];
if(base+t<n){
s[t+1]=a[base+t];
}
if(t==0){
//左邊界.
if(base==0){
s[0]=300.0;
}
else{
s[0]=a[base-1];
}
}
if(t==32){
//右邊界.
if(base+BLOCK>=n){
s[n-base+1]=300.0;
}
else{
s[BLOCK+1] = a[base+BLOCK];
}
}
__syncthreads();
if(base+t<n){
b[base+t]=s[t+1]+(s[t]-2*s[t+1]+s[t+2])*1;
}
tempgs[base+t]=b[base+t];
a[base+t]=tempgs[base+t];
};
int main(){
//--------------------------------------------------
int num=21;
int loop=20;
float* a=new float[num];
float* b=new float[num];
float* bg=new float[num];
float* bs=new float[num];
float* temp=new float[num];
float* tempgs=new float[num];
float* gg=new float[num];
float *GA, *GB, *SHGA, *SHGB, *tt, *tgss;
cudaMalloc((void**) &GA, sizeof(float)*num);
cudaMalloc((void**) &GB, sizeof(float)*num);
cudaMalloc((void**) &SHGA, sizeof(float)*num);
cudaMalloc((void**) &SHGB, sizeof(float)*num);
cudaMalloc((void**) &tt, sizeof(float)*num);
cudaMalloc((void**) &tgss, sizeof(float)*num);
FILE *out;
out=fopen("intital.txt","a");
//--------------------------------------------------
for(int k=0; k<num; k++){
a[k]=100.0;
b[k]=bg[k]=bs[k]=0;
fprintf(out,"i=%d, a=%f \n",k,a[k]);
}
cudaMemcpy(GA, a, sizeof(float)*num, cudaMemcpyHostToDevice);
cudaMemcpy(SHGA, a, sizeof(float)*num, cudaMemcpyHostToDevice);
//Test(1)
//--------------------------------------------------
FILE *gosttt;
gosttt=fopen("host.txt","a");
double t_host=(double)clock()/CLOCKS_PER_SEC;
for(int k=0; k<loop; k++){
smooth_host(b,a,num);
for(int j=0; j<num; j++){
temp[j]=b[j];
a[j]=temp[j];
}
for(int j=0; j<num; j++){
fprintf(gosttt,"i=%d, a=%f,b=%f \n",j,a[j],b[j]);
}
}
t_host=((double)clock()/CLOCKS_PER_SEC-t_host)/loop;
//Test(2)
//--------------------------------------------------
double t_global=(double)clock()/CLOCKS_PER_SEC;
cudaThreadSynchronize();
for(int k=0; k<loop; k++){
smooth_global<<<num/BLOCK+1,BLOCK>>>(GB,GA,num,tt);
}
cudaThreadSynchronize();
t_global=((double)clock()/CLOCKS_PER_SEC-t_global)/loop;
cudaMemcpy(bg, GB, sizeof(float)*num, cudaMemcpyDeviceToHost); //bg目的地
GB指向來原位置
FILE *go;
go=fopen("global.txt","a");
for(int j=0; j<num; j++){
fprintf(go,"i=%d,b=%f \n",j,bg[j]);
}
//-------------------------------------------------------------
FILE *sss;
sss=fopen("sssstext.txt","a");
cudaThreadSynchronize();
cudaMemcpy(gg, SHGA, sizeof(float)*num, cudaMemcpyDeviceToHost);
for(int j=0; j<num; j++){
fprintf(sss,"i=%d, a=%f \n",j,gg[j]);
}
//-------------------------------------------------------------
cudaFree(GA);
cudaFree(GB);
FILE *ss;
ss=fopen("sstext.txt","a");
cudaThreadSynchronize();
cudaMemcpy(gg, SHGA, sizeof(float)*num, cudaMemcpyDeviceToHost);
for(int j=0; j<num; j++){
fprintf(ss,"i=%d, a=%f \n",j,gg[j]);
}
//Test(3)
//--------------------------------------------------
double t_shared=(double)clock()/CLOCKS_PER_SEC;
for(int k=0; k<loop; k++){
smooth_shared<<<num/BLOCK+1,BLOCK>>>(SHGB,SHGA,num,tgss);
}
cudaThreadSynchronize();
t_shared=((double)clock()/CLOCKS_PER_SEC-t_shared)/loop;
cudaMemcpy(bs, SHGB, sizeof(float)*num, cudaMemcpyDeviceToHost);
FILE *sh;
sh=fopen("share.txt","a");
for(int k=0; k<num; k++){
fprintf(sh,"i=%d, a=%f \n",k,bs[k]);
}
//--------------------------------------------------
double sum_dg2=0, sum_ds2=0, sum_b2=0;
for(int k=0; k<num; k++){
double dg=bg[k]-b[k];
double ds=bs[k]-b[k];
sum_b2+=b[k]*b[k];
sum_dg2+=dg*dg;
sum_ds2+=ds*ds;
}
//output
//--------------------------------------------------
printf("Smooth_Host: %g ms\n", t_host*1000);
printf("Smooth_Global: %g ms\n", t_global*1000);
printf("Smooth_Shared: %g ms\n", t_shared*1000);
printf("\n");
//相對誤差.
printf("Diff(Smooth_Global): %g \n", sqrt(sum_dg2/sum_b2));
printf("Diff(Smooth_Shared): %g \n", sqrt(sum_ds2/sum_b2));
printf("\n");
cudaFree(GA);
cudaFree(GB);
cudaFree(tt);
cudaFree(tgss);
cudaFree(SHGA);
cudaFree(SHGB);
delete [] a;
delete [] b;
delete [] bg;
delete [] bs;
delete [] temp;
delete [] tempgs;
delete [] gg;
return 0;
}
--
※ 發信站: 批踢踢實業坊(ptt.cc)
◆ From: 140.113.134.115
※ 編輯: cl36260 來自: 140.113.180.60 (03/04 02:22)
討論串 (同標題文章)
以下文章回應了本文:
完整討論串 (本文為第 1 之 2 篇):
C_and_CPP 近期熱門文章
PTT數位生活區 即時熱門文章