// reduce.cu
//
// Sums a large float array on the GPU: each thread block reduces its slice of
// the input in shared memory and writes one partial sum; the host then adds
// the per-block partial sums together.
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <time.h>

// Threads per block used by reduction_gpu(). main() sizes the partial-sum
// buffer from the same constant so the two can never drift apart.
// Must be a power of two (see reductionKernel).
static const int kBlockSize = 256;

// Block-level sum reduction.
//
// Launch layout: 1D grid of 1D blocks. blockDim.x must be a power of two,
// because the loop below halves the stride each step and would otherwise
// skip elements.
// Dynamic shared memory: blockDim.x * sizeof(float) bytes.
//
// input  - device pointer to `size` floats.
// output - device pointer to at least gridDim.x floats; output[b] receives the
//          sum of the elements handled by block b.
// size   - number of valid elements in `input`; threads whose global index
//          is >= size contribute 0.
__global__ void reductionKernel(float* input, float* output, int size) {
    extern __shared__ float sharedData[];

    int tid = threadIdx.x;
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    // Load data into shared memory, padding the tail of the last block with 0.
    sharedData[tid] = (index < size) ? input[index] : 0.0f;
    // Every thread's element must be visible before any thread starts adding.
    __syncthreads();

    // Perform reduction in shared memory.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            sharedData[tid] += sharedData[tid + stride];
        }
        // Barrier sits outside the divergent branch so all threads reach it;
        // the next step reads values written by other threads in this step.
        __syncthreads();
    }

    // Write result for this block to global memory.
    if (tid == 0) {
        output[blockIdx.x] = sharedData[0];
    }
}

// Prints a diagnostic containing `msg` and the CUDA error string, then
// terminates the process, if `err` is anything other than cudaSuccess.
void checkCudaError(cudaError_t err, const char* msg) {
    if (err != cudaSuccess) {
        std::cerr << "CUDA Error: " << msg << " - " << cudaGetErrorString(err) << std::endl;
        exit(EXIT_FAILURE);
    }
}

// Sums `size` floats from host memory on the GPU and prints the total.
//
// input  - host pointer to `size` floats.
// output - host buffer that receives the per-block partial sums; it must hold
//          at least ceil(size / kBlockSize) floats.
// size   - number of input elements. For size <= 0 nothing is launched and a
//          total of 0 is printed.
//
// Also prints the elapsed clock() time right after the (asynchronous) launch
// call returns and again after the device has finished the kernel.
// Exits the process via checkCudaError() on any CUDA failure.
void reduction_gpu(float* input, float* output, int size) {
    // A zero-sized grid is an invalid launch configuration; handle it here.
    if (size <= 0) {
        std::cout << "Total sum: " << 0.0f << std::endl;
        return;
    }

    const int blockSize = kBlockSize;  // Number of threads per block
    // Ceil-division written so it cannot overflow int for large sizes.
    const int gridSize = (size - 1) / blockSize + 1;

    const size_t inputBytes = static_cast<size_t>(size) * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(gridSize) * sizeof(float);
    const size_t sharedBytes = static_cast<size_t>(blockSize) * sizeof(float);

    float* d_input = NULL;
    float* d_output = NULL;
    checkCudaError(cudaMalloc(&d_input, inputBytes),
                   "Failed to allocate device memory for input");
    checkCudaError(cudaMalloc(&d_output, outputBytes),
                   "Failed to allocate device memory for output");
    checkCudaError(cudaMemcpy(d_input, input, inputBytes, cudaMemcpyHostToDevice),
                   "Failed to copy data to device");

    // Launch the reduction kernel using cudaLaunchKernel.
    // The argument array must point at the DEVICE pointers: the kernel
    // dereferences these addresses on the GPU, so host pointers would fault.
    clock_t start = clock();
    void* kernelArgs[] = {&d_input, &d_output, &size};
    checkCudaError(cudaLaunchKernel((void*)reductionKernel, dim3(gridSize), dim3(blockSize),
                                    kernelArgs, sharedBytes, 0),
                   "Failed to launch reduction kernel");
    clock_t end = clock();
    double duration_kernel = (double)(end - start) / (CLOCKS_PER_SEC);
    // The launch is asynchronous, so this is only the cost of enqueueing it.
    printf("reduction_kernel: %.9lf\n", duration_kernel);

    // Check for any errors recorded by the launch.
    checkCudaError(cudaGetLastError(), "Kernel execution failed");

    // Wait for the kernel to complete; faults inside the kernel surface here.
    checkCudaError(cudaDeviceSynchronize(), "Kernel execution failed");
    end = clock();
    duration_kernel = (double)(end - start) / (CLOCKS_PER_SEC);
    printf("reduction_kernel after sync: %.9lf\n", duration_kernel);

    // Copy the partial results back to the host.
    checkCudaError(cudaMemcpy(output, d_output, outputBytes, cudaMemcpyDeviceToHost),
                   "Failed to copy data back to host");

    // Perform the final reduction on the host.
    float finalSum = 0.0f;
    for (int i = 0; i < gridSize; ++i) {
        finalSum += output[i];
    }

    std::cout << "Total sum: " << finalSum << std::endl;

    checkCudaError(cudaFree(d_input), "Failed to free device memory for input");
    checkCudaError(cudaFree(d_output), "Failed to free device memory for output");
}

// Fills a 16M-element array with 2.0f and reduces it on the GPU.
int main() {
    const int size = 1024 * 1024 * 16;
    // One partial sum per thread block launched by reduction_gpu().
    const int partialCount = (size + kBlockSize - 1) / kBlockSize;

    float* input = new float[size];
    float* output = new float[partialCount];

    // Initialize input data.
    for (int i = 0; i < size; ++i) {
        input[i] = 2.0f;
    }

    reduction_gpu(input, output, size);

    delete[] input;
    delete[] output;

    return 0;
}