בעיות אופטימיזציה בCUDA

שלום לכולם ,הקוד הבא לקוח מהאתר של אונב' סטנפורד. יש לי כמה שאלות לגביו.
עבור הפונקציה :adjacent_difference
קיימת ההגדרה :

extern __shared__ int s_data[];

אני לא ממש מבין בשביל מה אני צריך את החלק של ה

extern

ע"פ מה שכתוב בהסברים נאמר שהגודל נקבע בצורה דינאמית בזמן ההפעלה של הפונקציה.

אבל זה לא ממש מסתדר.
אני מניח , שיש קשר לקריאה עצמה לפונקציה

קוד:

adjacent_difference_simple<<<num_blocks,block_size,block_size*sizeof(int)>>>(d_result, d_input);

עבור הפרמטר השלישי שבין ה <<<>>> אבל זה לא מסתדר כי בעצם שולחים את כמות הבלוקים כפול גודל של משתנה . זה לא היה אמור להיות פשוט גודל המערך?

תודה לעוזרים!

קוד:


// This example demonstrates the use of shared per-block variables to
 
// implement an optimized adjacent difference algorithm. In this example,
 
// a per-block __shared__ array acts as a "bandwidth multiplier" by eliminating
 
// redundant loads issued by neighboring threads.
 
 
 
#include <stdlib.h>
 
#include <vector>
 
#include <algorithm>
 
#include <iostream>
 
 
 
// compute the number of lines of code each implementation requires
 
const unsigned int simple_implementation_begin = __LINE__;
 
 
 
// a simple version of adjacent_difference which issues redundant loads from off-chip global memory
 
__global__ void adjacent_difference_simple(int *result, int *input)
 
{
 
// compute this thread's global index
 
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
 
 
 
if(i > 0)
 
{
 
	// each thread loads two elements from global memory
 
	int x_i = input[i];
 
	int x_i_minus_one = input[i-1];
 
 
 
	// compute the difference using values stored in registers
 
	result[i] = x_i - x_i_minus_one;
 
}
 
}
 
const unsigned int simple_implementation_size = __LINE__ - simple_implementation_begin;
 
 
 
 
 
const unsigned int optimized_implementation_begin = __LINE__;
 
 
 
// an optimized version of adjacent_difference which eliminates redundant loads
 
__global__ void adjacent_difference(int *result, int *input)
 
{
 
// a __shared__ array with one element per thread
 
// the size of the array is allocated dynamically upon kernel launch
 
extern __shared__ int s_data[];
 
 
 
// each thread reads one element to s_data
 
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
 
 
 
// since one array gets allocated per-block, we index the array using our
 
// per-block thread index, threadIdx
 
// the global array, input, is indexed as usual
 
s_data[threadIdx.x] = input[i];
 
 
 
// avoid race condition: ensure all loads to s_data complete before we try to read from it
 
__syncthreads();
 
 
 
if(threadIdx.x > 0)
 
{
 
	// compute the difference directly from s_data
 
	// it is implemented in fast, on-chip memory
 
	result[i] = s_data[threadIdx.x] - s_data[threadIdx.x - 1];
 
}
 
else if(i > 0)
 
{
 
	// handle thread block boundary
 
	// the first thread in a block needs data that was read by the
 
	// last thread of the previous block into its shared array
 
	// this thread can't access that array, so issue one redundant load per block
 
	result[i] = s_data[threadIdx.x] - input[i-1];
 
}
 
}
 
const unsigned int optimized_implementation_size = __LINE__ - optimized_implementation_begin;
 
 
 
 
 
int main(void)
 
{
 
// create a large workload so we can easily measure the
 
// performance difference of both implementations
 
const size_t block_size = 512;
 
const size_t num_blocks = (1<<24) / block_size;
 
const size_t n = num_blocks * block_size;
 
 
 
// generate random input on the host
 
std::vector<int> h_input(n);
 
std::generate(h_input.begin(), h_input.end(), rand);
 
 
 
// allocate storage for the device
 
int *d_input = 0, *d_result = 0;
 
cudaMalloc((void**)&d_input, sizeof(int) * n);
 
cudaMalloc((void**)&d_result, sizeof(int) * n);
 
 
 
// copy input to the device
 
cudaMemcpy(d_input, &h_input[0], sizeof(int) * n, cudaMemcpyHostToDevice);
 
 
 
// time the kernel launches using CUDA events
 
cudaEvent_t launch_begin, launch_end;
 
cudaEventCreate(&launch_begin);
 
cudaEventCreate(&launch_end);
 
 
 
// to get accurate timings, launch a single "warm-up" kernel
 
 
 
// dynamically allocate the __shared__ array by passing its
 
// size in bytes to the 3rd parameter of the triple chevrons
 
adjacent_difference_simple<<<num_blocks,block_size,block_size*sizeof(int)>>>(d_result, d_input);
 
 
 
const size_t num_launches = 100;
 
 
 
// time many kernel launches and take the average time
 
float average_simple_time = 0;
 
for(int i = 0; i < num_launches; ++i)
 
{
 
	// record a CUDA event immediately before and after the kernel launch
 
	cudaEventRecord(launch_begin,0);
 
	adjacent_difference_simple<<<num_blocks,block_size,block_size*sizeof(int)>>>(d_result, d_input);
 
	cudaEventRecord(launch_end,0);
 
	cudaEventSynchronize(launch_end);
 
 
 
	// measure the time spent in the kernel
 
	float time = 0;
 
	cudaEventElapsedTime(&time, launch_begin, launch_end);
 
 
 
	average_simple_time += time;
 
}
 
average_simple_time /= num_launches;
 
 
 
// now time the optimized kernel
 
 
 
// again, launch a single "warm-up" kernel
 
adjacent_difference<<<num_blocks,block_size,block_size*sizeof(int)>>>(d_result, d_input);
 
 
 
// time many kernel launches and take the average time
 
float average_optimized_time = 0;
 
for(int i = 0; i < num_launches; ++i)
 
{
 
	// record a CUDA event immediately before and after the kernel launch
 
	cudaEventRecord(launch_begin,0);
 
	adjacent_difference<<<num_blocks,block_size,block_size*sizeof(int)>>>(d_result, d_input);
 
	cudaEventRecord(launch_end,0);
 
	cudaEventSynchronize(launch_end);
 
 
 
	// measure the time spent in the kernel
 
	float time = 0;
 
	cudaEventElapsedTime(&time, launch_begin, launch_end);
 
 
 
	average_optimized_time += time;
 
}
 
average_optimized_time /= num_launches;
 
 
 
// report the effective throughput of each kernel in GB/s
 
// the effective throughput is measured as size of input read + size of output written divided by time
 
float simple_throughput = static_cast<float>(2 * n * sizeof(int)) / (average_simple_time / 1000.0f) / 1000000000.0f;
 
float optimized_throughput = static_cast<float>(2 * n * sizeof(int)) / (average_optimized_time / 1000.0f) / 1000000000.0f;
 
 
 
// compute throughput per line of code to measure how productive we were
 
float simple_throughput_per_sloc = simple_throughput / simple_implementation_size;
 
float optimized_throughput_per_sloc = optimized_throughput / optimized_implementation_size;
 
 
 
std::cout << "Work load size: " << n << std::endl;
 
std::cout << "Simple implementation SLOCs: " << simple_implementation_size << std::endl;
 
std::cout << "Optimized implementation SLOCs: " << optimized_implementation_size << std::endl << std::endl;
 
 
 
std::cout << "Throughput of simple kernel: " << simple_throughput << " GB/s" << std::endl;
 
std::cout << "Throughput of optimized kernel: " << optimized_throughput << " GB/s" << std::endl;
 
std::cout << "Performance improvement: " << optimized_throughput / simple_throughput << "x" << std::endl;
 
std::cout << std::endl;
 
 
 
std::cout << "Throughput of simple kernel per line of code: " << simple_throughput_per_sloc << " GB/s/sloc" << std::endl;
 
std::cout << "Throughput of optimized kernel per line of code: " << optimized_throughput_per_sloc << " GB/s/sloc" << std::endl;
 
std::cout << "Performance improvement per line of code: " << optimized_throughput_per_sloc / simple_throughput_per_sloc << "x" << std::endl;
 
 
 
// destroy the CUDA events
 
cudaEventDestroy(launch_begin);
 
cudaEventDestroy(launch_end);
 
 
 
// deallocate device memory
 
cudaFree(d_input);
 
cudaFree(d_result);
 
 
 
return 0;
 
}

כלי אשכול	חפש באשכול זה
הצג גירסת הדפסה שלח דף זה ב E-Mail	חפש באשכול זה: חיפוש מתקדם
מצבי תצוגה	דרג אשכול זה
עבור למצב לינארי מצב כלאיים עבור למצב משורשר	דרג אשכול זה:

הדף נוצר ב 0.07 שניות עם 10 שאילתות
צור קשר - Fresh - למעלה