thysol
BIOS-Overclocker(in)
Hi,
Ich moechte gerne die Zeit messen, die ein Device braucht, um einen Kernel auszufuehren. Das mache ich so:
Hier ist der gesamte Code:
Ich moechte gerne die Zeit messen die ein Device braucht um einen Kernel auszufuehren. Das mache ich so:
Allerdings ist die gemessene Zeit immer gleich: Auch wenn der Kernel gefuehlt mehrere Sekunden zum Ausfuehren braucht, gibt der Timer immer 0.00813 Sekunden an. Diese Zeit bekomme ich immer, egal ob ich den Kernel auf der CPU oder auf der GPU ausfuehre. Das kann ja wohl nicht sein. Was habe ich falsch gemacht?
ret = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_START,sizeof(long long),&kernelsStartTime,NULL);
ret = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_END,sizeof(long long),&kernelsEndTime,NULL);
Hier ist der gesamte Code:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
void *threadFunc(void *arg)
{
cl_double totalKernelTime;
long long kernelsStartTime;
long long kernelsEndTime;
cl_event event;
int i;
const int LIST_SIZE = 1000000;
float *A = (float*)malloc(sizeof(float)*LIST_SIZE);
float *B = (float*)malloc(sizeof(float)*LIST_SIZE);
float *C = (float*)malloc(sizeof(float)*LIST_SIZE);
float *D = (float*)malloc(sizeof(float)*LIST_SIZE);
float *E = (float*)malloc(sizeof(float)*LIST_SIZE);
float *F = (float*)malloc(sizeof(float)*LIST_SIZE);
float *G = (float*)malloc(sizeof(float)*LIST_SIZE);
float *H = (float*)malloc(sizeof(float)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A = 1000 + (i * 0.099);
B = 200000 + (i * 9.8);
C = 6378140 + B;
}
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel2.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1,
&device_id, &ret_num_devices);
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
cl_mem d_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
cl_mem e_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
cl_mem f_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
cl_mem g_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
cl_mem h_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), B, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), C, 0, NULL, NULL);
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&d_mem_obj);
ret = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&e_mem_obj);
ret = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&f_mem_obj);
ret = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&g_mem_obj);
ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&h_mem_obj);
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 50; // Process one item at a time
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, &event);
ret = clWaitForEvents(1, &event);
ret = clEnqueueReadBuffer(command_queue, d_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), D, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, e_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), E, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, f_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), F, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, g_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), G, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, h_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), H, 0, NULL, NULL);
ret = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_START,sizeof(long long),&kernelsStartTime,NULL);
ret = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_END,sizeof(long long),&kernelsEndTime,NULL);
totalKernelTime = (double)(kernelsEndTime - kernelsStartTime)/1e9;
clReleaseEvent(event);
printf("%f\n", totalKernelTime);
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseMemObject(d_mem_obj);
ret = clReleaseMemObject(e_mem_obj);
ret = clReleaseMemObject(f_mem_obj);
ret = clReleaseMemObject(g_mem_obj);
ret = clReleaseMemObject(h_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
free(D);
free(E);
free(F);
free(G);
free(H);
return 0;
}
int main(void) {
pthread_t pth; // this is our thread identifier
pthread_create(&pth,NULL,threadFunc,"processing...");
cl_double totalKernelTime;
long long kernelsStartTime;
long long kernelsEndTime;
cl_event event;
int i;
const int LIST_SIZE = 1000000;
float *A = (float*)malloc(sizeof(float)*LIST_SIZE);
float *B = (float*)malloc(sizeof(float)*LIST_SIZE);
float *C = (float*)malloc(sizeof(float)*LIST_SIZE);
float *D = (float*)malloc(sizeof(float)*LIST_SIZE);
float *E = (float*)malloc(sizeof(float)*LIST_SIZE);
float *F = (float*)malloc(sizeof(float)*LIST_SIZE);
float *G = (float*)malloc(sizeof(float)*LIST_SIZE);
float *H = (float*)malloc(sizeof(float)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A = 1000 + (i * 0.099);
B = 200000 + (i * 9.8);
C = 6378140 + B;
}
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel2.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1,
&device_id, &ret_num_devices);
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
cl_mem d_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
cl_mem e_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
cl_mem f_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
cl_mem g_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
cl_mem h_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(float), NULL, &ret);
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), B, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), C, 0, NULL, NULL);
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&d_mem_obj);
ret = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&e_mem_obj);
ret = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&f_mem_obj);
ret = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&g_mem_obj);
ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&h_mem_obj);
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 50; // Process one item at a time
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, &event);
ret = clWaitForEvents(1, &event);
ret = clEnqueueReadBuffer(command_queue, d_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), D, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, e_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), E, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, f_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), F, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, g_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), G, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, h_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(float), H, 0, NULL, NULL);
pthread_join(pth, NULL /* void ** return value could go here */);
ret = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_START,sizeof(long long),&kernelsStartTime,NULL);
ret = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_END,sizeof(long long),&kernelsEndTime,NULL);
totalKernelTime = (double)(kernelsEndTime - kernelsStartTime)/1e9;
clReleaseEvent(event);
printf("%f\n", totalKernelTime);
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseMemObject(d_mem_obj);
ret = clReleaseMemObject(e_mem_obj);
ret = clReleaseMemObject(f_mem_obj);
ret = clReleaseMemObject(g_mem_obj);
ret = clReleaseMemObject(h_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
free(D);
free(E);
free(F);
free(G);
free(H);
return 0;
}
Zuletzt bearbeitet: