//"This software contains source code provided by NVIDIA Corporation."
//"This software contains source code provided by AMD."
#include <iostream>//Ausgabe
#include <vector>
#include <utility>
//#define alloca _alloca
#define __NO_STD_VECTOR //Use cl::vektor instead of STL version
#include <functional>
#include <time.h>//für Zeitmessung erfoderlich
#include <CL\cl.hpp>
#include <stdio.h>
using namespace std;
//////////////////////////////////////////////////////////////////////////
//Definitionen
//////////////////////////////////////////////////////////////////////////
#define NWITEMS 64//hier überflüssig, siehe 0. blockSize und blocks
//////////////////////////////////////////////////////////////////////////
//Kernel
//////////////////////////////////////////////////////////////////////////
const char *source=
"__kernel void differenzBild( \n"
" __global const float4 * orginal, \n"
" __global float4 * differenz, \n"
" __local float4 * data) \n"
" \n"
"{ \n"
" uint gid0 = get_global_id(0); \n"
" uint gid1 = get_global_id(1); \n"
" uint gs0 = get_global_size(0); \n"
"// uint gs1 = get_global_size(1); \n"
" uint grid0 = get_group_id(0);//entspricht Zeile des Bildes \n"
" uint grid1 = get_group_id(1);//entspricht Nummer des Bildes \n"
"// uint ls0 = get_local_size(0);//Bildgröße \n"
" uint lid0 = get_local_id(0); \n"
" uint lid1 = get_local_id(1); \n"
" \n"
" data[lid0]=orginal[gid0+gid1*gs0]-orginal[gid0+(gid1+1)*gs0]; \n"
" //differenz[gid0+gid1*gs0]=orginal[gid0+gid1*gs0]-orginal[gid0+(gid1+1)*gs0];\n"
" //Es muss noch abschliesend geklärt werden, ob mit dem Umweg über __local data die selben Ergebnisse herauskommen \n"
" differenz[gid0+gid1*gs0]=data[lid0]; \n"
" \n"
" \n"
"} \n"
" \n"
"__kernel void localmax( \n"
" __global float4 * differenz, \n"
" __global float4 * max, \n"
" __local float4 * data) \n"
"{ \n"
" uint gid0 = get_global_id(0); \n"
" uint gid1 = get_global_id(1); \n"
" uint gs0 = get_global_size(0); \n"
"// uint gs1 = get_global_size(1); \n"
"// uint grid0 = get_group_id(0);//entspricht Zeile des Bildes \n"
"// uint grid1 = get_group_id(1);//entspricht Nummer des Bildes \n"
" uint ls0 = get_local_size(0);//entspricht float4 pro Zeile \n"
" uint lid0 = get_local_id(0); \n"
"// uint lid1 = get_local_id(1); \n"
" \n"
" data[lid0] = differenz[gid0+gid1*gs0]; \n"
" \n"
" max[gid0+gid1*gs0].s0=(data[lid0].s0<data[lid0].s1) ? 0 : data[lid0].s0 ; \n"
" max[gid0+gid1*gs0].s1=(data[lid0].s1>=data[lid0].s2 && data[lid0].s1>=data[lid0].s0) ? data[lid0].s1 : 0 ; \n"
" max[gid0+gid1*gs0].s2=(data[lid0].s2>=data[lid0].s3 && data[lid0].s2>=data[lid0].s1) ? data[lid0].s2 : 0 ; \n"
" \n"
" if(lid0<ls0-1) \n"
" { \n"
" max[gid0+gid1*gs0].s3=(data[lid0].s3>=data[lid0+1].s0 && data[lid0].s3>=data[lid0].s2) ? data[lid0].s3 : 0 ; \n"
" } \n"
" else \n"
" { \n"
" max[gid0+gid1*gs0].s3=0; \n"
" } \n"
" if(lid0!=0 && max[gid0+gid1*gs0].s0!=0) \n"
" { \n"
" max[gid0+gid1*gs0].s0=(data[lid0].s0<data[lid0-1].s3) ? 0 : data[lid0].s0; \n"
" } \n"
" if(lid0>0) \n"
" { \n"
" if(max[gid0+gid1*gs0].s0!=0) \n"
" { \n"
" max[gid0+gid1*gs0].s0=(data[lid0].s0>data[lid0-1].s0 && data[lid0].s0>data[lid0+1].s0) ? data[lid0].s0 : 0 ; \n"
" } \n"
" if(max[gid0+gid1*gs0].s1!=0) \n"
" { \n"
" max[gid0+gid1*gs0].s1=(data[lid0].s1>data[lid0-1].s1 && data[lid0].s1>data[lid0+1].s1) ? data[lid0].s1 : 0 ; \n"
" } \n"
" if(max[gid0+gid1*gs0].s2!=0) \n"
" { \n"
" max[gid0+gid1*gs0].s2=(data[lid0].s2>data[lid0-1].s2 && data[lid0].s2>data[lid0+1].s2) ? data[lid0].s2 : 0 ; \n"
" } \n"
" if(max[gid0+gid1*gs0].s3!=0) \n"
" { \n"
" max[gid0+gid1*gs0].s3=(data[lid0].s3>data[lid0-1].s3 && data[lid0].s3>data[lid0+1].s3) ? data[lid0].s3 : 0 ; \n"
" } \n"
" } \n"
" \n"
" \n"
"} \n"
;
//////////////////////////////////////////////////////////////////////////
//Fehlerbehandlung
//////////////////////////////////////////////////////////////////////////
inline void checkErr(cl_int err, const char* name)//Fehlerbehandlung da eine Anwendung geöffnet wird
{
if (err != CL_SUCCESS){
std::cerr<<"ERROR: "<<name<<" ("<<err<<")"<<std::endl;
exit(EXIT_FAILURE);
//Wenn OpenCL nicht richtig initialisiert wird, wird ein Fehler ausgegeben und das Programm beendet
//Wenn kein Fehler auftaucht passiert nichts
}
}
//////////////////////////////////////////////////////////////////////////
//Funktionen
//////////////////////////////////////////////////////////////////////////
int main (int argc, char ** argv)
{
//0. Variablen
//////////////////////////////////////////////////////////////////////////
cl_int err;
cl_uint num_entries=1;
size_t breite, höhe, anzahlBilder;
size_t auflösung;
breite = 256; //standart 256
höhe = 256; //standart 256
auflösung = breite*höhe;
anzahlBilder= 500; //standart 2000 (maximal 500 möglich)
size_t globalworksize[2]= {breite/4*(höhe+2),anzahlBilder-1};
size_t localworksize[2] = {64,1};
const size_t * ptrgws=globalworksize;
const size_t * ptrlws=localworksize;
clock_t start_cpu, start_input, end_cpu, end_input;
//////////////////////////////////////////////////////////////////////////
//1. Platform initialisieren
cl_platform_id platform;
err=clGetPlatformIDs(num_entries,&platform,NULL);
//2. Infos zur Platform
/*
C++ Befehle bzw OpenCL-Template Befehle
cl::vector <cl:

latform> platformList;
cl:

latform::get(&platformList);
checkErr(platformList.size()!=0 ? CL_SUCCESS : -1, "cl:

latform::get");
*/
cl_platform_info platforminfo[5];
string param_name[5];
char platforminfoc[1024];
param_name[0]="Gefundene Platform:";
platforminfo[0]=CL_PLATFORM_NAME;
param_name[1]="Hersteller:\t";
platforminfo[1]=CL_PLATFORM_VENDOR;
param_name[2]="Version:\t";
platforminfo[2]=CL_PLATFORM_VERSION;
param_name[3]="Erweiterungen: \t";
platforminfo[3]=CL_PLATFORM_EXTENSIONS;
param_name[4]="Unterstützungsmodus: ";
platforminfo[4]=CL_PLATFORM_PROFILE;
for(int i=0;i<5;i++){
clGetPlatformInfo(platform,platforminfo
,sizeof(platforminfoc),&platforminfoc,NULL);
cout<<param_name<<"\t"<<platforminfoc<<endl;
}
//3.1 Gpu device finden. WICHTIG:device[x] gibt die Möglichkeit x device zu erstellen
const int anzahldevices=2;
cl_uint num_devices_returned;
cl_device_id device[anzahldevices];
err=clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU,1,&device[0],&num_devices_returned);
cout<<num_devices_returned<<"x GPU devices gefunden"<<endl;
err=clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU,1,&device[1],&num_devices_returned);
cout<<num_devices_returned<<"x CPU devices gefunden"<<endl;
//3.2 Ausgabe der Device Infos
char device_string[1024];
cl_ulong device_ulong;
cl_uint device_uint;
cl_bool device_bool;
size_t device_size;
cout<<endl<<anzahldevices<<" devices gefunden:"<<endl<<endl;
for (int i=0; i<anzahldevices; i++)
{
// CL_DEVICE_NAME
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
cout<<"\t"<<device_string<<endl;
// CL_DEVICE_VENDOR
clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(device_string), &device_string, NULL);
cout<<"\t Hersteller:"<<device_string<<endl;
// CL_DEVICE_GLOBAL_MEM_CACHE_SIZE
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(device_ulong), &device_ulong, NULL);
cout<<"\t Global mem cache size in bytes:"<<device_ulong<<endl;
// CL_DEVICE_GLOBAL_MEM_CACHE_SIZE
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(device_ulong), &device_ulong, NULL);
cout<<"\t Global mem size in Mbyte:"<<device_ulong/(1048576)<<endl;
// CL_DEVICE_LOCAL_MEM_SIZE
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(device_ulong), &device_ulong, NULL);
cout<<"\t Local mem size in kbyte:"<<device_ulong/(1024)<<endl;
// CL_DEVICE_MAX_COMPUTE_UNITS
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(device_uint), &device_uint, NULL);
cout<<"\t Maximale Anzahl an compute Units: "<<device_uint<<endl;
// CL_DEVICE_MAX_CLOCK_FREQUENCY
clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(device_uint), &device_uint, NULL);
cout<<"\t Maximaler Kerntakt in MHz: "<<device_uint<<endl;
//CL_DEVICE_IMAGE_SUPPORT
clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(device_bool), &device_bool, NULL);
cout<<"\t Image Support (1=ja,0=nein): "<<device_bool<<endl;
//CL_DEVICE_IMAGE2D_MAX_HEIGHT
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(device_size), &device_size, NULL);
cout<<"\t Maximale Image2D Hoehe: "<<device_size<<endl;
//CL_DEVICE_IMAGE2D_MAX_WIDTH
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(device_size), &device_size, NULL);
cout<<"\t Maximale Image2D Breite: "<<device_size<<endl;
// CL_DEVICE_INFO
cl_device_type type;
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
cout<<"\t Type: \t";
if( type & CL_DEVICE_TYPE_CPU )
cout<<"CPU";
if( type & CL_DEVICE_TYPE_GPU )
cout<<"GPU";
if( type & CL_DEVICE_TYPE_ACCELERATOR )
cout<<"ACCELERATOR";
if( type & CL_DEVICE_TYPE_DEFAULT )
cout<<"DEFAULT"<<endl;
cout<<endl<<endl;
}
//4.1 Context für CPU und GPU
cl_context context_gpu, context_cpu;
context_gpu=clCreateContext(NULL,1,&device[0],NULL,NULL,&err);
context_cpu=clCreateContext(NULL,1,&device[1],NULL,NULL,&err);
//4.2 Queues für CPU und GPU
cl_command_queue queue_gpu, queue_cpu;
queue_gpu=clCreateCommandQueue(context_gpu, device[0],0,&err);
queue_cpu=clCreateCommandQueue(context_cpu, device[1],0,&err);
//5. Durchführen der runtime compillierung des source Codes und setzen eines kernel
// entry points für GPU und CPU
// Hier wird ein String array dazu verwendet den source-code bereitzustellen
// Alternativ gibt es auch die Möglichkeit clCreatProgrammWithBinary zu verwenden
/////////////////////////////////////////////////////////////////////////
cl_program program_gpu, program_cpu;
//Programm erstellen und compilieren
program_gpu=clCreateProgramWithSource(context_gpu,1,&source,NULL,&err);
program_cpu=clCreateProgramWithSource(context_cpu,1,&source,NULL,&err);
clBuildProgram(program_gpu,1,&device[0],NULL,NULL,NULL);
clBuildProgram(program_cpu,1,&device[1],NULL,NULL,NULL);
// Kernel erstellen
cl_kernel kernel_gpu, kernel_cpu, maxkernel_gpu, maxkernel_cpu;
kernel_gpu =clCreateKernel(program_gpu,"differenzBild",&err);
kernel_cpu =clCreateKernel(program_gpu,"differenzBild",&err);
maxkernel_gpu =clCreateKernel(program_gpu,"localmax",&err);
maxkernel_cpu =clCreateKernel(program_gpu,"localmax",&err);
////////////////////////////////////////////////////////////////////////
//6.1 Getrennte Datenbuffer erstellen für GPU und CPU
//6.2 allocate host vektoren
cl_float4 * orginalBild =new cl_float4 [breite/4*(höhe+2)*anzahlBilder];
cl_float4 * differenzBild =new cl_float4 [breite/4*(höhe+2)*(anzahlBilder-1)];
cl_float4 * data =new cl_float4 [breite/4*(höhe+2)*(anzahlBilder-1)];
cl_float4 * localmax =new cl_float4 [breite/4*(höhe+2)*(anzahlBilder-1)];
void *ptroB =orginalBild;
void *ptrdB =differenzBild;
void *ptrdata =data;
void *ptrmax =localmax;
//Alternative für SEHR große Datenmengen
//vector<int> input (ydimension*xdimension*anzahlBilder);
//vector<int> output (ydimension*xdimension*anzahlBilder);
//vector<int> C (ydimension*xdimension*anzahlBilder);
//vector<int> D (ydimension*xdimension*anzahlBilder);
// nun ist noch ein Pointer auf den Anfang nötig um den vector in setKernel etc zu verwenden
// int* ptrA=&[0] z.B. hier liegt der Pointer auf dem ersten Eintrag. Nimmt man andere kann
// man sich den Offset in den OpenCL abfragen wohl sparen
// orginalBild wird mit Daten initialisiert.
for(unsigned int a=0; a<anzahlBilder;a++)
{
for(unsigned int y=0; y<höhe+2; y++)
{
for(unsigned int x=0; x<breite/4; x++)
{
for(int i=0;i<4;i++)
{
if(y==0)
{
orginalBild[x+y*breite/4+a*(auflösung/4+2*breite/4)].s=0;
}
else
{
if(y==höhe+1)
{
orginalBild[x+y*breite/4+a*(auflösung/4+2*breite/4)].s=0;
}
else
{
orginalBild[x+y*breite/4+a*(auflösung/4+2*breite/4)].s=rand()%9+1;
}
}
}
}
}
}
cout<<"Host Speicher erstellt und initialisiert"<<endl;
// Device Speicher erstellen 0=GPU 1=CPU
cl_mem device_orginalBild, device_differenzBild, device_localmax;
device_orginalBild =clCreateBuffer (context_gpu,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,(breite/4*(höhe+2)*anzahlBilder)*sizeof(cl_float4),ptroB,&err);
device_differenzBild =clCreateBuffer (context_gpu,CL_MEM_READ_WRITE, (breite/4*(höhe+2)*(anzahlBilder-1))*sizeof(cl_float4),0,&err);
device_localmax =clCreateBuffer (context_gpu,CL_MEM_READ_WRITE, (breite/4*(höhe+2)*(anzahlBilder-1))*sizeof(cl_float4),0,&err);
cout<<"GPU Speicher erstellt"<<endl;
// Kernel Argumente setzen/übergeben
// In diesem Fall werden die Daten aus dem Host Speicher in den Device Speicher kopiert
clSetKernelArg(kernel_gpu,0,sizeof(cl_mem),(void *) &device_orginalBild);
clSetKernelArg(kernel_gpu,1,sizeof(cl_mem),(void *) &device_differenzBild);
clSetKernelArg(kernel_gpu,2,sizeof(cl_mem), NULL);
clSetKernelArg(maxkernel_gpu,0,sizeof(cl_mem),(void *) &device_differenzBild);
clSetKernelArg(maxkernel_gpu,1,sizeof(cl_mem),(void *) &device_localmax);
clSetKernelArg(maxkernel_gpu,2,sizeof(cl_mem), NULL);
// Kernel ausführen
clEnqueueNDRangeKernel(queue_gpu,kernel_gpu,2,0,ptrgws,NULL//ptrlws/*local_work_size*/
,NULL,NULL,NULL);
clEnqueueNDRangeKernel(queue_gpu,maxkernel_gpu,2,0,ptrgws,NULL//ptrlws/*local_work_size*/
,NULL,NULL,NULL);
// Kopieren der Ergebnisse vom Device zurück auf den Host
// clEnqueueReadBuffer(queue_gpu, device_orginalBild, CL_TRUE,0,(breite/4*(höhe+2)*anzahlBilder)*sizeof(cl_float4),ptroB,NULL,NULL,NULL);
// clEnqueueReadBuffer(queue_gpu, device_differenzBild, CL_TRUE,0,(breite/4*(höhe+2)*(anzahlBilder-1))*sizeof(cl_float4),ptrdB,NULL,NULL,NULL);
clEnqueueReadBuffer(queue_gpu, device_localmax, CL_TRUE,0,(breite/4*(höhe+2)*(anzahlBilder-1))*sizeof(cl_float4),ptrmax,NULL,NULL,NULL);
clEnqueueReadBuffer(queue_gpu, device_differenzBild, CL_TRUE,0,(breite/4*(höhe+2)*(anzahlBilder-1))*sizeof(cl_float4),ptrdB,NULL,NULL,NULL);
//////////////////////////////////////////////////////////////////////////
//Bench
//////////////////////////////////////////////////////////////////////////
/*
long long test=0;
for(int i=0;i<anzahlBilder*xdimension;i++)
{
test=test+output;
};
cout<<"CPU Summe: "<<b<<endl;
cout<<"GPU Summe:"<<test<<endl;
clock_t inputdif, cpudif;
inputdif=end_input-start_input;
cpudif=end_cpu-start_cpu;
cout<<"Zeit fuer die Initialisierung der Inputdaten:"<<inputdif<<endl;
cout<<"Zeit fuer die Summierung:"<<cpudif<<endl;
cout<<CLOCKS_PER_SEC<<endl;
*/
clFinish(queue_gpu);
clFinish(queue_cpu);
//////////////////////////////////////////////////////////////////////////
//Testausgabe
//////////////////////////////////////////////////////////////////////////
/*
for(unsigned int a=0; a<anzahlBilder-1;a++)
{
for(unsigned int y=0; y<höhe+2; y++)
{
for(unsigned int x=0; x<breite/4; x++)
{
cout<<endl;
for(int i=0;i<4;i++)
{
cout<<differenzBild[x+y*breite/4+a*(auflösung/4+2*breite/4)].s<<"_";
}
cout<<endl;
for(int i=0;i<4;i++)
{
cout<<localmax[x+y*breite/4+a*(auflösung/4+2*breite/4)].s<<"-";
}
}
}
}
*/
//////////////////////////////////////////////////////////////////////////
//Freigeben von Objekten
//////////////////////////////////////////////////////////////////////////
// Freigeben des reservierten Speichers auf dem Device
// clReleaseMemObject(device_orginalBild);
// clReleaseMemObject(device_differenzBild);
clReleaseKernel(kernel_gpu);
clReleaseKernel(kernel_cpu);
// clReleaseKernel(reduce2kernel_gpu); noch nicht erstellt
clReleaseContext(context_gpu);
clReleaseContext(context_cpu);
clReleaseProgram(program_gpu);
clReleaseProgram(program_cpu);
clReleaseCommandQueue(queue_gpu);
clReleaseCommandQueue(queue_cpu);
// Freigeben des reservierten Speichers auf dem Host
delete[] orginalBild;
delete[] differenzBild;
//////////////////////////////////////////////////////////////////////////
//
//////////////////////////////////////////////////////////////////////////
cout<<endl;
return system("pause");
return 0;
}