I'm new to OpenCL and want to compare the performance gain between C code and OpenCL kernels. Can someone please elaborate on which of these two methods is better/correct for profiling OpenCL code when comparing its performance with C reference code:

1. Using QueryPerformanceCounter()/__rdtsc() cycles (called inside the gettime function):

ret |= clFinish(command_queue); // empty the queue
gettime(&begin);
ret |= clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_ws, NULL, 0, NULL, NULL); // profiling disabled
ret |= clFinish(command_queue);
gettime(&end);
g_ndrangepureexectimesec = elapsed_time(&begin, &end); // performs: (end-begin)/(clock_per_cycle*clock_per_cycle*clock_per_cycle)

2. Using events profiling:

ret = clEnqueueMarker(command_queue, &evt1); // empty the queue
ret |= clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_ws, NULL, 0, NULL, &evt1);
ret |= clWaitForEvents(1, &evt1);
ret |= clGetEventProfilingInfo(evt1, CL_PROFILING_COMMAND_START, sizeof(cl_long), &begi...