rdtsc

What is rdtsc

  • rdtsc stands for ReaD Time-Stamp Counter
  • Limitation
    • only works on x86 platforms

A simple implementation for the x86_64 platform

/*
 * Read the processor time-stamp counter via the RDTSC instruction.
 *
 * Returns the counter as one 64-bit value assembled from the two 32-bit
 * halves that the instruction delivers in EAX (low) and EDX (high).
 */
uint64_t rdtsc()
{
   uint32_t low_half;
   uint32_t high_half;
   uint64_t ticks;

   /* "=a" binds EAX to low_half, "=d" binds EDX to high_half. */
   __asm__ __volatile__ ("rdtsc" : "=a"(low_half), "=d"(high_half));

   ticks = high_half;
   ticks <<= 32;
   ticks |= low_half;
   return ticks;
}

rdtsc.h

#ifndef __RDTSC_H_DEFINED__
#define __RDTSC_H_DEFINED__


#if defined(__i386__)

/*
 * i386 variant: read the 64-bit time-stamp counter.
 *
 * The instruction is emitted as raw opcode bytes (0x0f 0x31 encodes RDTSC).
 * The "=A" constraint asks GCC to take the 64-bit result from the EDX:EAX
 * register pair on 32-bit x86.
 */
static __inline__ unsigned long long rdtsc(void)
{
  unsigned long long int ticks;

  __asm__ volatile (".byte 0x0f, 0x31" : "=A" (ticks));

  return ticks;
}
#elif defined(__x86_64__)

/*
 * x86_64 variant: read the 64-bit time-stamp counter.
 *
 * RDTSC delivers the value as two 32-bit halves (EAX = low, EDX = high),
 * which are combined here into one unsigned long long.
 */
static __inline__ unsigned long long rdtsc(void)
{
  unsigned low_half, high_half;
  unsigned long long ticks;

  __asm__ __volatile__ ("rdtsc" : "=a"(low_half), "=d"(high_half));

  ticks = high_half;
  ticks <<= 32;
  ticks |= low_half;
  return ticks;
}

#elif defined(__powerpc__)

/*
 * PowerPC variant: read the time base as a 64-bit tick count.
 *
 * The upper half is read before and after the lower half; if the two upper
 * reads differ, the lower half wrapped in between and the sequence is
 * retried, so the returned halves always belong together.
 */
static __inline__ unsigned long long rdtsc(void)
{
  unsigned long int upper, lower, tmp;

  __asm__ volatile(
                "0:                  \n"
                "\tmftbu   %0           \n"
                "\tmftb    %1           \n"
                "\tmftbu   %2           \n"
                "\tcmpw    %2,%0        \n"
                "\tbne     0b         \n"
                : "=r"(upper),"=r"(lower),"=r"(tmp)
                :
                /* cmpw writes condition register field 0; tell the compiler
                 * so it does not keep a live comparison result there. */
                : "cr0"
                );

  return ((unsigned long long)upper << 32) | lower;
}

#else

#error "No tick counter is available!"

#endif


/*  $RCSfile:  $   $Author: kazutomo $
 *  $Revision: 1.6 $  $Date: 2005/04/13 18:49:58 $
 */

#endif

Sample one

#include <stdio.h>
#include "rdtsc.h"

/*
 * Print the tick difference between two consecutive rdtsc() readings,
 * i.e. the overhead of taking a reading.
 */
int main(int argc, char* argv[])
{
  unsigned long long first;
  unsigned long long delta;

  first = rdtsc();
  delta = rdtsc() - first;

  printf("%llu\n", delta);
  return 0;
}

Sample two

#include <stdio.h>
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#include "rdtsc.h"

#define N (1024*1024*2)

/*
 * Time a one-byte store to each of N bytes of a heap buffer with rdtsc()
 * and print the smallest and largest tick count observed.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(int argc, char* argv[])
{
  unsigned long long start, delta;
  unsigned long long min = UINT64_MAX;
  unsigned long long max = 0;
  /* volatile so the compiler cannot discard the store being timed */
  volatile char *p;
  int i;

  (void)argc;
  (void)argv;

  p = (volatile char *)malloc(N);
  if (p == NULL) {
    /* explicit check: assert() would vanish in NDEBUG builds */
    fprintf(stderr, "malloc(%d) failed\n", N);
    return EXIT_FAILURE;
  }

  for (i = 0; i < N; i++) {
    start = rdtsc();
    p[i] = 0;
    delta = rdtsc() - start;

    /* Two independent tests: a sample that sets a new maximum may also be
     * the minimum seen so far (always true for the very first sample). */
    if (delta > max) {
      max = delta;
    }
    if (delta < min) {
      min = delta;
    }
  }

  printf("min=%llu\n", min);
  printf("max=%llu\n", max);

  free((void *)p);
  return 0;
}

TODO: translate CPU cycles to time

  • time_in_seconds = number_of_clock_cycles / frequency
  • cat /proc/cpuinfo |grep 'MHz'
  • gettimeofday and rdtsc

Things to consider

  • Context switch?
  • Multi-core processors

gettimeofday

#include <iostream>
#include <sys/time.h> // for gettimeofday()
using namespace std;

// Times a code region with gettimeofday() and prints the elapsed
// wall-clock time in milliseconds.
int main()
{
    struct timeval begin, end;

    // start timer
    gettimeofday(&begin, NULL);

    // do something
    // ...

    // stop timer
    gettimeofday(&end, NULL);

    // whole seconds and leftover microseconds, each converted to ms
    const double secondsPartMs = (end.tv_sec - begin.tv_sec) * 1000.0;
    const double microsPartMs = (end.tv_usec - begin.tv_usec) / 1000.0;

    std::cout << secondsPartMs + microsPartMs << " ms.\n";

    return 0;
}

gettimeofday vs rdtsc, source code

#include <iostream>
#include <sys/time.h> // for gettimeofday()
#include <stdlib.h> // for malloc
#include <string.h> // for memcpy
#include "rdtsc.h"

// CPU clock in MHz; take the value from: cat /proc/cpuinfo |grep 'MHz'
#define CPUMHZ 1200.0f
// Cycles per microsecond. 1 MHz is 10^6 cycles per second, which is exactly
// one cycle per microsecond, so the MHz figure is used unscaled.
// NOTE(review): assumes the time-stamp counter advances at CPUMHZ - confirm
// on the target machine.
#define CPU_SPEED_US (CPUMHZ)

using namespace std;
// Microseconds elapsed between two gettimeofday() samples.
static double elapsed_us(const struct timeval& from, const struct timeval& to)
{
    double us = (to.tv_sec - from.tv_sec) * 1000000.0; // sec to us
    us += (to.tv_usec - from.tv_usec);
    return us;
}

// Times a small malloc() and a loop of 1 KiB memcpy() calls twice each:
// once with gettimeofday() and once with rdtsc(), printing both results.
// The rdtsc figure for memcpy is divided by the iteration count; the
// gettimeofday figure is the total for the whole loop.
int main()
{
    struct timeval begin, end;
    unsigned long long startTicks, ticks;

    // test malloc
    std::cout << "malloc:\n";
    gettimeofday(&begin, NULL);
    char* first = (char*)malloc(4);
    gettimeofday(&end, NULL);
    free(first);
    std::cout << "\t" << elapsed_us(begin, end) << " us (by gettimeofday)\n";

    startTicks = rdtsc();
    char* second = (char*)malloc(4);
    ticks = rdtsc() - startTicks;
    free(second);
    std::cout << "\ttimestamp count is " << ticks << ", elapsedTime " << ticks/CPU_SPEED_US << " us\n";

    // test memcpy
    std::cout << "memcpy:\n";
    char dst[1024] = {0};
    char src[1024] = {'a'};
    int count = 1000;

    gettimeofday(&begin, NULL);
    for (int n = 0; n < count; ++n) {
        memcpy(dst, src, sizeof(dst));
    }
    gettimeofday(&end, NULL);
    std::cout << "\t" << elapsed_us(begin, end) << " us (by gettimeofday)\n";

    startTicks = rdtsc();
    for (int n = 0; n < count; ++n) {
        memcpy(dst, src, sizeof(dst));
    }
    ticks = (rdtsc() - startTicks) / count;
    std::cout << "\ttimestamp count is " << ticks << ", elapsedTime " << ticks/CPU_SPEED_US << " us\n";
    return 0;
}

output

dennis@dennis:~/tt$ g++ -O3 -o t t.cc
dennis@dennis:~/tt$ ./t
malloc:
    0 us (by gettimeofday)
    timestamp count is 58, elapsedTime 0.0460943 us
memcpy:
    0 us (by gettimeofday)
    timestamp count is 0, elapsedTime 0 us
dennis@dennis:~/tt$ g++ -O2 -o t t.cc
dennis@dennis:~/tt$ ./t
malloc:
    0 us (by gettimeofday)
    timestamp count is 69, elapsedTime 0.0548363 us
memcpy:
    0 us (by gettimeofday)
    timestamp count is 0, elapsedTime 0 us
dennis@dennis:~/tt$ g++ -O1 -o t t.cc
dennis@dennis:~/tt$ ./t
malloc:
    0 us (by gettimeofday)
    timestamp count is 58, elapsedTime 0.0460943 us
memcpy:
    2 us (by gettimeofday)
    timestamp count is 3, elapsedTime 0.00238419 us
dennis@dennis:~/tt$ g++ -o t t.cc
dennis@dennis:~/tt$ ./t
malloc:
    80 us (by gettimeofday)
    timestamp count is 1198, elapsedTime 0.952085 us
memcpy:
    102 us (by gettimeofday)
    timestamp count is 295, elapsedTime 0.234445 us
dennis@dennis:~/tt$ g++ -g -o t t.cc
dennis@dennis:~/tt$ ./t
malloc:
    82 us (by gettimeofday)
    timestamp count is 1421, elapsedTime 1.12931 us
memcpy:

    timestamp count is 295, elapsedTime 0.234445 us

Reference