HaWe Brickbench Benchmark Test 2.0

HaWe · 14.07.2018, 15:34

HaWe Brickbench Benchmark Test 2.1
angepasste Routinen zum besseren Vergleich von SoCs und MCUs,
ver 2.1: inkl GPIO toggle r/w test:

Code:

// HaWe Brickbench
// benchmark test for SoCs and MCUs
// PL: GCC, Raspi, Raspbian  Linux
// Autor: (C) Helmut Wunder 2013-2018
// ported to Raspi  by "HaWe"
//
// freie Verwendung für private Zwecke
// für kommerzielle Zwecke nur nach schriftlicher Genehmigung durch den Autor.
// protected under the friendly Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License
// http://creativecommons.org/licenses/by-nc-sa/3.0/
// version 2.1 2018-07-14
// feat. GPIO toggle r/w test



#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>


#include <stdint.h>
#include <time.h>
#include <sys/time.h>
#include <wiringPi.h>

//#include "VG/openvg.h"
#include "VG/vgu.h"
#include "fontinfo.h"
#include "shapes.h"


// Elapsed time in milliseconds of each of the 8 sub-tests, indexed by test number.
unsigned long runtime[8];

// a, b and c are filled with random values in main() and referenced by test_Sort().
// The global t is not used by the visible code; test_Sort() declares a local
// array of the same name that shadows it.
int a[500], b[500], c[500], t[500];

#define tpin1  17  // GPIO test pins digitalWrite (BCM)
#define tpin2  18  // GPIO test pins digitalWrite (BCM)
#define tpin3  27  // GPIO test pins digitalRead (BCM)

// Returns the current time of day in milliseconds, truncated to 32 bits.
// The value wraps around roughly every 49.7 days (2^32 ms); the difference of
// two readings is still correct when it is computed in 32-bit unsigned
// arithmetic.
uint32_t timer()
{
   struct timeval now;
   uint64_t ms;

   gettimeofday(&now, NULL);
   // Widen before multiplying: where time_t is a signed 32-bit type,
   // tv_sec*1000 overflows (undefined behaviour) before the truncation.
   ms = (uint64_t)now.tv_sec * 1000u + (uint64_t)now.tv_usec / 1000u;
   return (uint32_t)ms;
}


//--------------------------------------------
// Mersenne Twister
//--------------------------------------------

// Pseudo-random number generator with a 25-word state array.
// The state is filled from fixed constants on the first call, so every
// program run produces the same sequence. Each call returns the next state
// word after a few shift/mask mixing steps; after 25 calls the whole state
// is regenerated in place.
unsigned long randM(void) {
   enum { STATE_LEN = 25, OFFSET = 7 };
   static const unsigned long mag[2] = { 0, 0x8ebfd028 };

   static unsigned long state[STATE_LEN];
   static int pos = STATE_LEN + 1;       // a value > STATE_LEN means "not seeded yet"

   if (pos >= STATE_LEN) {
      int i;

      if (pos > STATE_LEN) {
         // first call only: seed the state from two simple recurrences
         unsigned long r = 9;
         unsigned long s = 3402;
         for (i = 0; i < STATE_LEN; i++) {
            r = 509845221 * r + 3;
            s = s * (s + 1);
            state[i] = s + (r >> 10);
         }
      }

      // regenerate every word from itself and a partner word OFFSET
      // positions ahead (wrapping around the end of the array)
      for (i = 0; i < STATE_LEN; i++) {
         int partner = (i < STATE_LEN - OFFSET) ? i + OFFSET
                                                : i + OFFSET - STATE_LEN;
         unsigned long w = state[i];
         state[i] = state[partner] ^ (w >> 1) ^ mag[w & 1];
      }
      pos = 0;
   }

   unsigned long out = state[pos];
   pos++;
   out ^= (out << 7) & 0x2b5b2500;
   out ^= (out << 15) & 0xdb8b0000;
   out ^= out >> 16;
   return out;
}


//--------------------------------------------
// Matrix Algebra
//--------------------------------------------

// matrix * matrix multiplication (matrix product)
//
// Computes C = A * B for matrices stored row by row in flat arrays:
//   A has N rows and M columns,
//   B has M rows and K columns,
//   C has N rows and K columns and is overwritten.
void MatrixMatrixMult(int N, int M, int K, double *A, double *B, double *C) {
   int i, j, s;
   for (i = 0; i < N; ++i) {
      for (j = 0; j < K; ++j) {
         C[i*K+j] = 0;
         for (s = 0; s < M; ++s) {
            // a row of A is M elements long, a row of B is K elements long
            C[i*K+j] = C[i*K+j] + A[i*M+s] * B[s*K+j];
         }
      }
   }
}


// matrix determinant
//
// Returns the determinant of the N x N matrix stored in the flat array A,
// computed by recursive cofactor expansion. The number of recursive calls
// grows factorially with N, so this is only suitable for small matrices.
// Returns 0 for N <= 0.
// Elements are addressed as A[i + j*N]; whether that is read as row-major
// or column-major does not matter here, because a matrix and its transpose
// have the same determinant.
double MatrixDet(int N, double A[]) {
   int i, j, i_count, j_count, count;
   double det = 0;
   double sign = 1;

   if (N <= 0)
      return 0;
   if (N == 1)
      return A[0];
   if (N == 2)
      return A[0] * A[1+1*N] - A[1*N] * A[1];

   // Declared only after the small cases are handled: a variable-length
   // array needs a size greater than zero, which N-1 is only from here on.
   double Asub[N - 1][N - 1];

   for (count = 0; count < N; count++) {
      // build the minor that leaves out index 0 in one direction and
      // index `count` in the other
      i_count = 0;
      for (i = 1; i < N; i++) {
         j_count = 0;
         for (j = 0; j < N; j++) {
            if (j == count)
               continue;
            Asub[i_count][j_count] = A[i+j*N];
            j_count++;
         }
         i_count++;
      }
      det += sign * A[0+count*N] * MatrixDet(N - 1, &Asub[0][0]);
      sign = -sign;   // alternating cofactor sign (+1, -1, +1, ...)
   }
   return det;
}





//--------------------------------------------
// shell sort
//--------------------------------------------

// Sorts the first `size` elements of A into ascending order, in place,
// using insertion passes over elements `gap` apart with a shrinking gap.
void shellsort(int size, int* A)
{
  int gap = size / 2;

  while (gap > 0) {
    int pos;
    for (pos = gap; pos < size; pos++) {
      int held = A[pos];
      int slot = pos;
      // move larger elements of this gap-chain one step up
      for (; slot >= gap && A[slot - gap] > held; slot -= gap) {
        A[slot] = A[slot - gap];
      }
      A[slot] = held;
    }

    // next gap: divide by 2.2, except that gap 2 is always followed by gap 1
    if (gap != 2)
       gap = (unsigned int) (gap / 2.2);
    else
       gap = 1;
  }
}

//--------------------------------------------
// gnu quick sort
// (optional)
//--------------------------------------------

// Three-way comparison of two ints:
// returns 1 if *a > *b, -1 if *a < *b and 0 if they are equal.
// The values are compared directly rather than through *a - *b, because
// that subtraction overflows (undefined behaviour) for operands that are
// far apart, e.g. INT_MIN and 1, and would then report the wrong order.
int compare_int (const int *a, const int *b)
{
  if (*a > *b)           return  1;
  else if (*a < *b)      return -1;
  else                   return  0;
}

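// gnu qsort:
// void qsort (void *base, size_t count, size_t size, compare_function)
// gnu qsort call for a[500] array of int (size = size of ONE element):
// qsort (a, 500, sizeof(a[0]), compare_int)
// note: qsort expects int (*)(const void *, const void *), so compare_int
// needs a matching wrapper or a cast when it is passed here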



//--------------------------------------------
// benchmark test procedures
//--------------------------------------------


// Integer addition benchmark: adds ten int values to an accumulator in each
// of 5,000,000 loop iterations (50,000,000 additions in total).
// The five positive and the five negative values cancel each other out, so
// the accumulator is back at 0 after every iteration and 0 is returned.
int test_Int_Add() {
   int i=1, j=11, k=112, l=1111, m=11111, n=-1, o=-11, p=-111, q=-1112, r=-11111;
   int x;
   volatile long s=0;   // volatile: every addition must really be carried out on s
   for(x=0;x<5000000;++x) { 
     s+=i; s+=j; s+=k; s+=l; s+=m; s+=n; s+=o; s+=p; s+=q; s+=r;
   }
   return s;
}


//--------------------------------------------
// Integer multiply/divide benchmark: 500,000 rounds, each doing ten
// multiplications followed by ten divisions (10,000,000 operations in total).
// Every round starts at 1 and ends at 1 again, so 1 is returned.
long test_Int_Mult() {
  int x,y;
  volatile long s;   // volatile: every multiplication/division must really be carried out on s

  for(y=0;y<500000;++y) {  
    s=1;
    for(x=1;x<=10;++x) { s*=x;} // builds 1*2*...*10 = 3628800
    for(x=10;x>0;--x) { s/=x;}  // divides it back down to 1

  }
  return s;
}


#define PI  M_PI

//--------------------------------------------
// Floating-point benchmark: 500,000 iterations, each applying sqrt, sin and
// exp plus two multiplications to one double that starts at PI
// (2,500,000 operations in total). Returns the final value.
double test_float_math() {

  volatile double s=PI;   // volatile: every step must really be carried out on s
  int y;

  for(y=0;y<500000;++y) {  
     s*=sqrt(s);
     s=sin(s);
     s=exp(s);
     s*=s;
  }
  return s;
}


//--------------------------------------------
// Random number benchmark: calls randM() 2,500,000 times and reduces each
// result modulo 10001 (i.e. to the range 0..10000).
// Returns the last reduced value.
long test_rand_MT(){
  volatile unsigned long s;   // volatile: every result must really be stored
  int y;

  for(y=0;y<2500000;++y) {  
     s=randM()%10001;
  }
  return s;
}






//--------------------------------------------
// Matrix benchmark: 50,000 iterations, each performing one 2x2 matrix
// product, one 2x2 determinant and one 3x3 determinant
// (150,000 matrix operations in total).
// The results of the three calls are not used; the return value is the
// product of the diagonal of S (1*5*9 = 45), passed through an unsigned long.
// NOTE(review): since C and the determinants are never read, an optimizing
// compiler may be able to drop part of this work - confirm with the build
// flags that are actually used.
double test_matrix_math() {
  int x;

  double A[2][2], B[2][2], C[2][2];
  double S[3][3], T[3][3];   // T is declared but never used
  unsigned long s;

  for(x=0;x<50000;++x) { 

    A[0][0]=1;   A[0][1]=3;
    A[1][0]=2;   A[1][1]=4;

    B[0][0]=10;  B[0][1]=30;
    B[1][0]=20;  B[1][1]=40;

    MatrixMatrixMult(2, 2, 2, A[0], B[0], C[0]);  // C = A * B

    // A is assigned the same values again (it was not modified above)
    A[0][0]=1;   A[0][1]=3;
    A[1][0]=2;   A[1][1]=4;
    
    MatrixDet(2, A[0]);                          // result discarded

    S[0][0]=1;   S[0][1]=4;  S[0][2]=7;
    S[1][0]=2;   S[1][1]=5;  S[1][2]=8;
    S[2][0]=3;   S[2][1]=6;  S[2][2]=9;

    MatrixDet(3, S[0]);                          // result discarded

  }

  s=(S[0][0]*S[1][1]*S[2][2]);
  return s;
}



//--------------------------------------------
// for array copy using void *memcpy(void *dest, const void *src, size_t n);

// Sort benchmark: 500 rounds, each copying the global arrays a, b and c in
// turn into a scratch buffer and shell-sorting that copy
// (1500 sorts of 500 elements in total).
// Returns the number of rounds (500).
long test_Sort(){
  int y;
  int t[500];   // scratch copy; the global arrays stay unsorted for the next round

  for(y=0;y<500;++y) {
    memcpy(t, a, sizeof(a));
    shellsort(500, t);

    // source and size argument must name the same array, otherwise b and c
    // are never actually sorted
    memcpy(t, b, sizeof(b));
    shellsort(500, t);

    memcpy(t, c, sizeof(c));
    shellsort(500, t);
  }

  return y;
}




//--------------------------------------------
// GPIO benchmark: 2,000,000 iterations, each writing to tpin1, reading
// tpin3 and writing to tpin2 (6,000,000 GPIO accesses in total).
// Always returns 1.
int32_t test_GPIO(){
   // static: the toggle state of w carries over from one call to the next
   volatile static bool w=false, r;
   uint32_t y;
   for (y=0; y<2000000; y++) {      
         digitalWrite(tpin1, w);      // output the current toggle state
         w=!w;                        // toggle
         r=digitalRead(tpin3);        // sample the input pin
         digitalWrite(tpin2, w&!r);   // output depends on toggle state and input
   }
   return 1;
}


//--------------------------------------------
// Text output benchmark: 10 passes, each clearing the background and drawing
// eight formatted text lines, every one followed by End().
// The numbers in the lines are fixed placeholders (0..7 and 1000..1070), not
// measured runtimes. Returns the number of passes (10).
long test_TextOut(){


  int  y=77;      // the initial value is overwritten by the loop below
  char buf[120];

  for(y=0;y<10;++y) { 
	  
    Background(0, 0, 0);                    // Black background
    
    //Text(x, y, buf, SerifTypeface, 10); 
    Fill(255, 255, 255, 1);                 // White text
    
    // one line every 20 units, going down from y=180 to y=40
    sprintf (buf, "%3d %4d  int_Add",    0, 1000); Text( 20, 200- 20, buf, SerifTypeface, 10); End();
    sprintf (buf, "%3d %4d  int_Mult",   1, 1010); Text( 20, 200- 40, buf, SerifTypeface, 10); End(); 
    sprintf (buf, "%3d %4d  float_op",   2, 1020); Text( 20, 200- 60, buf, SerifTypeface, 10); End();
    sprintf (buf, "%3d %4d  randomize",  3, 1030); Text( 20, 200- 80, buf, SerifTypeface, 10); End();
    sprintf (buf, "%3d %4d  matrx_algb", 4, 1040); Text( 20, 200-100, buf, SerifTypeface, 10); End();
    sprintf (buf, "%3d %4d  arr_sort",   5, 1050); Text( 20, 200-120, buf, SerifTypeface, 10); End();
    sprintf (buf, "%3d %4d  GPIO_toggle", 6, 1060); Text( 20, 200-140, buf, SerifTypeface, 10); End();
    sprintf (buf, "%3d %4d  testing...", 7, 1070); Text( 20, 200-160, buf, SerifTypeface, 10); End();

  }

  return y;
}



// Shape drawing benchmark: 10 passes, each clearing the window, setting
// stroke and fill to white and drawing eight shapes (3 circle calls, 2 lines,
// 2 rectangles, 1 more circle outline), every one followed by End().
// Returns the number of passes (10).
long test_graphics(){
    int y=0;
     
    for(y=0;y<10;++y) {  
        WindowClear();      // Colour and size are remembered from the
                            // ClearWindowRGBA() call at the start of the program
        Stroke(255, 255, 255, 1);  // Set these at the start, no need to
        Fill(255,255,255, 1);      // keep calling them if colour hasn't changed
        StrokeWidth(1.0);  
        
        End();
   
        CircleOutline(50, 40, 10);       // circles
        End();

        Circle(30, 24, 10);
        End();

        Line(10, 10, 60, 60);            // just 2 intersecting lines
        End();
        Line(50, 20, 90, 70);
        End();

        RectOutline(20, 20, 40, 40);     // rectangles
        End();
      
        Rect(65, 25, 20, 30);
        End();

        CircleOutline(70, 30, 15);  // formerly ellipse
        End();
    }
    return y;
}




// Clears the window and prints the result table to stdout: one line per
// sub-test with its index, its runtime[] entry in milliseconds and its label.
// static: a plain `inline` function has no external definition in C99/C11,
// so a call that the compiler does not inline could fail to link.
static inline void displayValues() { // text line patterns
  static const char *const label[8] = {
    "int_Add", "int_Mult", "float_op", "randomize",
    "matrx_algb", "arr_sort", "GPIO_toggle", "graphics"
  };
  int i;

  WindowClear();      // Colour and size are remembered from the start of the program

  for (i = 0; i < 8; ++i) {
    // %lu matches the unsigned long runtime[]; the text goes through a fixed
    // format string instead of being handed to printf() as the format itself
    printf("%3d %7lu  %s\n", i, runtime[i], label[i]);
  }
}



// Prints one progress line: test index, elapsed milliseconds and label,
// in the same layout as the final result table.
static void print_runtime(int index, unsigned long ms, const char *label) {
  printf("%3d %7lu  %s\n", index, ms, label);
}

// Program entry: initialises graphics and GPIO, fills the sort arrays with
// random data, runs the eight sub-tests while timing each of them with
// timer(), then prints the result table, the total time and the benchmark
// score, and waits for [RETURN] before cleaning up.
// Returns 0 on success, 1 if wiringPi could not be initialised.
int main(){

  uint32_t time0;            // same width as timer(), so a difference stays correct if the tick value wraps
  unsigned long x, total;
  int width, height;
  char str[3];

  InitShapes(&width, &height);            // Graphics initialization
  Start(width, height);                   // Start the picture

  WindowClear();
  // NOTE(review): the original comment read "Hide the picture" both here and
  // at WindowOpacity(0) below; presumably 255 makes the window visible -
  // confirm against the shapes library.
  WindowOpacity(255);

  printf("hw brickbench\n");
  printf("initializing...\n");

  // wiringPi
  setenv ("WIRINGPI_GPIOMEM", "1", 1) ;   // no sudo for gpios required
  int iores = wiringPiSetupGpio();        // init by BCM pin numbering
  if( iores == -1 ) {
    fprintf(stderr, "wiringPiSetupGpio failed\n");
    FinishShapes();                       // graphics were already initialised: clean up before leaving
    return 1;
  }
  pinMode( tpin1, OUTPUT);
  pinMode( tpin2, OUTPUT);
  pinMode( tpin3, INPUT);  pullUpDnControl( tpin3, PUD_UP);

  // random input data for the sort test
  for(x=0;x<500;++x) {
    a[x]=randM()%30000; b[x]=randM()%30000; c[x]=randM()%30000;
  }

  // The return values of the tests are not needed here, only the elapsed time.
  time0=timer();
  (void)test_Int_Add();
  runtime[0]=timer()-time0;
  print_runtime(0, runtime[0], "int_Add");

  time0=timer();
  (void)test_Int_Mult();
  runtime[1]=timer()-time0;
  print_runtime(1, runtime[1], "int_Mult");

  time0=timer();
  (void)test_float_math();
  runtime[2]=timer()-time0;
  print_runtime(2, runtime[2], "float_op");

  time0=timer();
  (void)test_rand_MT();
  runtime[3]=timer()-time0;
  print_runtime(3, runtime[3], "randomize");

  time0=timer();
  (void)test_matrix_math();
  runtime[4]=timer()-time0;
  print_runtime(4, runtime[4], "matrx_algb");

  time0=timer();
  (void)test_Sort();
  runtime[5]=timer()-time0;
  print_runtime(5, runtime[5], "arr_sort");

  time0=timer();
  (void)test_GPIO();
  runtime[6]=timer()-time0;
  print_runtime(6, runtime[6], "GPIO_toggle");

  // test 7 is the text output and the shape drawing together
  time0=timer();
  (void)test_TextOut();
  (void)test_graphics();
  runtime[7]=timer()-time0;
  print_runtime(7, runtime[7], "Graphics");

  WindowOpacity(0);       // Hide  the picture

  total=0;
  for(x=0;x<8;++x) {total+= runtime[x];}
  printf("\n\n");

  displayValues();

  printf("gesamt ms: %lu \n", total);
  // score is inversely proportional to the total time; guard the division in
  // case every test finished in under a millisecond
  printf("benchmark: %lu \n", total ? 50000000/total : 0UL);

  // look at the pic, end with [RETURN]
  if (fgets(str, 2, stdin) == NULL) {
    // stdin is closed or failed: nothing to wait for, go on to the cleanup
  }
  FinishShapes();                         // Graphics cleanup
  return 0;
}

Code:

test design:
  0   int_Add     50,000,000 int +,- plus counter
  1   int_Mult    10,000,000 int *,/  plus counter
  2   float_op    2,500,000 fp mult, transc.  plus counter
  3   randomize   2,500,000 Mersenne PRNG (+ * & ^ << >>)
  4   matrx_algb  150,000 2D Matrix algebra (mult, det)
  5   arr_sort    1500 shellsort of random array[500]
  6   GPIO toggle 6,000,000 toggle GPIO r/w  plus counter
  7   Graphics    10*8 textlines + 10*8 shapes + 20 clrscr 

.

Code:

Raspi 2, GPU CLOCK 400MHz, NO CPU OVERCLOCK, openVG:
  0     384  int_Add
  1     439  int_Mult
  2     441  float_op (double)
  3     399  randomize
  4     173  matrx_algb
  5     508  arr_sort
  6     823  GPIO_toggle
  7    2632  graphics
gesamt ms: 5799 
benchmark: 8622 

.

Vergleichswerte von Arduinos etc.: https://www.roboternetz.de/community...C3%BCr-Arduino

HaWe · 15.07.2018, 17:53

kann sich jemand von den Python-Programmierern hier vorstellen, wie man diesen Code (auch ggf erstmal nur ein paar Teile davon) nach Python portiert (reine Python3 Bordmittel, also kein eingebundener kompilierter C++ Code)?
1 oder 2 Einzeltest haben schon Hinweise gegeben, dass Python Code 100x bis 1000x langsamer ausgeführt wird als native Linux Executables per C(++), mich würde einmal interessieren, wie das über eine größere Bandbreite verschiedener Tests aussieht.

shedepe · 17.07.2018, 08:22

Also nur mal vom theoretischen Standpunkt aus gesehen: Wenn du z.B. in einer for-Schleife in Python etwas rechnest, dann sollte das kaum langsamer sein, als wenn du das in C++ ausführst. Am Schluss ist es eben doch Binärcode, der auf der CPU ausgeführt wird. Voraussetzung: Du führst den Code mehr als einmal aus -> Python wird auf vielen Systemen standardmäßig beim ersten Ausführen erst mal in Bytecode kompiliert und dann erst vom Interpreter ausgeführt.
Wäre auch mein Tipp noch -> Wenn du wirklich gut benchmarken willst und nicht z.B. auf x86 Pech haben willst, dass gerade Speicher kopiert wurde o.ä., führe jede Funktion am besten mehrere 100 Mal aus und berechne die maximale Zeit, die minimale Zeit und die Durchschnittszeit. Für einen Atmega sollte das natürlich keine Rolle spielen, weil da sonst nicht so viel passiert bzw. die Architektur einfach genug ist.

HaWe · 17.07.2018, 09:32

Zitat von shedepe

Also nur mal vom theoretischen Standpunkt aus gesehen: Wenn du z.B. in einer for-Schleife in Python etwas rechnest, dann sollte das kaum langsamer sein, als wenn du das in C++ ausführst. Am Schluss ist es eben doch Binärcode, der auf der CPU ausgeführt wird. Voraussetzung: Du führst den Code mehr als einmal aus -> Python wird auf vielen Systemen standardmäßig beim ersten Ausführen erst mal in Bytecode kompiliert und dann erst vom Interpreter ausgeführt.
Wäre auch mein Tipp noch -> Wenn du wirklich gut benchmarken willst und nicht z.B. auf x86 Pech haben willst, dass gerade Speicher kopiert wurde o.ä., führe jede Funktion am besten mehrere 100 Mal aus und berechne die maximale Zeit, die minimale Zeit und die Durchschnittszeit. Für einen Atmega sollte das natürlich keine Rolle spielen, weil da sonst nicht so viel passiert bzw. die Architektur einfach genug ist.

hallo,
danke für die Tipps!
tatsächlich wird ja jede Schleife in jedem Sub-Test teilw. mehrere zig-tausend mal ausgeführt. Portieren nach Python kann ich es allerdings selber nicht, da ich absolut überhaupt keinen Schatten eines Schimmers von Python habe
- und dann soll der Code ja auch nicht nur irgendwie ähnlich sein, sondern bis ins Detail identische Routinen haben, soweit das die Python-Syntax auch nur möglich macht.
Das gilt unter besonderer Beachtung auch der Zeilen, in denen Variablen im C-Code als "volatile" deklariert wurden, um zu vermeiden, dass der Präprozessor oder der Compiler sie weg-optimiert (was ja ruckzuck passiert, da mit den mathematischen Ergebnissen nicht weitergerechnet wird)...

PS,
Für Java (JIT-Compiler) habe ich solche Tests auch schon mal gemacht, hier hat sich nach 3 Durchläufen nacheinander auch schon eine erhebliche Geschwindigkeitssteigerung gezeigt. Das bliebe jetzt auch für Python zu zeigen.
Die C(++) Compilate sind allerdings schon sehr, sehr stabil mit ihren runtimes.

shedepe · 18.07.2018, 12:45

Code:

// Integer multiply/divide benchmark (copy quoted in the discussion):
// 500,000 rounds of ten multiplications followed by ten divisions.
long test_Int_Mult() {
  int x,y;
  volatile long s;

  for(y=0;y<500000;++y) { // 500,000 rounds * 20 operations = 10,000,000 int mult/div
    s=1;
    for(x=1;x<=10;++x) { s*=x;} // builds 1*2*...*10 = 3628800
    for(x=10;x>0;--x) { s/=x;}  // divides it back down to 1

  }
  return s;
}

Würde vorschlagen, das Ergebnis der Funktion in eine Variable einzulesen und erst diese Variable als volatile zu deklarieren. Sonst verhinderst du eventuell Optimierungen in der Funktion, die sich auf die Laufzeit auswirken könnten. Dasselbe gilt auch für alle anderen Funktionen mit volatile-Variablen in der Funktion.

HaWe · 18.07.2018, 15:16

Zitat von shedepe

Code:

// Integer multiply/divide benchmark (copy quoted in the discussion):
// 500,000 rounds of ten multiplications followed by ten divisions.
long test_Int_Mult() {
  int x,y;
  volatile long s;

  for(y=0;y<500000;++y) { //    10,000,000 int mult/div
    s=1;
    for(x=1;x<=10;++x) { s*=x;}  // builds 1*2*...*10 = 3628800
    for(x=10;x>0;--x) { s/=x;}   // divides it back down to 1

  }
  return s;
}

Würde vorschlagen, das Ergebnis der Funktion in eine Variable einzulesen und erst diese Variable als volatile zu deklarieren. Sonst verhinderst du eventuell Optimierungen in der Funktion, die sich auf die Laufzeit auswirken könnten. Dasselbe gilt auch für alle anderen Funktionen mit volatile-Variablen in der Funktion.

ja, das stimmt, allerdings hatte ich das genau so eigentlich beabsichtigt, dass es keinesfalls optimiert werden sollte, weil ich unbedingt wollte, dass während der Laufzeit alle Schritte genau so, unverändert und unoptimiert vollständig berechnet werden:
Man sollte ein Maß haben, wieviel Zeit tatsächlich für eine definitiv durchzuführende Anzahl arithmetischer Operationen (hier eben vollständige explizite Integer Mult./Divis.) benötigt wird.

Hintergrund ist auch, dass bei "echten" Anwendungsprogrammen kaum immer so extrem viele kurze Operationen mit denselben Variablen in langen Schleifen durchgeführt werden, sondern diese eher durch if-statements, Jumps/Funktionsaufrufe und GPIO r/w sowie völlig andere Berechnungen mit ganz anderen Variablen unterbrochen sind. Dadurch ist es dann auch nicht unbedingt möglich, diese Operationen immer direkt auf den schnelleren Registern auszuführen, sondern stattdessen sind eher häufige RAM-Zugriffe erforderlich.
Genau diese sonst mögliche, schnellere Register-Arithmetik soll auch hier per volatile verhindert werden, abgesehen von kompletten weg-Optimierungen, weil die Rechenergebnisse weiterhin ungenutzt bleiben.