Benchmark result
AMD 1.4 Ghz ; gcc 3.1
typedef double real
- axpy
- matrix_vector
- matrix_matrix
- aat
- ata
mean calculation
mean in cache : This value corresponds to an average performed between 10 to 100 matrix
ranks (10 to 10000 vector sizes for axpy calculations)
mean out of cache : This value corresponds to an average performed between 300 to 1000 matrix
ranks (100000 to 1000000 vector sizes for axpy calculations)
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
ATL_dgemm(CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
ATL_sgemm(CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
ATL_dgemm(CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
ATL_sgemm(CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void axpy( real coef, const gene_vector & X, gene_vector & Y, int N)
{
ATL_daxpy(N,coef,X,1,Y,1);
}
static inline void axpy(float coef, const gene_vector & X, gene_vector & Y, int N)
{
ATL_saxpy(N,coef,X,1,Y,1);
}
ATLAS_comments
Although this library does not offer high abstraction level (not O.O programming) it is highly valuable since it
competes with vendor-BLAS library with a SINGLE interface and distribution. It is FREE and it allows to get rid of vendor-BLAS
library and increases the portability of application built on top of it.
|
typedef real * gene_matrix;
typedef real * gene_vector; |
ATLAS compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)
|
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
ATL_dgemm(CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);
}
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
ATL_sgemm(CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
ATL_dgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
ATL_sgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
}
static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A(i,k)*A(j,k);
}
X(i,j)=somme;
}
}
}
static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A(k,i)*A(k,j);
}
X(i,j)=somme;
}
}
}
static inline void axpy(const real coef, const gene_vector & X, gene_vector & Y, int N)
{
Y+=coef*X;
}
blitz_comments
This famous multi-dimensional array library have not been designed specificaly for linear algebra calculation.
Hence there are numerous ways of implementing BLA algorithms such as Matrix-Matrix product. two versions of MM product
have been evaluated. The first one makes use of Blitz condensed expression but is slower than the C-like implementation. Hope
that someone can bring a better one :-). Following J. Cummings comments, we are currently testing Blitz matrix containers and
tinyvector/matrix implementation. These results will be posted soon.
|
typedef blitz::Array < real , 2 > gene_matrix;
typedef blitz::Vector < real > gene_vector;
//typedef blitz::Array < real , 1 > gene_vector;
|
blitz compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)
|
static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A(i,k)*B(k,j);
}
X(i,j)=somme;
}
}
}
static inline void matrix_vector_product_slow( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
firstIndex i;
secondIndex j;
X = sum(A(i,j)*B(j),j);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
real somme;
for (int i=0 ; i < N ; i++){
somme = 0.0;
for (int j=0 ; j < N ; j++){
somme+=A(i,j)*B[j];
}
X[i]=somme;
}
}
static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A[i][k]*A[j][k];
}
X[i][j]=somme;
}
}
}
static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A[k][i]*A[k][j];
}
X[i][j]=somme;
}
}
}
static inline void axpy( real coef, const gene_vector & X, gene_vector & Y, int N)
{
for (int i=0;i < N;i++){
Y[i]+=coef*X[i];
}
}
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_dgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_sgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_sgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void axpy( real coef, const gene_vector & X, gene_vector & Y, int N)
{
cblas_daxpy(N,coef,X,1,Y,1);
}
static inline void axpy(float coef, const gene_vector & X, gene_vector & Y, int N)
{
cblas_saxpy(N,coef,X,1,Y,1);
}
C_BLAS_comments
typedef real * gene_matrix;
typedef real * gene_vector; |
C_BLAS compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)
|
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);
}
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
cblas_sgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
cblas_dgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
// cblas_sgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
cblas_sgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
}
C_comments
Trivial implementation based on C pointers.
|
typedef real * ptr_ real ;
typedef ptr_ real * gene_matrix;
typedef real * gene_vector;
|
C compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)
|
static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A[i][k]*B[k][j];
}
X[i][j]=somme;
}
}
}
static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
{
for (int i=0;i < N;i++){
real somme=0.0;
for (int j=0;j < N;j++){
somme+=A[i][j]*B[j];
}
X[i]=somme;
}
}
void daat_(double * A, double *X, int * N);
void saat_(float * A, float *X, int * N);
}
SUBROUTINE SAAT(A,X,N)
**
** X = AT * A
REAL*4 A(N,N),X(N,N)
DO 20 I=1,N
DO 20 J=1,N
R=0.
DO 10 K=1,N
R=R+A(I,K)*A(J,K)
10 CONTINUE
X(I,J)=R
20 CONTINUE
RETURN
END
SUBROUTINE DAAT(A,X,N)
**
** X = AT * A
REAL*8 A(N,N),X(N,N)
DO 20 I=1,N
DO 20 J=1,N
R=0.
DO 10 K=1,N
R=R+A(I,K)*A(J,K)
10 CONTINUE
X(I,J)=R
20 CONTINUE
RETURN
END
void data_(double * A, double *X, int * N);
void sata_(float * A, float *X, int * N);
}
SUBROUTINE SATA(A,X,N)
**
** X = AT * A
REAL*4 A(N,N),X(N,N)
DO 20 I=1,N
DO 20 J=1,N
R=0.
DO 10 K=1,N
R=R+A(K,I)*A(K,J)
10 CONTINUE
X(I,J)=R
20 CONTINUE
RETURN
END
SUBROUTINE DATA(A,X,N)
**
** X = AT * A
REAL*8 A(N,N),X(N,N)
DO 20 I=1,N
DO 20 J=1,N
R=0.
DO 10 K=1,N
R=R+A(K,I)*A(K,J)
10 CONTINUE
X(I,J)=R
20 CONTINUE
RETURN
END
static inline void axpy( real coef, const gene_vector & X, gene_vector & Y, int N)
{
int one=1;
daxpy_(&N,&coef,X,&one,Y,&one);
}
static inline void axpy(float coef, const gene_vector & X, gene_vector & Y, int N)
{
int one=1;
saxpy_(&N,&coef,X,&one,Y,&one);
}
SUBROUTINE DAXPY(N,A,X,IX,Y,IY)
REAL*8 X(1),Y(1)
REAL*8 A
NN=IY*(N-1)+1
J=1
DO 10 I=1,NN,IY
Y(I)=Y(I)+A*X(J)
10 J=J+IX
RETURN
END
SUBROUTINE SAXPY(N,A,X,IX,Y,IY)
DIMENSION X(1),Y(1)
NN=IY*(N-1)+1
J=1
DO 10 I=1,NN,IY
Y(I)=Y(I)+A*X(J)
10 J=J+IX
RETURN
END
f77_comments
A lot of people still consider fortran as THE scientific langage to be used to achieve high performance and a
comparison with it is unavoidable. Just like the C implementation, the f77 results depend strongly on the used compiler. g77 is
probably not the best optimization performer... We hope that some Fortran experts can improve our matrix-matrix implementation
since we expect that f77 performances should be at least as good as the C one. f77 containers
|
typedef real * gene_matrix;
typedef real * gene_vector; |
f77 compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)
F77FLAGS=-O3
$(F77) $(F77FLAGS) -c *.f
|
static inline void f77_interface < real > ::matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
dmxm_(A,&N,B,&N,X,&N);
}
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
smxm_(A,&N,B,&N,X,&N);
}
SUBROUTINE SMXM(A,N,B,M,C,L)
C
DIMENSION A(N,M),B(M,L),C(N,L)
DO 20 I=1,N
DO 20 J=1,L
R=0.
DO 10 K=1,M
R=R+A(I,K)*B(K,J)
10 CONTINUE
C(I,J)=R
20 CONTINUE
RETURN
END
SUBROUTINE DMXM(A,N,B,M,C,L)
C
REAL*8 A(N,M),B(M,L),C(N,L)
DO 20 I=1,N
DO 20 J=1,L
R=0.
DO 10 K=1,M
R=R+A(I,K)*B(K,J)
10 CONTINUE
C(I,J)=R
20 CONTINUE
RETURN
END
static inline void f77_interface < real > ::matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
dmxv_(A,&N,B,&N,X);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
smxv_(A,&N,B,&N,X);
}
SUBROUTINE SMXV(A,N,X,M,R)
C
REAL*4 X(1),R(1),A(N,M),S
DO 20 I=1,N
S=0.D0
DO 10 J=1,M
S=S+A(I,J)*X(J)
10 CONTINUE
R(I)=S
20 CONTINUE
RETURN
END
SUBROUTINE DMXV(A,N,X,M,R)
C
REAL*8 X(1),R(1),A(N,M),S
DO 20 I=1,N
S=0.D0
DO 10 J=1,M
S=S+A(I,J)*X(J)
10 CONTINUE
R(I)=S
20 CONTINUE
RETURN
END
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_dgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_sgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_sgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void axpy( real coef, const gene_vector & X, gene_vector & Y, int N)
{
cblas_daxpy(N,coef,X,1,Y,1);
}
static inline void axpy(float coef, const gene_vector & X, gene_vector & Y, int N)
{
cblas_saxpy(N,coef,X,1,Y,1);
}
INTEL_BLAS_comments
Vendor Blas distribution tuned for intel CPUs.
|
typedef real * gene_matrix;
typedef real * gene_vector; |
INTEL_BLAS compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)
|
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);
}
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
cblas_sgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
cblas_dgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
// cblas_sgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
cblas_sgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
}
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
scale(X,0.0);
mult(A,trans(A),X);
}
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
scale(X,0.0);
mult(trans(A),A,X);
}
static inline void axpy(const real coef, gene_vector & X, gene_vector & Y, int N)
{
add(scaled(X,coef),Y);
}
MTL_comments
This great library provides excellent results with appropriate compiler and installation. Unfortunately the results
with gcc are not very good. In addition our installation is not OK since we failed in using BLAIS computation kernels. We decide to
present our bad MTL results on this page to encourage contributors to help us to make a better use of this very promising tool.
|
typedef typename matrix < real ,
rectangle < > ,
dense < > ,
column_major > ::type gene_matrix;
typedef dense1D < real > gene_vector;
|
MTL compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)
|
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
scale(X,0.0);
mult(A,B,X);
// typedef typename gene_matrix ::shape Shape;
// matmat_mult(A,B,X,Shape());
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
scale(X,0.0);
mult(A,B,X);
}
static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A[i][k]*A[j][k];
}
X[i][j]=somme;
}
}
}
static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A[k][i]*A[k][j];
}
X[i][j]=somme;
}
}
}
static inline void axpy( real coef, const gene_vector & X, gene_vector & Y, int N)
{
for (int i=0;i < N;i++){
Y[i]+=coef*X[i];
}
}
STL_comments
Ok, Ok, our use of STL containers does not match the STL philosophy. The aim of the test is just to show that
replacing C pointers with STL vectors does not brings performance penalties.
|
typedef std::vector < real > stl_vector;
typedef std::vector < stl_vector > stl_matrix;
typedef stl_matrix gene_matrix;
typedef stl_vector gene_vector;
|
STL compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)
|
static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A[i][k]*B[k][j];
}
X[i][j]=somme;
}
}
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
real somme;
for (int i=0;i < N;i++){
somme=0.0;
for (int j=0;j < N;j++){
somme+=A[i][j]*B[j];
}
X[i]=somme;
}
}
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
// X = prod(A,trans(A));
X.assign(prod(A,trans(A)));
}
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
// X = prod(trans(A),A);
X.assign(prod(trans(A),A));
}
static inline void axpy_slow(const real coef, const gene_vector & X, gene_vector & Y, int N)
{
Y+=coef*X;
}
static inline void axpy(const real coef, const gene_vector & X, gene_vector & Y, int N)
{
Y.plus_assign(coef*X);
}
ublas_comments
This very promising library is younger than Blitz and MTL ones and combines technics from both. It provides
good performance with gcc ! Using uBlas in combination with BLAS (ATLAS) kernels could be a very efficient strategy...to be
tested.
|
typedef numerics::matrix < real > gene_matrix;
typedef numerics::vector < real > gene_vector;
|
ublas compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE) -DNDEBUG -finline-functions
|
static inline void matrix_matrix_product_slow( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
X = prod(A,B);
}
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
X.assign(prod(A,B));
}
static inline void matrix_vector_product_slow( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
X = prod(A,B);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
X.assign(prod(A,B));
}
home