.gear-rules | 2 + .../tags/59ae7dbbc04bf57b482668e1d112d4d98d4840aa | 13 ++ .gear/tags/list | 1 + README.ALT | 64 ++++++ atlas-run.sh | 3 + atlas.mk | 161 ++++++++++++++ atlas.spec | 226 ++++++++++++++++++++ bin/uumtst.c | 5 +- include/atlas_f77wrap.h | 2 + include/atlas_kernel2.h | 3 +- include/atlas_kernel3.h | 2 + include/atlas_lvl2.h | 2 + include/atlas_rblas3.h | 2 + include/atlas_reflvl2.h | 2 + include/atlas_reflvl3.h | 2 + include/contrib/ATL_gemv_ger_SSE.h | 3 + include/contrib/camm_dpa.h | 2 +- include/contrib/camm_util.h | 9 + interfaces/lapack/F77/src/ilaenv.f | 185 +++++++++++++++- src/blas/gemm/ATL_cmmJIK.c | 2 +- tune/blas/gemm/CASES/ATL_dmm4x4x2_US.c | 2 +- tune/blas/gemm/CASES/ATL_dmm6x1x60_sse2.c | 5 + tune/blas/gemv/CASES/ATL_cgemvN_1x1_1.c | 1 + tune/blas/gemv/CASES/ATL_cgemvN_1x1_1a.c | 1 + tune/blas/gemv/CASES/ATL_cgemvN_2x2_0.c | 1 + tune/blas/gemv/CASES/ATL_cgemvN_4x2_1.c | 1 + tune/blas/gemv/CASES/ATL_cgemvN_mm.c | 1 + tune/blas/gemv/CASES/ATL_cgemvT_1x1_1.c | 1 + tune/blas/gemv/CASES/ATL_cgemvT_2x2_0.c | 1 + tune/blas/gemv/CASES/ATL_cgemvT_2x4_1.c | 1 + tune/blas/gemv/CASES/ATL_cgemvT_mm.c | 1 + tune/blas/gemv/CASES/ATL_gemvN_dummy.c | 1 + tune/blas/gemv/CASES/ATL_gemvN_dummy2.c | 1 + tune/blas/gemv/CASES/ATL_gemvT_1x1_1.c | 1 + tune/blas/gemv/CASES/ATL_gemvT_2x16_1.c | 1 + tune/blas/gemv/CASES/ATL_gemvT_2x8_0.c | 1 + tune/blas/gemv/CASES/ATL_gemvT_4x16_1.c | 1 + tune/blas/gemv/CASES/ATL_gemvT_4x8_1.c | 1 + tune/blas/gemv/CASES/ATL_gemvT_dummy.c | 1 + tune/blas/gemv/CASES/ATL_gemvT_dummy2.c | 1 + tune/blas/gemv/CASES/ATL_gemvT_mm.c | 1 + tune/blas/gemv/CASES/ATL_gemv_SSE.c | 1 + tune/blas/level1/IAMAX/iamax_sse.c | 9 + 43 files changed, 714 insertions(+), 13 deletions(-) diff --git a/.gear-rules b/.gear-rules new file mode 100644 index 0000000..7cffa45 --- /dev/null +++ b/.gear-rules @@ -0,0 +1,2 @@ +tar: @version@:. name=atlas-@version@ base=ATLAS +diff: @version@:. . 
diff --git a/.gear/tags/59ae7dbbc04bf57b482668e1d112d4d98d4840aa b/.gear/tags/59ae7dbbc04bf57b482668e1d112d4d98d4840aa new file mode 100644 index 0000000..2b25533 --- /dev/null +++ b/.gear/tags/59ae7dbbc04bf57b482668e1d112d4d98d4840aa @@ -0,0 +1,13 @@ +object 4faefe4e1ccaa87c4ea4374640366febb71fe86a +type commit +tag 3.7.11 +tagger Alexey Tourbin <at@localhost.localdomain> 1164114354 +0300 + +3.7.11 +-----BEGIN PGP SIGNATURE----- +Version: GnuPG v1.4.5 (GNU/Linux) + +iD8DBQBFYvm0fBKgtDjnu0YRAgR4AKCHBBq84K9oIHmiBw0wJu5kc85R4QCgtppf +NEbZrsGYdsrVrf9EtdqNSF8= +=uIvo +-----END PGP SIGNATURE----- diff --git a/.gear/tags/list b/.gear/tags/list new file mode 100644 index 0000000..404c860 --- /dev/null +++ b/.gear/tags/list @@ -0,0 +1 @@ +59ae7dbbc04bf57b482668e1d112d4d98d4840aa 3.7.11 diff --git a/README.ALT b/README.ALT new file mode 100644 index 0000000..4ec3068 --- /dev/null +++ b/README.ALT @@ -0,0 +1,64 @@ +Automatically Tuned Linear Algebra Software +README.ALT by Alexey Tourbin + +The "libatlas" package provides the default BLAS library for ALT Linux +Sisyphus and derivative products. The BLAS library implements the +standard Fortran API for the Basic Linear Algebra Subprograms. It is +currently used by the R environment, python Numeric module, and by GNU +Octave (the latter has not been really packaged yet). + +The package also provides the "cblas" library, which implements the +standard API for calling BLAS from C. It also provides a subset of +LAPACK routines, which is used by the LAPACK library (the LAPACK library +is packaged separately). + +The BLAS and cblas libraries wrap the ATLAS library (libatlas), which +provides a highly efficient implementation of lower-level internal +routines. The ATLAS library performs compile-time optimizations for the +host system CPU, i.e. the code is automatically tuned for the current +CPU so as to make it run faster. 
Therefore, strictly speaking, the +prebuilt ATLAS library from this package is likely to exhibit suboptimal +performance on your CPU. However, I have done some research to ensure +that nearly optimal performance is actually possible. + +For the i586 platform, the package provides two different copies of +the ATLAS library: /usr/lib/libatlas.so* and /usr/lib/sse2/libatlas.so*. +The former library is generic; it uses only modest optimizations for +relatively old CPUs. However, compared to the plain Fortran BLAS +implementation, it gives a serious performance boost, usually by +a factor of 3. The latter of these libraries is for modern CPUs: it +makes use of SSE prefetch instructions, and it is also optimized for +a larger L1 cache size. Compared to the former library, it gives a further +50-70% speedup. Note that the SSE2-enabled library is picked up by the +dynamic linker automatically, whenever the host processor is capable of +SSE2 instructions, based on "hwcap" information from the Linux kernel. + +Further research showed that full-fledged tuning for the host system CPU +can add only about 10-20% more performance. + +Here are some numbers for my Athlon64 3200+ CPU, kindly donated by +Dmitry V. Levin. I use the following shell command to measure GEMM +(general matrix multiplication) performance. + +R -q --vanilla <<<'mm <- matrix(rnorm(10^6), ncol = 10^3); system.time(crossprod(mm))' + +With Fortran reference implementation: +[1] 1.572 0.004 1.580 0.000 0.000 + +With /usr/lib/libatlas.so.3: +[1] 0.532 0.020 0.552 0.000 0.000 + +With /usr/lib/sse2/libatlas.so.3: +[1] 0.312 0.020 0.334 0.000 0.000 + +With GotoBLAS (another highly efficient yet non-free BLAS implementation): +[1] 0.296 0.008 0.301 0.000 0.000 + +So I hope that the benefits of prepackaged (and pretested) software can +outweigh a minor CPU-specific performance loss in most cases. + +Also note that, at the moment, this package does not provide SMP-enabled +BLAS. 
I do not believe that simply enabling SMP would easily yield a +considerable performance boost (and only level3 BLAS routines can +exploit parallelism anyway). Please let me know if you think that you +can benefit from parallel ATLAS (or if you can donate a cluster). diff --git a/atlas-run.sh b/atlas-run.sh new file mode 100755 index 0000000..012801e --- /dev/null +++ b/atlas-run.sh @@ -0,0 +1,3 @@ +#!/bin/sh +cmd=$1/$2; shift 2 +$cmd $* diff --git a/atlas.mk b/atlas.mk new file mode 100644 index 0000000..e918593 --- /dev/null +++ b/atlas.mk @@ -0,0 +1,161 @@ +# ----------------------------- +# Make.ARCH for ATLAS3.7.11 +# ----------------------------- + +# ---------------------------------- +# Make sure we get the correct shell +# ---------------------------------- + SHELL = /bin/sh + +# ------------------------------------------------- +# Name indicating the platform to configure BLAS to +# ------------------------------------------------- + ARCH = $(arch) + +# ------------------- +# Various directories +# ------------------- + TOPdir = $(RPM_BUILD_DIR)/ATLAS + INCdir = $(TOPdir)/include/$(ARCH) + SYSdir = $(TOPdir)/tune/sysinfo/$(ARCH) + GMMdir = $(TOPdir)/src/blas/gemm/$(ARCH) + UMMdir = $(GMMdir) + GMVdir = $(TOPdir)/src/blas/gemv/$(ARCH) + GR1dir = $(TOPdir)/src/blas/ger/$(ARCH) + L1Bdir = $(TOPdir)/src/blas/level1/$(ARCH) + L2Bdir = $(TOPdir)/src/blas/level2/$(ARCH) + L3Bdir = $(TOPdir)/src/blas/level3/$(ARCH) + TSTdir = $(TOPdir)/src/testing/$(ARCH) + AUXdir = $(TOPdir)/src/auxil/$(ARCH) + CBLdir = $(TOPdir)/interfaces/blas/C/src/$(ARCH) + FBLdir = $(TOPdir)/interfaces/blas/F77/src/$(ARCH) + BINdir = $(TOPdir)/bin/$(ARCH) + LIBdir = $(TOPdir)/lib/$(ARCH) + PTSdir = $(TOPdir)/src/pthreads + MMTdir = $(TOPdir)/tune/blas/gemm/$(ARCH) + MVTdir = $(TOPdir)/tune/blas/gemv/$(ARCH) + R1Tdir = $(TOPdir)/tune/blas/ger/$(ARCH) + L1Tdir = $(TOPdir)/tune/blas/level1/$(ARCH) + L3Tdir = $(TOPdir)/tune/blas/level3/$(ARCH) + +# 
--------------------------------------------------------------------- +# Name and location of scripts for running executables during tuning +# --------------------------------------------------------------------- + ATLRUN = $(BINdir)/ATLrun.sh + ATLFWAIT = $(BINdir)/xatlas_waitfile + +# --------------------- +# Libraries to be built +# --------------------- + ATLASlib = $(LIBdir)/libatlas.a + CBLASlib = $(LIBdir)/libcblas.a + F77BLASlib = $(LIBdir)/libblas.a + LAPACKlib = $(LIBdir)/liblapack_atlas.a + + TESTlib = $(LIBdir)/libtstatlas.a +# ------------------------------------------- +# Upper bound on largest cache size, in bytes +# ------------------------------------------- + L2SIZE = -DL2SIZE=1048576 + +# --------------------------------------- +# Command setting up correct include path +# --------------------------------------- + INCLUDES = -I$(TOPdir)/include -I$(TOPdir)/include/$(ARCH) \ + -I$(TOPdir)/include/contrib + +# ------------------------------------------- +# Defines for setting up F77/C interoperation +# ------------------------------------------- + F2CDEFS = -DAdd_ -DStringSunStyle -fPIC + +# -------------------------------------- +# Special defines for user-supplied GEMM +# -------------------------------------- + UMMDEFS = + +# ------------------------------ +# Architecture identifying flags +# ------------------------------ + ARCHDEFS = @ARCHDEFS@ + +# ------------------------------------------------------------------- +# NM is the flag required to name a compiled object/executable +# OJ is the flag required to compile to object rather than executable +# These flags are used by all compilers. +# ------------------------------------------------------------------- + NM = -o + OJ = -c + + +# --------------------------------------------------------------------------- +# Fortran 77 compiler and the flags to use. 
Presently, ATLAS does not itself +# use any Fortran 77, but vendor BLAS are typically written for Fortran, so +# any links that include non-ATLAS BLAS will use FLINKER instead of CLINKER +# --------------------------------------------------------------------------- + F77 = g77 + F77FLAGS = $(RPM_OPT_FLAGS) + FLINKER = $(F77) + FLINKFLAGS = $(F77FLAGS) + FCLINKFLAGS = $(FLINKFLAGS) + + +# --------------------------------------------------------------------------- +# Various C compilers, and the linker to be used when we are not linking in +# non-ATLAS BLAS (which usually necessitate using the Fortran linker). +# The C compilers recognized by ATLAS are: +# CC : Compiler to use to compile regular, non-generated code +# MCC : Compiler to use to compile generated, highly-optimized code +# XCC : Compiler to be used on the compile engine of a cross-compiler +# These will typically all be the same. An example of where this is not +# the case would be DEC ALPHA 21164, where you want to use gcc for MCC, +# because DEC's cc does not allow the programmer access to all 32 floating +# point registers. However, on normal C code, DEC's cc produces much faster +# code than gcc, so you CC set to cc. Of course, any system where you are +# cross-compiling, you will need to set XCC differently than CC & MCC. 
+# --------------------------------------------------------------------------- + CDEFS = $(L2SIZE) $(INCLUDES) $(F2CDEFS) $(ARCHDEFS) + + GOODGCC = gcc + CC = gcc + CCFLAG0 = $(RPM_OPT_FLAGS) + CCFLAGS = $(CDEFS) $(CCFLAG0) + MCC = gcc + MMFLAGS = $(RPM_OPT_FLAGS) + XCC = gcc + XCCFLAGS = $(CDEFS) $(RPM_OPT_FLAGS) + CLINKER = $(CC) + CLINKFLAGS = $(CCFLAGS) + BC = $(CC) + BCFLAGS = $(CCFLAGS) + ARCHIVER = ar + ARFLAGS = r + RANLIB = echo + +# ------------------------------------- +# tar, gzip, gunzip, and parallel make +# ------------------------------------- + TAR = /bin/tar + GZIP = /bin/gzip + GUNZIP = /bin/gunzip + PMAKE = $(MAKE) -j 2 + +# ------------------------------------ +# Reference and system libraries +# ------------------------------------ + BLASlib = + FBLASlib = + FLAPACKlib = + LIBS = -lm + +# ---------------------------------------------------------- +# ATLAS install resources (include arch default directories) +# ---------------------------------------------------------- + ARCHDEF = @ARCHDEF@ + MMDEF = @MMDEF@ + INSTFLAGS = + +# --------------------------------------- +# Generic targets needed by all makefiles +# --------------------------------------- +waitfile: diff --git a/atlas.spec b/atlas.spec new file mode 100644 index 0000000..1d47c4b --- /dev/null +++ b/atlas.spec @@ -0,0 +1,226 @@ +Name: atlas +Version: 3.7.11 +Release: alt5 + +Summary: Automatically Tuned Linear Algebra Software (the BLAS library) +License: BSD +Group: System/Libraries + +URL: http://math-atlas.sourceforge.net +Source: %name-%version.tar +Patch: %name-%version-%release.patch + +ExclusiveArch: %ix86 amd64 x86_64 + +# Automatically added by buildreq on Fri Dec 08 2006 +BuildRequires: gcc-fortran + +%description +The ATLAS (Automatically Tuned Linear Algebra Software) project is an +ongoing research effort focusing on applying empirical techniques in +order to provide portable performance. 
At present, it provides C and +Fortran77 interfaces to a portably efficient BLAS implementation, as +well as a few routines from LAPACK. + +%prep +%setup -q -n ATLAS +%patch -p1 +ln -s atlas.mk Make.Linux_i586 +ln -s atlas.mk Make.Linux_sse2 +ln -s atlas.mk Make.Linux_amd64 +chmod +x atlas-run.sh +ln -s ../atlas-run.sh CONFIG/ATLrun.Linux_i586 +ln -s ../atlas-run.sh CONFIG/ATLrun.Linux_sse2 +ln -s ../atlas-run.sh CONFIG/ATLrun.Linux_amd64 + +%build +%add_optflags -fPIC -falign-loops=4 +%add_optflags -mfpmath=387 +%define soffix .so.3 +shared() +{ + lib=$1; shift + ${linker:-gcc} -shared -Wl,--whole-archive $lib.a -Wl,--no-whole-archive \ + -o $lib%soffix -Wl,-soname=$lib%soffix "$@" -lm -Wl,-z,defs +} +all_shared() +{ + cd $1 + linker=gcc shared libatlas + linker=gcc shared libcblas ./libatlas%soffix + linker=g77 shared libblas ./libatlas%soffix + linker=g77 shared liblapack_atlas ./libatlas%soffix ./libblas%soffix ./libcblas%soffix + cd - +} + +%ifarch %ix86 +make killall startup arch=Linux_i586 +make install sanity_test arch=Linux_i586 ARCHDEFS='-DATL_OS_Linux -DATL_GAS_x8632' \ + MMDEF=$PWD/CONFIG/ARCHS/PII/gcc/gemm ARCHDEF=$PWD/CONFIG/ARCHS/PII/gcc/misc +all_shared lib/Linux_i586 +make killall startup arch=Linux_sse2 +make install sanity_test arch=Linux_sse2 ARCHDEFS='-DATL_OS_Linux -DATL_GAS_x8632 -DATL_ARCH_HAMMER32 -DATL_SSE1 -DATL_SSE2' \ + MMDEF=$PWD/CONFIG/ARCHS/HAMMER32SSE2/gcc/gemm ARCHDEF=$PWD/CONFIG/ARCHS/HAMMER32SSE2/gcc/misc +all_shared lib/Linux_sse2 +%else +make killall startup arch=Linux_amd64 +make install sanity_test arch=Linux_amd64 ARCHDEFS='-DATL_OS_Linux -DATL_GAS_x8664 -DATL_ARCH_HAMMER64 -DATL_SSE1 -DATL_SSE2 -DATL_USE64BITS' \ + MMDEF=$PWD/CONFIG/ARCHS/HAMMER64SSE2/gcc/gemm ARCHDEF=$PWD/CONFIG/ARCHS/HAMMER64SSE2/gcc/misc +all_shared lib/Linux_amd64 +%endif + +%install +mkdir -p %buildroot%_libdir %buildroot%_includedir/atlas +cp -p include/*.h %buildroot%_includedir/atlas/ +ln -s atlas/cblas.h %buildroot%_includedir/cblas.h +ln -s 
atlas/clapack.h %buildroot%_includedir/clapack.h + +%ifarch %ix86 +cp -p include/Linux_i586/atlas*.h %buildroot%_includedir/atlas/ +cp -p lib/Linux_i586/lib*%soffix %buildroot%_libdir/ +cp -p lib/Linux_i586/lib*.a %buildroot%_libdir/ +mkdir %buildroot%_libdir/sse2 +cp -p lib/Linux_sse2/libatlas%soffix %buildroot%_libdir/sse2/ +%else +cp -p include/Linux_amd64/atlas*.h %buildroot%_includedir/atlas/ +cp -p lib/Linux_amd64/lib*%soffix %buildroot%_libdir/ +cp -p lib/Linux_amd64/lib*.a %buildroot%_libdir/ +%endif + +ln -s libatlas%soffix %buildroot%_libdir/libatlas.so +ln -s libcblas%soffix %buildroot%_libdir/libcblas.so +ln -s libblas%soffix %buildroot%_libdir/libblas.so +ln -s libblas%soffix %buildroot%_libdir/libf77blas.so +ln -s liblapack_atlas%soffix %buildroot%_libdir/liblapack_atlas.so + +mv %buildroot%_libdir/libatlas{,_}.a +echo 'GROUP(%_libdir/libatlas_.a -lm)' >%buildroot%_libdir/libatlas.a +mv %buildroot%_libdir/libcblas{,_}.a +echo 'GROUP(%_libdir/libcblas_.a %_libdir/libatlas.a)' >%buildroot%_libdir/libcblas.a +mv %buildroot%_libdir/libblas{,_}.a +echo 'GROUP(%_libdir/libblas_.a %_libdir/libatlas.a -lgfortran)' >%buildroot%_libdir/libblas.a +ln -s libblas.a %buildroot%_libdir/libf77blas.a +mv %buildroot%_libdir/liblapack_atlas{,_}.a +echo 'GROUP(%_libdir/liblapack_atlas_.a %_libdir/libcblas.a %_libdir/libblas.a)' >%buildroot%_libdir/liblapack_atlas.a + +%define pkgdocdir %_docdir/atlas-3.7 +mkdir -p %buildroot%pkgdocdir +cp -p README.ALT %buildroot%pkgdocdir/ +cp -p doc/AtlasCredits.txt doc/ChangeLog doc/LibReadme.txt %buildroot%pkgdocdir/ +cp -p doc/atlas_{contrib,devel,over}.ps doc/cblas.ps %buildroot%pkgdocdir/ +gzip -9nf %buildroot%pkgdocdir/*.ps +%ifarch %ix86 +cp -p bin/Linux_i586/INSTALL_LOG/SUMMARY.LOG %buildroot%pkgdocdir/SUMMARY.LOG +cp -p bin/Linux_sse2/INSTALL_LOG/SUMMARY.LOG %buildroot%pkgdocdir/SUMMARY.LOG.SSE2 +%else +cp -p bin/Linux_amd64/INSTALL_LOG/SUMMARY.LOG %buildroot%pkgdocdir/SUMMARY.LOG +%endif + +%package -n libatlas +Summary: 
Automatically Tuned Linear Algebra Software (the BLAS library) +Group: System/Libraries +Conflicts: liblapack < 1:3.0-alt3 + +%description -n libatlas +The ATLAS (Automatically Tuned Linear Algebra Software) project is an +ongoing research effort focusing on applying empirical techniques in +order to provide portable performance. At present, it provides C and +Fortran77 interfaces to a portably efficient BLAS implementation, as +well as a few routines from LAPACK. + +%files -n libatlas +%_libdir/lib*%soffix +%ifarch %ix86 +%dir %_libdir/sse2 +%_libdir/sse2/libatlas%soffix +%endif +%dir %pkgdocdir +%pkgdocdir/README.ALT +%pkgdocdir/*.txt +%pkgdocdir/ChangeLog +%pkgdocdir/SUMMARY.LOG* + +%post -n libatlas -p %post_ldconfig +%postun -n libatlas -p %post_ldconfig + +%package -n libatlas-devel +Summary: Automatically Tuned Linear Algebra Software (the BLAS library) +Group: Development/Other +Requires: libatlas = %version-%release + +%description -n libatlas-devel +The ATLAS (Automatically Tuned Linear Algebra Software) project is an +ongoing research effort focusing on applying empirical techniques in +order to provide portable performance. At present, it provides C and +Fortran77 interfaces to a portably efficient BLAS implementation, as +well as a few routines from LAPACK. + +%files -n libatlas-devel +%_libdir/*.so +%_includedir/cblas.h +%_includedir/clapack.h +%dir %_includedir/atlas +%_includedir/atlas/*.h + +%package -n libatlas-devel-static +Summary: Automatically Tuned Linear Algebra Software (the BLAS library) +Group: Development/Other +Requires: libatlas-devel = %version-%release +Requires: libgfortran-devel-static + +%description -n libatlas-devel-static +The ATLAS (Automatically Tuned Linear Algebra Software) project is an +ongoing research effort focusing on applying empirical techniques in +order to provide portable performance. 
At present, it provides C and +Fortran77 interfaces to a portably efficient BLAS implementation, as +well as a few routines from LAPACK. + +%files -n libatlas-devel-static +%_libdir/lib*.a +%exclude %_libdir/libtstatlas.a + +%package doc +Summary: Automatically Tuned Linear Algebra Software (the BLAS library) +Group: Development/Other +Requires: libatlas = %version-%release + +%description doc +The ATLAS (Automatically Tuned Linear Algebra Software) project is an +ongoing research effort focusing on applying empirical techniques in +order to provide portable performance. At present, it provides C and +Fortran77 interfaces to a portably efficient BLAS implementation, as +well as a few routines from LAPACK. + +%files doc +%dir %pkgdocdir +%pkgdocdir/*.ps.gz + +%changelog +* Mon Oct 29 2007 Alexey Tourbin <at@altlinux.ru> 3.7.11-alt5 +- packaged static libraries (requested by Pavel A. Piminov); + use e.g. "g77 -static test.f -lblas" for static linkage +- changed src.rpm packaging to keep separate upstream tarball + +* Tue Jan 09 2007 Alexey Tourbin <at@altlinux.ru> 3.7.11-alt4 +- backported bugfix for "complex C = A A' bug" +- backported fix for ilaenv.f which improves LAPACK performance + +* Sun Dec 10 2006 Alexey Tourbin <at@altlinux.ru> 3.7.11-alt3 +- compiled with -mfpmath=387 to fix LAPACK test suite on x86_64 +- actually packaged README.ALT + +* Fri Dec 08 2006 Alexey Tourbin <at@altlinux.ru> 3.7.11-alt2 +- imported sources into git and built with gear +- removed PRM_OPT_FLAGS hack that broke SSE2 +- enabled /usr/lib/sse2/libatlas.so.3 for i586 +- libatlas.so.3, libblas.so.3: made some internal functions hidden +- packaged docs (atlas-doc package has postscript documentation) +- added README.ALT, which explains some performance issues + +* Tue Jun 06 2006 Alexey Tourbin <at@altlinux.ru> 3.7.11-alt1 +- initial revision, with debian fixes +- PII-optimized BLAS (for i586) yields three-fold improvement + over plain fortran BLAS (matrix cross-product in R) +- 
SSE2-optimized /usr/lib/sse2/libatlas.so disabled by default + because of segfaults (try to rebuild `--with sse2' and let me know) +- HAMMER64SSE2 for x86_64 diff --git a/bin/uumtst.c b/bin/uumtst.c index 9940e80..c393106 100644 --- a/bin/uumtst.c +++ b/bin/uumtst.c @@ -59,13 +59,14 @@ static void test_lauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, ATL_lauum(Maj_, Uplo_, N_, A_, lda_) #endif +static void ATL_L2GE(const enum CBLAS_ORDER, const int, const TYPE*, + const int, TYPE*, const int); + static void ATL_U2GE (const enum CBLAS_ORDER Order, const int N, const TYPE *U, const int ldu, TYPE *C, const int ldc) { int j; - static void ATL_L2GE(const enum CBLAS_ORDER, const int, const TYPE*, - const int, TYPE*, const int); if (Order == CblasRowMajor) ATL_L2GE(CblasColMajor, N, U, ldu, C, ldc); else diff --git a/include/atlas_f77wrap.h b/include/atlas_f77wrap.h index 5a849df..e77b193 100644 --- a/include/atlas_f77wrap.h +++ b/include/atlas_f77wrap.h @@ -895,6 +895,7 @@ * Prototypes for F77 interface wrappers ATLAS BLAS routines * ===================================================================== */ +#pragma GCC visibility push(hidden) void Mjoin( PATLF77WRAP, rotg ) ( TYPE *, TYPE *, TYPE *, TYPE * ); #ifdef TREAL @@ -1085,6 +1086,7 @@ void Mjoin( PATLF77WRAP, trsm ) F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER *, TYPE *, F77_INTEGER * ); +#pragma GCC visibility pop #endif /* * End of atlas_f77wrap.h diff --git a/include/atlas_kernel2.h b/include/atlas_kernel2.h index fc7985b..f1bca28 100644 --- a/include/atlas_kernel2.h +++ b/include/atlas_kernel2.h @@ -80,6 +80,7 @@ * Recursive Level 2 BLAS function prototypes * ===================================================================== */ +#pragma GCC visibility push(hidden) void ATL_strsvLTU ( const int, @@ -5404,7 +5405,7 @@ void ATL_zgbmvN_a1_x1_b0_y1 double *, const int ); - +#pragma GCC visibility pop #endif /* * End of atlas_kernel2.h diff --git a/include/atlas_kernel3.h 
b/include/atlas_kernel3.h index b3f666d..a8b589d 100644 --- a/include/atlas_kernel3.h +++ b/include/atlas_kernel3.h @@ -33,6 +33,7 @@ /* * Real level 3 kernels */ +#pragma GCC visibility push(hidden) void ATL_ssymmRU (const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc); @@ -1390,4 +1391,5 @@ void ATL_ztrsm0RUCU (const int M, const int N, const double *alpha, const double *A, const int lda, double *C, const int ldc); +#pragma GCC visibility pop #endif diff --git a/include/atlas_lvl2.h b/include/atlas_lvl2.h index 69b1b54..a56df27 100644 --- a/include/atlas_lvl2.h +++ b/include/atlas_lvl2.h @@ -36,6 +36,7 @@ /* * Real kernels */ +#pragma GCC visibility push(hidden) void ATL_sger1_a1_x1_yX (const int M, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A, const int lda); @@ -291,4 +292,5 @@ void ATL_zgemvN_a1_x1_b0_y1 const int lda, const double *X, const int incX, const double *beta, double *Y, const int incY); +#pragma GCC visibility pop #endif diff --git a/include/atlas_rblas3.h b/include/atlas_rblas3.h index 9aac8e0..ca3d563 100644 --- a/include/atlas_rblas3.h +++ b/include/atlas_rblas3.h @@ -202,6 +202,7 @@ typedef void (*RC3_FUN_TRSM_T) * Level 3 recursive BLAS internal function prototypes * ===================================================================== */ +#pragma GCC visibility push(hidden) void ATL_sgemmTN_RB ( const int, const int, const int, const void *, const void *, const int, const void *, const int, @@ -471,6 +472,7 @@ void ATL_rtrsmLLN const void *, const int, void *, const int, const int ); +#pragma GCC visibility pop #endif /* * End of atlas_rblas3.h diff --git a/include/atlas_reflvl2.h b/include/atlas_reflvl2.h index e64f90d..0697403 100644 --- a/include/atlas_reflvl2.h +++ b/include/atlas_reflvl2.h @@ -53,6 +53,7 @@ * Prototypes for Level 2 Reference Internal ATLAS BLAS routines * 
===================================================================== */ +#pragma GCC visibility push(hidden) void ATL_srefgbmvN ( const int, const int, @@ -3181,6 +3182,7 @@ void ATL_zreftrsvUHU double *, const int ); +#pragma GCC visibility pop #endif /* * End of atlas_reflvl2.h diff --git a/include/atlas_reflvl3.h b/include/atlas_reflvl3.h index 8dfcfd0..0177cd9 100644 --- a/include/atlas_reflvl3.h +++ b/include/atlas_reflvl3.h @@ -53,6 +53,7 @@ * Prototypes for Level 3 Reference Internal ATLAS BLAS routines * ===================================================================== */ +#pragma GCC visibility push(hidden) void ATL_srefgemmNN ( const int, const int, const int, @@ -2289,6 +2290,7 @@ void ATL_zreftrsmRUCU double *, const int ); +#pragma GCC visibility pop #endif /* * End of atlas_reflvl3.h diff --git a/include/contrib/ATL_gemv_ger_SSE.h b/include/contrib/ATL_gemv_ger_SSE.h index 118d3de..10faf15 100644 --- a/include/contrib/ATL_gemv_ger_SSE.h +++ b/include/contrib/ATL_gemv_ger_SSE.h @@ -18,8 +18,11 @@ #include <stdio.h> #include <stdlib.h> +#include <string.h> /* for memset */ #include "camm_util.h" +#include "atlas_misc.h" +#include "atlas_lvl2.h" #ifndef GER #if defined(BETAX) || defined(BETAXI0) diff --git a/include/contrib/camm_dpa.h b/include/contrib/camm_dpa.h index af9c6b1..0912aee 100644 --- a/include/contrib/camm_dpa.h +++ b/include/contrib/camm_dpa.h @@ -1619,7 +1619,7 @@ MY_FUNCTION(aconst TYPE *a,int lda, #if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2)) ,"m" (w) #endif - :"ax","bx","cx","dx","si","di"); + :"ax","cx","dx","si","di"); } diff --git a/include/contrib/camm_util.h b/include/contrib/camm_util.h index 6b150d3..bcfcc99 100644 --- a/include/contrib/camm_util.h +++ b/include/contrib/camm_util.h @@ -316,6 +316,15 @@ typedef struct { #ifndef ATHLON +#ifdef SINGLE +#ifndef ATL_SSE1 +#error ATL_SSE1 is needed for this kernel +#endif +#else +#ifndef ATL_SSE2 +#error ATL_SSE2 is needed for this kernel +#endif +#endif #if 
defined(DREAL) || defined(DCPLX) #undef SSESUF diff --git a/interfaces/lapack/F77/src/ilaenv.f b/interfaces/lapack/F77/src/ilaenv.f index 4e4d350..2beb66e 100644 --- a/interfaces/lapack/F77/src/ilaenv.f +++ b/interfaces/lapack/F77/src/ilaenv.f @@ -111,6 +111,7 @@ CHARACTER*3 C3 CHARACTER*6 SUBNAM INTEGER I, IC, IZ, NB, NBMIN, NX + INTEGER MN, ATLNB * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, INT, MIN, REAL @@ -207,13 +208,10 @@ * ISPEC = 1: block size * * ATLAS sets default blocking factor to the internal blocking factor of -* GEMM. In routines such as QR where unblocked code addresses a panel -* (as opposed to a NBxNB block as in POTF2), this will often be too large -* to provide the optimal performance due to the slowness of the panel -* factorization. Asymptotically, however, you will want it at least -* this big. Therefore, we choose this NB as default since how much smaller -* an NB is required is very problem size and system architecture -* dependent. Note that the recursive routines provided by ATLAS +* GEMM. For particular routines, we then examine input parameters to +* see if we should reduce it (for instance, to ensure that we don't +* spend too much time in the unblocked level-2 portion of the algorithm). +* Note that the recursive routines provided by ATLAS * (presently, LU & Cholesky) do not need this value. * NB = 60 @@ -226,6 +224,179 @@ ELSE IF (DCPLX) THEN CALL ATL_F77WRAP_ZGETNB(NB) ENDIF + ATLNB = NB +* +* Find minimum dimension of problem: don't want NB bigger than that +* + MN = N1 + IF (N2 .GT. 0) THEN + IF (MN .GT. N2) MN = N2 + IF (N3 .GT. 0) THEN + IF (MN .GT. N3) MN = N3 + IF (N4 .GT. 0) THEN + IF (MN .GT. N4) MN = N4 + END IF + END IF + END IF +* +* Make sure NB is not close to size of entire dimension, unless smallest +* dimension is so small you might as well use unblocked code the whole way +* + IF (MN .LE. 3) THEN + NB = MN; + ELSE IF (NB*5 .GT. MN) THEN + IF (MN .LT. 16) THEN + IF (MN .LT. 
8) THEN + NB = 2 + ELSE + NB = 4 + END IF + ELSE + NB = (MN / 4) + IF (NB .GT. 4) THEN + NB = (NB/4)*4 + ENDIF + END IF + END IF + IF( C2.EQ.'GE' ) THEN + IF( C3.EQ.'TRF' ) THEN + NB = NB +* +* QR requires extra flops for blocking, so restrain total NB +* + ELSE IF( C3.EQ.'QRF' .OR. C3.EQ.'RQF' .OR. C3.EQ.'LQF' .OR. + $ C3.EQ.'QLF' ) THEN + IF (NB .GT. 80) THEN + IF( SNAME ) THEN + NB = 60 + ELSE + NB = 40 + END IF + END IF + ELSE IF( C3.EQ.'HRD' ) THEN + IF (NB .GT. 80) THEN + IF( SNAME ) THEN + NB = 60 + ELSE + NB = 40 + END IF + END IF + ELSE IF( C3.EQ.'BRD' ) THEN + IF (NB .GT. 80) THEN + IF( SNAME ) THEN + NB = 60 + ELSE + NB = 40 + END IF + END IF + ELSE IF( C3.EQ.'TRI' ) THEN + NB = NB + END IF + ELSE IF( C2.EQ.'PO' ) THEN + IF( C3.EQ.'TRF' ) THEN + NB = NB + END IF + ELSE IF( C2.EQ.'SY' ) THEN + IF( C3.EQ.'TRF' ) THEN + NB = NB + ELSE IF( SNAME .AND. C3.EQ.'TRD' ) THEN + IF (NB .GT. 80) THEN + IF( SNAME ) THEN + NB = 60 + ELSE + NB = 40 + END IF + END IF + ELSE IF( SNAME .AND. C3.EQ.'GST' ) THEN + NB = NB + END IF + ELSE IF( CNAME .AND. C2.EQ.'HE' ) THEN + IF( C3.EQ.'TRF' ) THEN + NB = NB + ELSE IF( C3.EQ.'TRD' ) THEN + IF (NB .GT. 80) THEN + IF( SNAME ) THEN + NB = 60 + ELSE + NB = 40 + END IF + END IF + ELSE IF( C3.EQ.'GST' ) THEN + NB = NB + END IF + ELSE IF( SNAME .AND. C2.EQ.'OR' ) THEN + IF( C3( 1:1 ).EQ.'G' ) THEN + IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR. + $ C4.EQ.'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR. + $ C4.EQ.'BR' ) THEN + IF (NB .GT. 80) NB = 60 + END IF + ELSE IF( C3( 1:1 ).EQ.'M' ) THEN + IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR. + $ C4.EQ.'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR. + $ C4.EQ.'BR' ) THEN + IF (NB .GT. 80) NB = 60 + END IF + END IF + ELSE IF( CNAME .AND. C2.EQ.'UN' ) THEN + IF( C3( 1:1 ).EQ.'G' ) THEN + IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR. + $ C4.EQ.'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR. + $ C4.EQ.'BR' ) THEN + IF (NB .GT. 
80) NB = 40 + END IF + ELSE IF( C3( 1:1 ).EQ.'M' ) THEN + IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR. + $ C4.EQ.'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR. + $ C4.EQ.'BR' ) THEN + IF (NB .GT. 80) NB = 40 + END IF + END IF + ELSE IF( C2.EQ.'GB' ) THEN + IF( C3.EQ.'TRF' ) THEN + IF( SNAME ) THEN + NB = NB + ELSE + NB = NB + END IF + END IF + ELSE IF( C2.EQ.'PB' ) THEN + IF( C3.EQ.'TRF' ) THEN + IF( SNAME ) THEN + IF( N2.LE.64 ) THEN + NB = NB + ELSE + NB = NB + END IF + ELSE + IF( N2.LE.64 ) THEN + NB = NB + ELSE + NB = NB + END IF + END IF + END IF + ELSE IF( C2.EQ.'TR' ) THEN + IF( C3.EQ.'TRI' ) THEN + IF( SNAME ) THEN + NB = NB + ELSE + NB = NB + END IF + END IF + ELSE IF( C2.EQ.'LA' ) THEN + IF( C3.EQ.'UUM' ) THEN + IF( SNAME ) THEN + NB = NB + ELSE + NB = NB + END IF + END IF + ELSE IF( SNAME .AND. C2.EQ.'ST' ) THEN + IF( C3.EQ.'EBZ' ) THEN + NB = 1 + END IF + END IF * ILAENV = NB RETURN diff --git a/src/blas/gemm/ATL_cmmJIK.c b/src/blas/gemm/ATL_cmmJIK.c index ed9ad75..e9231fa 100644 --- a/src/blas/gemm/ATL_cmmJIK.c +++ b/src/blas/gemm/ATL_cmmJIK.c @@ -190,7 +190,7 @@ int Mjoin(PATL,mmJIK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, pA = ATL_AlignPtr(vB); if (TA == AtlasNoTrans) Mjoin(PATL,row2blkT2_a1)(M, K, A, lda, pA, alpha); - else Mjoin(PATL,col2blk_a1)(K, M, A, lda, pA, alpha); + else Mjoin(PATL,col2blk2_a1)(K, M, A, lda, pA, alpha); /* * Can't write directly to C if alpha is not one */ diff --git a/tune/blas/gemm/CASES/ATL_dmm4x4x2_US.c b/tune/blas/gemm/CASES/ATL_dmm4x4x2_US.c index bdffd5e..ffb64f0 100644 --- a/tune/blas/gemm/CASES/ATL_dmm4x4x2_US.c +++ b/tune/blas/gemm/CASES/ATL_dmm4x4x2_US.c @@ -104,7 +104,7 @@ #define CSH 3 #define CMUL(arg_) arg_ #endif -if 1 +#if 1 #define prefR1(mem) prefetch mem, 0 #define prefR2(mem) prefetch mem, 0 #define prefW2(mem) prefetch mem, 2 diff --git a/tune/blas/gemm/CASES/ATL_dmm6x1x60_sse2.c b/tune/blas/gemm/CASES/ATL_dmm6x1x60_sse2.c index 5c39f5b..5b1b522 100644 --- 
a/tune/blas/gemm/CASES/ATL_dmm6x1x60_sse2.c +++ b/tune/blas/gemm/CASES/ATL_dmm6x1x60_sse2.c @@ -40,6 +40,11 @@ #if (NB/6)*6 != NB #error "NB must be multiple of 6!" #endif + +#ifndef ATL_Has3DNow + #define prefetchw prefetchnta +#endif + # # Integer register usage shown be these defines # diff --git a/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1.c b/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1.c index afb6ccc..334a394 100644 --- a/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1.c +++ b/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1.c @@ -31,6 +31,7 @@ #include "atlas_misc.h" #include "atlas_level2.h" #include "atlas_level1.h" +#include "atlas_lvl2.h" #ifdef Conj_ #define PEQ -= diff --git a/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1a.c b/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1a.c index 6329073..e4f0345 100644 --- a/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1a.c +++ b/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1a.c @@ -31,6 +31,7 @@ #include "atlas_misc.h" #include "atlas_level2.h" #include "atlas_level1.h" +#include "atlas_lvl2.h" #ifdef Conj_ #define PEQ -= diff --git a/tune/blas/gemv/CASES/ATL_cgemvN_2x2_0.c b/tune/blas/gemv/CASES/ATL_cgemvN_2x2_0.c index c351d1f..2ee4ba1 100644 --- a/tune/blas/gemv/CASES/ATL_cgemvN_2x2_0.c +++ b/tune/blas/gemv/CASES/ATL_cgemvN_2x2_0.c @@ -31,6 +31,7 @@ #include "atlas_misc.h" #include "atlas_level2.h" #include "atlas_level1.h" +#include "atlas_lvl2.h" #ifdef Conj_ #define PEQ -= diff --git a/tune/blas/gemv/CASES/ATL_cgemvN_4x2_1.c b/tune/blas/gemv/CASES/ATL_cgemvN_4x2_1.c index fa407f3..99520d6 100644 --- a/tune/blas/gemv/CASES/ATL_cgemvN_4x2_1.c +++ b/tune/blas/gemv/CASES/ATL_cgemvN_4x2_1.c @@ -31,6 +31,7 @@ #include "atlas_misc.h" #include "atlas_level2.h" #include "atlas_level1.h" +#include "atlas_lvl2.h" #include "atlas_prefetch.h" #ifdef Conj_ diff --git a/tune/blas/gemv/CASES/ATL_cgemvN_mm.c b/tune/blas/gemv/CASES/ATL_cgemvN_mm.c index cbae230..ed16aab 100644 --- a/tune/blas/gemv/CASES/ATL_cgemvN_mm.c +++ b/tune/blas/gemv/CASES/ATL_cgemvN_mm.c @@ -40,6 +40,7 @@ #define MEQ 
-= #endif +#include "atlas_lvl2.h" #include "atlas_lvl3.h" #ifdef Conj_ diff --git a/tune/blas/gemv/CASES/ATL_cgemvT_1x1_1.c b/tune/blas/gemv/CASES/ATL_cgemvT_1x1_1.c index 12256b2..53268c1 100644 --- a/tune/blas/gemv/CASES/ATL_cgemvT_1x1_1.c +++ b/tune/blas/gemv/CASES/ATL_cgemvT_1x1_1.c @@ -31,6 +31,7 @@ #include "atlas_misc.h" #include "atlas_level1.h" #include "atlas_level2.h" +#include "atlas_lvl2.h" #ifdef Conj_ #define PEQ -= diff --git a/tune/blas/gemv/CASES/ATL_cgemvT_2x2_0.c b/tune/blas/gemv/CASES/ATL_cgemvT_2x2_0.c index d617a8a..4cbf0fe 100644 --- a/tune/blas/gemv/CASES/ATL_cgemvT_2x2_0.c +++ b/tune/blas/gemv/CASES/ATL_cgemvT_2x2_0.c @@ -31,6 +31,7 @@ #include "atlas_misc.h" #include "atlas_level1.h" #include "atlas_level2.h" +#include "atlas_lvl2.h" #include "atlas_prefetch.h" #ifdef Conj_ diff --git a/tune/blas/gemv/CASES/ATL_cgemvT_2x4_1.c b/tune/blas/gemv/CASES/ATL_cgemvT_2x4_1.c index 1a1d257..5631903 100644 --- a/tune/blas/gemv/CASES/ATL_cgemvT_2x4_1.c +++ b/tune/blas/gemv/CASES/ATL_cgemvT_2x4_1.c @@ -31,6 +31,7 @@ #include "atlas_misc.h" #include "atlas_level1.h" #include "atlas_level2.h" +#include "atlas_lvl2.h" #include "atlas_prefetch.h" #ifdef Conj_ diff --git a/tune/blas/gemv/CASES/ATL_cgemvT_mm.c b/tune/blas/gemv/CASES/ATL_cgemvT_mm.c index 691b3a6..65cbbdc 100644 --- a/tune/blas/gemv/CASES/ATL_cgemvT_mm.c +++ b/tune/blas/gemv/CASES/ATL_cgemvT_mm.c @@ -31,6 +31,7 @@ #include "atlas_misc.h" #include "atlas_level1.h" #include "atlas_level2.h" +#include "atlas_lvl2.h" #include "atlas_lvl3.h" #ifdef Conj_ diff --git a/tune/blas/gemv/CASES/ATL_gemvN_dummy.c b/tune/blas/gemv/CASES/ATL_gemvN_dummy.c index 8627ff7..0366cde 100644 --- a/tune/blas/gemv/CASES/ATL_gemvN_dummy.c +++ b/tune/blas/gemv/CASES/ATL_gemvN_dummy.c @@ -30,6 +30,7 @@ #include "atlas_misc.h" #include "atlas_level2.h" +#include "atlas_lvl2.h" #define gemv0 Mjoin(Mjoin(Mjoin(Mjoin(gemvN,NM),_x1),BNM),_y1) #define gemvC Mjoin(Mjoin(Mjoin(Mjoin(gemvC,NM),_x1),BNM),_y1) #define gemvNc 
Mjoin(Mjoin(Mjoin(Mjoin(gemvNc,NM),_x1),BNM),_y1) diff --git a/tune/blas/gemv/CASES/ATL_gemvN_dummy2.c b/tune/blas/gemv/CASES/ATL_gemvN_dummy2.c index 5bee7b4..0ed1d67 100644 --- a/tune/blas/gemv/CASES/ATL_gemvN_dummy2.c +++ b/tune/blas/gemv/CASES/ATL_gemvN_dummy2.c @@ -30,6 +30,7 @@ #include "atlas_misc.h" #include "atlas_level2.h" +#include "atlas_lvl2.h" #ifdef Conj_ void Mjoin(Mjoin(Mjoin(Mjoin(Mjoin(PATL,gemvNc),NM),_x1),BNM),_y1) #else diff --git a/tune/blas/gemv/CASES/ATL_gemvT_1x1_1.c b/tune/blas/gemv/CASES/ATL_gemvT_1x1_1.c index d5b4ae2..a4e0c00 100644 --- a/tune/blas/gemv/CASES/ATL_gemvT_1x1_1.c +++ b/tune/blas/gemv/CASES/ATL_gemvT_1x1_1.c @@ -30,6 +30,7 @@ #include "atlas_misc.h" #include "atlas_level1.h" +#include "atlas_lvl2.h" #include "atlas_level2.h" #define gemv0 Mjoin(Mjoin(Mjoin(Mjoin(gemvT,NM),_x1),BNM),_y1) diff --git a/tune/blas/gemv/CASES/ATL_gemvT_2x16_1.c b/tune/blas/gemv/CASES/ATL_gemvT_2x16_1.c index 73c1535..9919bf4 100644 --- a/tune/blas/gemv/CASES/ATL_gemvT_2x16_1.c +++ b/tune/blas/gemv/CASES/ATL_gemvT_2x16_1.c @@ -31,6 +31,7 @@ #include "atlas_misc.h" #include "atlas_level1.h" #include "atlas_level2.h" +#include "atlas_lvl2.h" #include "atlas_prefetch.h" static void gemvT_Msmall(const int M, const int N, const TYPE *A, const int lda, diff --git a/tune/blas/gemv/CASES/ATL_gemvT_2x8_0.c b/tune/blas/gemv/CASES/ATL_gemvT_2x8_0.c index a822395..b953b57 100644 --- a/tune/blas/gemv/CASES/ATL_gemvT_2x8_0.c +++ b/tune/blas/gemv/CASES/ATL_gemvT_2x8_0.c @@ -31,6 +31,7 @@ #include "atlas_misc.h" #include "atlas_level1.h" #include "atlas_level2.h" +#include "atlas_lvl2.h" static void gemvT_Nsmall(const int M, const int N, const TYPE *A, const int lda, const TYPE *X, const SCALAR beta, TYPE *Y) diff --git a/tune/blas/gemv/CASES/ATL_gemvT_4x16_1.c b/tune/blas/gemv/CASES/ATL_gemvT_4x16_1.c index 9b7625d..74780ba 100644 --- a/tune/blas/gemv/CASES/ATL_gemvT_4x16_1.c +++ b/tune/blas/gemv/CASES/ATL_gemvT_4x16_1.c @@ -31,6 +31,7 @@ #include 
"atlas_misc.h" #include "atlas_level1.h" #include "atlas_level2.h" +#include "atlas_lvl2.h" #include "atlas_prefetch.h" static void gemvT_Nsmall(const int M, const int N, const TYPE *A, const int lda, diff --git a/tune/blas/gemv/CASES/ATL_gemvT_4x8_1.c b/tune/blas/gemv/CASES/ATL_gemvT_4x8_1.c index add1ee3..de00f75 100644 --- a/tune/blas/gemv/CASES/ATL_gemvT_4x8_1.c +++ b/tune/blas/gemv/CASES/ATL_gemvT_4x8_1.c @@ -30,6 +30,7 @@ #include "atlas_misc.h" #include "atlas_level1.h" +#include "atlas_lvl2.h" #include "atlas_level2.h" static void gemvT_Nsmall(const int M, const int N, const TYPE *A, const int lda, diff --git a/tune/blas/gemv/CASES/ATL_gemvT_dummy.c b/tune/blas/gemv/CASES/ATL_gemvT_dummy.c index 48b51cb..e813faa 100644 --- a/tune/blas/gemv/CASES/ATL_gemvT_dummy.c +++ b/tune/blas/gemv/CASES/ATL_gemvT_dummy.c @@ -30,6 +30,7 @@ #include "atlas_misc.h" #include "atlas_level2.h" +#include "atlas_lvl2.h" #define gemv0 Mjoin(Mjoin(Mjoin(Mjoin(gemvT,NM),_x1),BNM),_y1) #define gemvC Mjoin(Mjoin(Mjoin(Mjoin(gemvC,NM),_x1),BNM),_y1) #define gemvNc Mjoin(Mjoin(Mjoin(Mjoin(gemvNc,NM),_x1),BNM),_y1) diff --git a/tune/blas/gemv/CASES/ATL_gemvT_dummy2.c b/tune/blas/gemv/CASES/ATL_gemvT_dummy2.c index 39bcee6..901dc3f 100644 --- a/tune/blas/gemv/CASES/ATL_gemvT_dummy2.c +++ b/tune/blas/gemv/CASES/ATL_gemvT_dummy2.c @@ -30,6 +30,7 @@ #include "atlas_misc.h" #include "atlas_level2.h" +#include "atlas_lvl2.h" #ifdef Conj_ void Mjoin(Mjoin(Mjoin(Mjoin(Mjoin(PATL,gemvC),NM),_x1),BNM),_y1) #else diff --git a/tune/blas/gemv/CASES/ATL_gemvT_mm.c b/tune/blas/gemv/CASES/ATL_gemvT_mm.c index 8a9438f..a4af18d 100644 --- a/tune/blas/gemv/CASES/ATL_gemvT_mm.c +++ b/tune/blas/gemv/CASES/ATL_gemvT_mm.c @@ -31,6 +31,7 @@ #include "atlas_misc.h" #include "atlas_level1.h" #include "atlas_level2.h" +#include "atlas_lvl2.h" #include "atlas_lvl3.h" #define gemv0 Mjoin(Mjoin(Mjoin(Mjoin(gemvT,NM),_x1),BNM),_y1) diff --git a/tune/blas/gemv/CASES/ATL_gemv_SSE.c b/tune/blas/gemv/CASES/ATL_gemv_SSE.c 
index d74fe2f..233f743 100644 --- a/tune/blas/gemv/CASES/ATL_gemv_SSE.c +++ b/tune/blas/gemv/CASES/ATL_gemv_SSE.c @@ -32,6 +32,7 @@ #include <stdlib.h> #include "atlas_misc.h" +#include "atlas_lvl2.h" #include "camm_util.h" #ifndef ATL_GAS_x8632 diff --git a/tune/blas/level1/IAMAX/iamax_sse.c b/tune/blas/level1/IAMAX/iamax_sse.c index 344ba82..16973fc 100644 --- a/tune/blas/level1/IAMAX/iamax_sse.c +++ b/tune/blas/level1/IAMAX/iamax_sse.c @@ -1,5 +1,9 @@ #ifdef SREAL +#ifndef ATL_SSE1 +#error ATL_SSE1 needed for this kernel +#endif + #ifdef ATL_GAS_x8632 #define movq movl #define addq addl @@ -318,6 +322,11 @@ FA_NEWMAX: jmp FA_INC #else +#ifndef ATL_SSE2 +#error ATL_SSE2 needed for this kernel +#endif + + #ifdef ATL_GAS_x8632 #define movq movl #define addq addl