Sophie

Sophie

distrib > Altlinux > 4.1 > i586 > media > core-src > by-pkgid > 338876bc573fda35c173d6997fef3768 > files > 1

atlas-3.7.11-alt5.src.rpm

 .gear-rules                                        |    2 +
 .../tags/59ae7dbbc04bf57b482668e1d112d4d98d4840aa  |   13 ++
 .gear/tags/list                                    |    1 +
 README.ALT                                         |   64 ++++++
 atlas-run.sh                                       |    3 +
 atlas.mk                                           |  161 ++++++++++++++
 atlas.spec                                         |  226 ++++++++++++++++++++
 bin/uumtst.c                                       |    5 +-
 include/atlas_f77wrap.h                            |    2 +
 include/atlas_kernel2.h                            |    3 +-
 include/atlas_kernel3.h                            |    2 +
 include/atlas_lvl2.h                               |    2 +
 include/atlas_rblas3.h                             |    2 +
 include/atlas_reflvl2.h                            |    2 +
 include/atlas_reflvl3.h                            |    2 +
 include/contrib/ATL_gemv_ger_SSE.h                 |    3 +
 include/contrib/camm_dpa.h                         |    2 +-
 include/contrib/camm_util.h                        |    9 +
 interfaces/lapack/F77/src/ilaenv.f                 |  185 +++++++++++++++-
 src/blas/gemm/ATL_cmmJIK.c                         |    2 +-
 tune/blas/gemm/CASES/ATL_dmm4x4x2_US.c             |    2 +-
 tune/blas/gemm/CASES/ATL_dmm6x1x60_sse2.c          |    5 +
 tune/blas/gemv/CASES/ATL_cgemvN_1x1_1.c            |    1 +
 tune/blas/gemv/CASES/ATL_cgemvN_1x1_1a.c           |    1 +
 tune/blas/gemv/CASES/ATL_cgemvN_2x2_0.c            |    1 +
 tune/blas/gemv/CASES/ATL_cgemvN_4x2_1.c            |    1 +
 tune/blas/gemv/CASES/ATL_cgemvN_mm.c               |    1 +
 tune/blas/gemv/CASES/ATL_cgemvT_1x1_1.c            |    1 +
 tune/blas/gemv/CASES/ATL_cgemvT_2x2_0.c            |    1 +
 tune/blas/gemv/CASES/ATL_cgemvT_2x4_1.c            |    1 +
 tune/blas/gemv/CASES/ATL_cgemvT_mm.c               |    1 +
 tune/blas/gemv/CASES/ATL_gemvN_dummy.c             |    1 +
 tune/blas/gemv/CASES/ATL_gemvN_dummy2.c            |    1 +
 tune/blas/gemv/CASES/ATL_gemvT_1x1_1.c             |    1 +
 tune/blas/gemv/CASES/ATL_gemvT_2x16_1.c            |    1 +
 tune/blas/gemv/CASES/ATL_gemvT_2x8_0.c             |    1 +
 tune/blas/gemv/CASES/ATL_gemvT_4x16_1.c            |    1 +
 tune/blas/gemv/CASES/ATL_gemvT_4x8_1.c             |    1 +
 tune/blas/gemv/CASES/ATL_gemvT_dummy.c             |    1 +
 tune/blas/gemv/CASES/ATL_gemvT_dummy2.c            |    1 +
 tune/blas/gemv/CASES/ATL_gemvT_mm.c                |    1 +
 tune/blas/gemv/CASES/ATL_gemv_SSE.c                |    1 +
 tune/blas/level1/IAMAX/iamax_sse.c                 |    9 +
 43 files changed, 714 insertions(+), 13 deletions(-)

diff --git a/.gear-rules b/.gear-rules
new file mode 100644
index 0000000..7cffa45
--- /dev/null
+++ b/.gear-rules
@@ -0,0 +1,2 @@
+tar: @version@:. name=atlas-@version@ base=ATLAS
+diff: @version@:. .
diff --git a/.gear/tags/59ae7dbbc04bf57b482668e1d112d4d98d4840aa b/.gear/tags/59ae7dbbc04bf57b482668e1d112d4d98d4840aa
new file mode 100644
index 0000000..2b25533
--- /dev/null
+++ b/.gear/tags/59ae7dbbc04bf57b482668e1d112d4d98d4840aa
@@ -0,0 +1,13 @@
+object 4faefe4e1ccaa87c4ea4374640366febb71fe86a
+type commit
+tag 3.7.11
+tagger Alexey Tourbin <at@localhost.localdomain> 1164114354 +0300
+
+3.7.11
+-----BEGIN PGP SIGNATURE-----
+Version: GnuPG v1.4.5 (GNU/Linux)
+
+iD8DBQBFYvm0fBKgtDjnu0YRAgR4AKCHBBq84K9oIHmiBw0wJu5kc85R4QCgtppf
+NEbZrsGYdsrVrf9EtdqNSF8=
+=uIvo
+-----END PGP SIGNATURE-----
diff --git a/.gear/tags/list b/.gear/tags/list
new file mode 100644
index 0000000..404c860
--- /dev/null
+++ b/.gear/tags/list
@@ -0,0 +1 @@
+59ae7dbbc04bf57b482668e1d112d4d98d4840aa 3.7.11
diff --git a/README.ALT b/README.ALT
new file mode 100644
index 0000000..4ec3068
--- /dev/null
+++ b/README.ALT
@@ -0,0 +1,64 @@
+Automatically Tuned Linear Algebra Software
+README.ALT by Alexey Tourbin
+
+The "libatlas" package provides the default BLAS library for ALT Linux
+Sisyphus and derivative products.  The BLAS library implements the
+standard Fortran API for the Basic Linear Algebra Subprograms.  It is
+currently used by the R environment, python Numeric module, and by GNU
+Octave (the latter has not really been packaged yet).
+
+The package also provides the "cblas" library, which implements the
+standard API for calling BLAS from C.  It also provides a subset of
+LAPACK routines, which is used by the LAPACK library (the LAPACK library
+is packaged separately).
+
+The BLAS and cblas libraries wrap the ATLAS library (libatlas), which
+provides a highly efficient implementation of lower-level internal
+routines.  The ATLAS library performs compile-time optimizations for the
+host system CPU, i.e. the code is automatically tuned for the CPU it is
+built on so as to make it run faster.  Because this package is prebuilt
+on a different machine, its ATLAS library is, strictly speaking, likely
+to exhibit suboptimal performance on yours.  However, I have done some
+research to ensure that nearly optimal performance is actually possible.
+
+For the i586 platform, the package provides two different copies of
+the ATLAS library: /usr/lib/libatlas.so* and /usr/lib/sse2/libatlas.so*.
+The former library is generic; it uses only modest optimizations for
+relatively old CPUs.  However, compared to the plain Fortran BLAS
+implementation, it gives a serious performance boost, usually by
+a factor of 3.  The latter of these libraries is for modern CPUs: it
+makes use of the SSE prefetch instruction, and is also optimized for a
+larger L1 cache size.  Compared to the former library, it gives a further
+50-70% speedup.  Note that the SSE2-enabled library is picked up by the
+dynamic linker automatically, whenever the host processor is capable of
+SSE2 instructions, based on "hwcap" information from the Linux kernel.
+
+Further research showed that full-fledged tuning for the host system CPU
+can add only about 10-20% more performance.
+
+Here are some numbers for my Athlon64 3200+ CPU, kindly donated by
+Dmitry V. Levin.  I use the following shell command to measure GEMM
+(general matrix multiplication) performance.
+
+R -q --vanilla <<<'mm <- matrix(rnorm(10^6), ncol = 10^3); system.time(crossprod(mm))'
+
+With Fortran reference implementation:
+[1] 1.572 0.004 1.580 0.000 0.000
+
+With /usr/lib/libatlas.so.3:
+[1] 0.532 0.020 0.552 0.000 0.000
+
+With /usr/lib/sse2/libatlas.so.3:
+[1] 0.312 0.020 0.334 0.000 0.000
+
+With GotoBLAS (another highly efficient yet non-free BLAS implementation):
+[1] 0.296 0.008 0.301 0.000 0.000
+
+So I hope that the benefits of prepackaged (and pretested) software can
+outweigh a minor CPU-specific performance loss in most cases.
+
+Also note that, at the moment, this package does not provide SMP-enabled
+BLAS.  I do not believe that simply enabling SMP would easily yield a
+considerable performance boost (and only level3 BLAS routines can
+exploit parallelism anyway).  Please let me know if you think that you
+can benefit from parallel ATLAS (or if you can donate a cluster).
diff --git a/atlas-run.sh b/atlas-run.sh
new file mode 100755
index 0000000..012801e
--- /dev/null
+++ b/atlas-run.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+cmd=$1/$2; shift 2	# program to run is <arg1>/<arg2>; drop both so only the program's own arguments remain
+$cmd $*	# NOTE(review): $cmd and $* are unquoted, so paths/arguments containing spaces get word-split -- confirm callers never pass such values
diff --git a/atlas.mk b/atlas.mk
new file mode 100644
index 0000000..e918593
--- /dev/null
+++ b/atlas.mk
@@ -0,0 +1,161 @@
+#  -----------------------------
+#  Make.ARCH for ATLAS3.7.11
+#  -----------------------------
+
+#  ----------------------------------
+#  Make sure we get the correct shell
+#  ----------------------------------
+   SHELL = /bin/sh
+
+#  -------------------------------------------------
+#  Name indicating the platform to configure BLAS to
+#  -------------------------------------------------
+   ARCH = $(arch)
+
+#  -------------------
+#  Various directories
+#  -------------------
+   TOPdir = $(RPM_BUILD_DIR)/ATLAS
+   INCdir = $(TOPdir)/include/$(ARCH)
+   SYSdir = $(TOPdir)/tune/sysinfo/$(ARCH)
+   GMMdir = $(TOPdir)/src/blas/gemm/$(ARCH)
+   UMMdir = $(GMMdir)
+   GMVdir = $(TOPdir)/src/blas/gemv/$(ARCH)
+   GR1dir = $(TOPdir)/src/blas/ger/$(ARCH)
+   L1Bdir = $(TOPdir)/src/blas/level1/$(ARCH)
+   L2Bdir = $(TOPdir)/src/blas/level2/$(ARCH)
+   L3Bdir = $(TOPdir)/src/blas/level3/$(ARCH)
+   TSTdir = $(TOPdir)/src/testing/$(ARCH)
+   AUXdir = $(TOPdir)/src/auxil/$(ARCH)
+   CBLdir = $(TOPdir)/interfaces/blas/C/src/$(ARCH)
+   FBLdir = $(TOPdir)/interfaces/blas/F77/src/$(ARCH)
+   BINdir = $(TOPdir)/bin/$(ARCH)
+   LIBdir = $(TOPdir)/lib/$(ARCH)
+   PTSdir = $(TOPdir)/src/pthreads
+   MMTdir = $(TOPdir)/tune/blas/gemm/$(ARCH)
+   MVTdir = $(TOPdir)/tune/blas/gemv/$(ARCH)
+   R1Tdir = $(TOPdir)/tune/blas/ger/$(ARCH)
+   L1Tdir = $(TOPdir)/tune/blas/level1/$(ARCH)
+   L3Tdir = $(TOPdir)/tune/blas/level3/$(ARCH)
+
+#  ---------------------------------------------------------------------
+#  Name and location of scripts for running executables during tuning
+#  ---------------------------------------------------------------------
+   ATLRUN = $(BINdir)/ATLrun.sh
+   ATLFWAIT = $(BINdir)/xatlas_waitfile
+
+#  ---------------------
+#  Libraries to be built
+#  ---------------------
+   ATLASlib = $(LIBdir)/libatlas.a
+   CBLASlib = $(LIBdir)/libcblas.a
+   F77BLASlib = $(LIBdir)/libblas.a
+   LAPACKlib = $(LIBdir)/liblapack_atlas.a
+
+   TESTlib = $(LIBdir)/libtstatlas.a
+#  -------------------------------------------
+#  Upper bound on largest cache size, in bytes
+#  -------------------------------------------
+   L2SIZE = -DL2SIZE=1048576
+
+#  ---------------------------------------
+#  Command setting up correct include path
+#  ---------------------------------------
+   INCLUDES = -I$(TOPdir)/include -I$(TOPdir)/include/$(ARCH) \
+              -I$(TOPdir)/include/contrib 
+
+#  -------------------------------------------
+#  Defines for setting up F77/C interoperation
+#  -------------------------------------------
+   F2CDEFS = -DAdd_ -DStringSunStyle -fPIC
+
+#  --------------------------------------
+#  Special defines for user-supplied GEMM
+#  --------------------------------------
+   UMMDEFS = 
+
+#  ------------------------------
+#  Architecture identifying flags
+#  ------------------------------
+   ARCHDEFS = @ARCHDEFS@
+
+#  -------------------------------------------------------------------
+#  NM is the flag required to name a compiled object/executable
+#  OJ is the flag required to compile to object rather than executable
+#  These flags are used by all compilers.
+#  -------------------------------------------------------------------
+   NM = -o
+   OJ = -c
+
+
+#  ---------------------------------------------------------------------------
+#  Fortran 77 compiler and the flags to use.  Presently, ATLAS does not itself
+#  use any Fortran 77, but vendor BLAS are typically written for Fortran, so
+#  any links that include non-ATLAS BLAS will use FLINKER instead of CLINKER
+#  ---------------------------------------------------------------------------
+   F77 = g77
+   F77FLAGS = $(RPM_OPT_FLAGS)
+   FLINKER = $(F77)
+   FLINKFLAGS = $(F77FLAGS)
+   FCLINKFLAGS = $(FLINKFLAGS)
+
+
+#  ---------------------------------------------------------------------------
+#  Various C compilers, and the linker to be used when we are not linking in
+#  non-ATLAS BLAS (which usually necessitate using the Fortran linker).
+#  The C compilers recognized by ATLAS are:
+#     CC :  Compiler to use to compile regular, non-generated code
+#    MCC :  Compiler to use to compile generated, highly-optimized code
+#    XCC :  Compiler to be used on the compile engine of a cross-compiler
+#  These will typically all be the same.  An example of where this is not
+#  the case would be DEC ALPHA 21164, where you want to use gcc for MCC,
+#  because DEC's cc does not allow the programmer access to all 32 floating
+#  point registers.  However, on normal C code, DEC's cc produces much faster
+#  code than gcc, so you want CC set to cc.  Of course, on any system where
+#  you are cross-compiling, you will need to set XCC differently than CC & MCC.
+#  ---------------------------------------------------------------------------
+   CDEFS = $(L2SIZE) $(INCLUDES) $(F2CDEFS) $(ARCHDEFS)
+
+   GOODGCC = gcc
+   CC = gcc
+   CCFLAG0 = $(RPM_OPT_FLAGS)
+   CCFLAGS = $(CDEFS) $(CCFLAG0)
+   MCC = gcc
+   MMFLAGS = $(RPM_OPT_FLAGS)
+   XCC = gcc
+   XCCFLAGS = $(CDEFS) $(RPM_OPT_FLAGS)
+   CLINKER = $(CC)
+   CLINKFLAGS = $(CCFLAGS)
+   BC = $(CC)
+   BCFLAGS = $(CCFLAGS)
+   ARCHIVER = ar
+   ARFLAGS  = r
+   RANLIB   = echo
+
+#  -------------------------------------
+#  tar, gzip, gunzip, and parallel make
+#  -------------------------------------
+   TAR    = /bin/tar 
+   GZIP   = /bin/gzip 
+   GUNZIP = /bin/gunzip 
+   PMAKE  = $(MAKE) -j 2
+
+#  ------------------------------------
+#  Reference and system libraries
+#  ------------------------------------
+   BLASlib = 
+   FBLASlib = 
+   FLAPACKlib = 
+   LIBS = -lm
+
+#  ----------------------------------------------------------
+#  ATLAS install resources (include arch default directories)
+#  ----------------------------------------------------------
+   ARCHDEF = @ARCHDEF@
+   MMDEF = @MMDEF@
+   INSTFLAGS = 
+
+#  ---------------------------------------
+#  Generic targets needed by all makefiles
+#  ---------------------------------------
+waitfile:
diff --git a/atlas.spec b/atlas.spec
new file mode 100644
index 0000000..1d47c4b
--- /dev/null
+++ b/atlas.spec
@@ -0,0 +1,226 @@
+Name: atlas
+Version: 3.7.11
+Release: alt5
+
+Summary: Automatically Tuned Linear Algebra Software (the BLAS library)
+License: BSD
+Group: System/Libraries
+
+URL: http://math-atlas.sourceforge.net
+Source: %name-%version.tar
+Patch: %name-%version-%release.patch
+
+ExclusiveArch: %ix86 amd64 x86_64
+
+# Automatically added by buildreq on Fri Dec 08 2006
+BuildRequires: gcc-fortran
+
+%description
+The ATLAS (Automatically Tuned Linear Algebra Software) project is an
+ongoing research effort focusing on applying empirical techniques in
+order to provide portable performance. At present, it provides C and
+Fortran77 interfaces to a portably efficient BLAS implementation, as
+well as a few routines from LAPACK.
+
+%prep
+%setup -q -n ATLAS
+%patch -p1
+ln -s atlas.mk Make.Linux_i586
+ln -s atlas.mk Make.Linux_sse2
+ln -s atlas.mk Make.Linux_amd64
+chmod +x atlas-run.sh
+ln -s ../atlas-run.sh CONFIG/ATLrun.Linux_i586
+ln -s ../atlas-run.sh CONFIG/ATLrun.Linux_sse2
+ln -s ../atlas-run.sh CONFIG/ATLrun.Linux_amd64
+
+%build
+%add_optflags -fPIC -falign-loops=4
+%add_optflags -mfpmath=387
+%define soffix .so.3
+shared()
+{	# link static archive <name>.a into a versioned shared object; arg 1 is the library base name, remaining args are extra link inputs
+	lib=$1; shift	# after the shift, the positional parameters hold only the extra link inputs
+	${linker:-gcc} -shared -Wl,--whole-archive $lib.a -Wl,--no-whole-archive \
+		-o $lib%soffix -Wl,-soname=$lib%soffix "$@" -lm -Wl,-z,defs	# linker driver comes from the linker variable, gcc when unset
+}
+all_shared()
+{	# build the four shared libraries inside the directory given as arg 1
+	cd $1
+	linker=gcc shared libatlas	# base library, no extra link inputs
+	linker=gcc shared libcblas ./libatlas%soffix	# links against the libatlas built on the previous line
+	linker=g77 shared libblas ./libatlas%soffix	# this one and the next are linked through g77 instead of gcc
+	linker=g77 shared liblapack_atlas ./libatlas%soffix ./libblas%soffix ./libcblas%soffix
+	cd -	# go back to the directory we were in before
+}
+
+%ifarch %ix86
+make killall startup arch=Linux_i586
+make install sanity_test arch=Linux_i586 ARCHDEFS='-DATL_OS_Linux -DATL_GAS_x8632' \
+	MMDEF=$PWD/CONFIG/ARCHS/PII/gcc/gemm ARCHDEF=$PWD/CONFIG/ARCHS/PII/gcc/misc
+all_shared lib/Linux_i586
+make killall startup arch=Linux_sse2
+make install sanity_test arch=Linux_sse2 ARCHDEFS='-DATL_OS_Linux -DATL_GAS_x8632 -DATL_ARCH_HAMMER32 -DATL_SSE1 -DATL_SSE2' \
+	MMDEF=$PWD/CONFIG/ARCHS/HAMMER32SSE2/gcc/gemm ARCHDEF=$PWD/CONFIG/ARCHS/HAMMER32SSE2/gcc/misc
+all_shared lib/Linux_sse2
+%else
+make killall startup arch=Linux_amd64
+make install sanity_test arch=Linux_amd64 ARCHDEFS='-DATL_OS_Linux -DATL_GAS_x8664 -DATL_ARCH_HAMMER64 -DATL_SSE1 -DATL_SSE2 -DATL_USE64BITS' \
+	MMDEF=$PWD/CONFIG/ARCHS/HAMMER64SSE2/gcc/gemm ARCHDEF=$PWD/CONFIG/ARCHS/HAMMER64SSE2/gcc/misc
+all_shared lib/Linux_amd64
+%endif
+
+%install
+mkdir -p %buildroot%_libdir %buildroot%_includedir/atlas
+cp -p include/*.h %buildroot%_includedir/atlas/
+ln -s atlas/cblas.h %buildroot%_includedir/cblas.h
+ln -s atlas/clapack.h %buildroot%_includedir/clapack.h
+
+%ifarch %ix86
+cp -p include/Linux_i586/atlas*.h %buildroot%_includedir/atlas/
+cp -p lib/Linux_i586/lib*%soffix %buildroot%_libdir/
+cp -p lib/Linux_i586/lib*.a %buildroot%_libdir/
+mkdir %buildroot%_libdir/sse2
+cp -p lib/Linux_sse2/libatlas%soffix %buildroot%_libdir/sse2/
+%else
+cp -p include/Linux_amd64/atlas*.h %buildroot%_includedir/atlas/
+cp -p lib/Linux_amd64/lib*%soffix %buildroot%_libdir/
+cp -p lib/Linux_amd64/lib*.a %buildroot%_libdir/
+%endif
+
+ln -s libatlas%soffix %buildroot%_libdir/libatlas.so
+ln -s libcblas%soffix %buildroot%_libdir/libcblas.so
+ln -s libblas%soffix %buildroot%_libdir/libblas.so
+ln -s libblas%soffix %buildroot%_libdir/libf77blas.so
+ln -s liblapack_atlas%soffix %buildroot%_libdir/liblapack_atlas.so
+
+mv %buildroot%_libdir/libatlas{,_}.a
+echo 'GROUP(%_libdir/libatlas_.a -lm)' >%buildroot%_libdir/libatlas.a
+mv %buildroot%_libdir/libcblas{,_}.a
+echo 'GROUP(%_libdir/libcblas_.a %_libdir/libatlas.a)' >%buildroot%_libdir/libcblas.a
+mv %buildroot%_libdir/libblas{,_}.a
+echo 'GROUP(%_libdir/libblas_.a %_libdir/libatlas.a -lgfortran)' >%buildroot%_libdir/libblas.a
+ln -s libblas.a %buildroot%_libdir/libf77blas.a
+mv %buildroot%_libdir/liblapack_atlas{,_}.a
+echo 'GROUP(%_libdir/liblapack_atlas_.a %_libdir/libcblas.a %_libdir/libblas.a)' >%buildroot%_libdir/liblapack_atlas.a
+
+%define pkgdocdir %_docdir/atlas-3.7
+mkdir -p %buildroot%pkgdocdir
+cp -p README.ALT %buildroot%pkgdocdir/
+cp -p doc/AtlasCredits.txt doc/ChangeLog doc/LibReadme.txt %buildroot%pkgdocdir/
+cp -p doc/atlas_{contrib,devel,over}.ps doc/cblas.ps %buildroot%pkgdocdir/
+gzip -9nf %buildroot%pkgdocdir/*.ps
+%ifarch %ix86
+cp -p bin/Linux_i586/INSTALL_LOG/SUMMARY.LOG %buildroot%pkgdocdir/SUMMARY.LOG
+cp -p bin/Linux_sse2/INSTALL_LOG/SUMMARY.LOG %buildroot%pkgdocdir/SUMMARY.LOG.SSE2
+%else
+cp -p bin/Linux_amd64/INSTALL_LOG/SUMMARY.LOG %buildroot%pkgdocdir/SUMMARY.LOG
+%endif
+
+%package -n libatlas
+Summary: Automatically Tuned Linear Algebra Software (the BLAS library)
+Group: System/Libraries
+Conflicts: liblapack < 1:3.0-alt3
+
+%description -n libatlas
+The ATLAS (Automatically Tuned Linear Algebra Software) project is an
+ongoing research effort focusing on applying empirical techniques in
+order to provide portable performance. At present, it provides C and
+Fortran77 interfaces to a portably efficient BLAS implementation, as
+well as a few routines from LAPACK.
+
+%files -n libatlas
+%_libdir/lib*%soffix
+%ifarch %ix86
+%dir %_libdir/sse2
+%_libdir/sse2/libatlas%soffix
+%endif
+%dir %pkgdocdir
+%pkgdocdir/README.ALT
+%pkgdocdir/*.txt
+%pkgdocdir/ChangeLog
+%pkgdocdir/SUMMARY.LOG*
+
+%post -n libatlas -p %post_ldconfig
+%postun -n libatlas -p %postun_ldconfig
+
+%package -n libatlas-devel
+Summary: Automatically Tuned Linear Algebra Software (the BLAS library)
+Group: Development/Other
+Requires: libatlas = %version-%release
+
+%description -n libatlas-devel
+The ATLAS (Automatically Tuned Linear Algebra Software) project is an
+ongoing research effort focusing on applying empirical techniques in
+order to provide portable performance. At present, it provides C and
+Fortran77 interfaces to a portably efficient BLAS implementation, as
+well as a few routines from LAPACK.
+
+%files -n libatlas-devel
+%_libdir/*.so
+%_includedir/cblas.h
+%_includedir/clapack.h
+%dir %_includedir/atlas
+%_includedir/atlas/*.h
+
+%package -n libatlas-devel-static
+Summary: Automatically Tuned Linear Algebra Software (the BLAS library)
+Group: Development/Other
+Requires: libatlas-devel = %version-%release
+Requires: libgfortran-devel-static
+
+%description -n libatlas-devel-static
+The ATLAS (Automatically Tuned Linear Algebra Software) project is an
+ongoing research effort focusing on applying empirical techniques in
+order to provide portable performance. At present, it provides C and
+Fortran77 interfaces to a portably efficient BLAS implementation, as
+well as a few routines from LAPACK.
+
+%files -n libatlas-devel-static
+%_libdir/lib*.a
+%exclude %_libdir/libtstatlas.a
+
+%package doc
+Summary: Automatically Tuned Linear Algebra Software (the BLAS library)
+Group: Development/Other
+Requires: libatlas = %version-%release
+
+%description doc
+The ATLAS (Automatically Tuned Linear Algebra Software) project is an
+ongoing research effort focusing on applying empirical techniques in
+order to provide portable performance. At present, it provides C and
+Fortran77 interfaces to a portably efficient BLAS implementation, as
+well as a few routines from LAPACK.
+
+%files doc
+%dir %pkgdocdir
+%pkgdocdir/*.ps.gz
+
+%changelog
+* Mon Oct 29 2007 Alexey Tourbin <at@altlinux.ru> 3.7.11-alt5
+- packaged static libraries (requested by Pavel A. Piminov);
+  use e.g. "g77 -static test.f -lblas" for static linkage
+- changed src.rpm packaging to keep separate upstream tarball
+
+* Tue Jan 09 2007 Alexey Tourbin <at@altlinux.ru> 3.7.11-alt4
+- backported bugfix for "complex C = A A' bug"
+- backported fix for ilaenv.f which improves LAPACK performance
+
+* Sun Dec 10 2006 Alexey Tourbin <at@altlinux.ru> 3.7.11-alt3
+- compiled with -mfpmath=387 to fix LAPACK test suite on x86_64
+- actually packaged README.ALT
+
+* Fri Dec 08 2006 Alexey Tourbin <at@altlinux.ru> 3.7.11-alt2
+- imported sources into git and built with gear
+- removed PRM_OPT_FLAGS hack that broke SSE2
+- enabled /usr/lib/sse2/libatlas.so.3 for i586
+- libatlas.so.3, libblas.so.3: made some internal functions hidden
+- packaged docs (atlas-doc package has postscript documentation)
+- added README.ALT, which explains some performance issues
+
+* Tue Jun 06 2006 Alexey Tourbin <at@altlinux.ru> 3.7.11-alt1
+- initial revision, with debian fixes
+- PII-optimized BLAS (for i586) yields three-fold improvement
+  over plain fortran BLAS (matrix cross-product in R)
+- SSE2-optimized /usr/lib/sse2/libatlas.so disabled by default
+  because of segfaults (try to rebuild `--with sse2' and let me know)
+- HAMMER64SSE2 for x86_64
diff --git a/bin/uumtst.c b/bin/uumtst.c
index 9940e80..c393106 100644
--- a/bin/uumtst.c
+++ b/bin/uumtst.c
@@ -59,13 +59,14 @@ static void test_lauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
       ATL_lauum(Maj_, Uplo_, N_, A_, lda_)
 #endif
 
+static void ATL_L2GE(const enum CBLAS_ORDER, const int, const TYPE*,
+                     const int, TYPE*, const int);
+
 static void ATL_U2GE
    (const enum CBLAS_ORDER Order, const int N, const TYPE *U, const int ldu,
     TYPE *C, const int ldc)
 {
    int j;
-   static void ATL_L2GE(const enum CBLAS_ORDER, const int, const TYPE*,
-                        const int, TYPE*, const int);
 
    if (Order == CblasRowMajor) ATL_L2GE(CblasColMajor, N, U, ldu, C, ldc);
    else
diff --git a/include/atlas_f77wrap.h b/include/atlas_f77wrap.h
index 5a849df..e77b193 100644
--- a/include/atlas_f77wrap.h
+++ b/include/atlas_f77wrap.h
@@ -895,6 +895,7 @@
  * Prototypes for F77 interface wrappers ATLAS BLAS routines
  * =====================================================================
  */
+#pragma GCC visibility push(hidden)
 void       Mjoin( PATLF77WRAP,  rotg  )
 ( TYPE *,          TYPE *,          TYPE *,          TYPE * );
 #ifdef TREAL
@@ -1085,6 +1086,7 @@ void       Mjoin( PATLF77WRAP,  trsm  )
   F77_INTEGER *,   F77_INTEGER *,   TYPE *,          TYPE *,
   F77_INTEGER *,   TYPE *,          F77_INTEGER * );
 
+#pragma GCC visibility pop
 #endif
 /*
  * End of atlas_f77wrap.h
diff --git a/include/atlas_kernel2.h b/include/atlas_kernel2.h
index fc7985b..f1bca28 100644
--- a/include/atlas_kernel2.h
+++ b/include/atlas_kernel2.h
@@ -80,6 +80,7 @@
  * Recursive Level 2 BLAS function prototypes
  * =====================================================================
  */
+#pragma GCC visibility push(hidden)
 void       ATL_strsvLTU
 (
   const int,
@@ -5404,7 +5405,7 @@ void       ATL_zgbmvN_a1_x1_b0_y1
   double *,               const int
 );
 
-
+#pragma GCC visibility pop
 #endif
 /*
  * End of atlas_kernel2.h
diff --git a/include/atlas_kernel3.h b/include/atlas_kernel3.h
index b3f666d..a8b589d 100644
--- a/include/atlas_kernel3.h
+++ b/include/atlas_kernel3.h
@@ -33,6 +33,7 @@
 /*
  * Real level 3 kernels
  */
+#pragma GCC visibility push(hidden)
 void ATL_ssymmRU
    (const int M, const int N, const void *alpha, const void *A, const int lda,
     const void *B, const int ldb, const void *beta, void *C, const int ldc);
@@ -1390,4 +1391,5 @@ void ATL_ztrsm0RUCU
    (const int M, const int N, const double *alpha, const double *A,
     const int lda, double *C, const int ldc);
 
+#pragma GCC visibility pop
 #endif
diff --git a/include/atlas_lvl2.h b/include/atlas_lvl2.h
index 69b1b54..a56df27 100644
--- a/include/atlas_lvl2.h
+++ b/include/atlas_lvl2.h
@@ -36,6 +36,7 @@
 /*
  * Real kernels
  */
+#pragma GCC visibility push(hidden)
 void ATL_sger1_a1_x1_yX
    (const int M, const int N, const float alpha, const float *X,
     const int incX, const float *Y, const int incY, float *A, const int lda);
@@ -291,4 +292,5 @@ void ATL_zgemvN_a1_x1_b0_y1
     const int lda, const double *X, const int incX, const double *beta,
     double *Y, const int incY);
 
+#pragma GCC visibility pop
 #endif
diff --git a/include/atlas_rblas3.h b/include/atlas_rblas3.h
index 9aac8e0..ca3d563 100644
--- a/include/atlas_rblas3.h
+++ b/include/atlas_rblas3.h
@@ -202,6 +202,7 @@ typedef void           (*RC3_FUN_TRSM_T)
  * Level 3 recursive BLAS internal function prototypes
  * =====================================================================
  */
+#pragma GCC visibility push(hidden)
 void           ATL_sgemmTN_RB
 (  const int,       const int,       const int,       const void *,
    const void *,    const int,       const void *,    const int,
@@ -471,6 +472,7 @@ void           ATL_rtrsmLLN
    const void *,    const int,       void *,          const int,
    const int );
 
+#pragma GCC visibility pop
 #endif
 /*
  * End of atlas_rblas3.h
diff --git a/include/atlas_reflvl2.h b/include/atlas_reflvl2.h
index e64f90d..0697403 100644
--- a/include/atlas_reflvl2.h
+++ b/include/atlas_reflvl2.h
@@ -53,6 +53,7 @@
  * Prototypes for Level 2 Reference Internal ATLAS BLAS routines
  * =====================================================================
  */
+#pragma GCC visibility push(hidden)
 void       ATL_srefgbmvN
 (
   const int,              const int,
@@ -3181,6 +3182,7 @@ void       ATL_zreftrsvUHU
   double *,               const int
 );
 
+#pragma GCC visibility pop
 #endif
 /*
  * End of atlas_reflvl2.h
diff --git a/include/atlas_reflvl3.h b/include/atlas_reflvl3.h
index 8dfcfd0..0177cd9 100644
--- a/include/atlas_reflvl3.h
+++ b/include/atlas_reflvl3.h
@@ -53,6 +53,7 @@
  * Prototypes for Level 3 Reference Internal ATLAS BLAS routines
  * =====================================================================
  */
+#pragma GCC visibility push(hidden)
 void       ATL_srefgemmNN
 (
   const int,              const int,              const int,
@@ -2289,6 +2290,7 @@ void       ATL_zreftrsmRUCU
   double *,               const int
 );
 
+#pragma GCC visibility pop
 #endif
 /*
  * End of atlas_reflvl3.h
diff --git a/include/contrib/ATL_gemv_ger_SSE.h b/include/contrib/ATL_gemv_ger_SSE.h
index 118d3de..10faf15 100644
--- a/include/contrib/ATL_gemv_ger_SSE.h
+++ b/include/contrib/ATL_gemv_ger_SSE.h
@@ -18,8 +18,11 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h> /* for memset */
 
 #include "camm_util.h"
+#include "atlas_misc.h"
+#include "atlas_lvl2.h"
 
 #ifndef GER
 #if defined(BETAX) || defined(BETAXI0)
diff --git a/include/contrib/camm_dpa.h b/include/contrib/camm_dpa.h
index af9c6b1..0912aee 100644
--- a/include/contrib/camm_dpa.h
+++ b/include/contrib/camm_dpa.h
@@ -1619,7 +1619,7 @@ MY_FUNCTION(aconst TYPE *a,int lda,
 #if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2))
 	 ,"m" (w)
 #endif
-	 :"ax","bx","cx","dx","si","di");
+	 :"ax","cx","dx","si","di");
 
 
 }
diff --git a/include/contrib/camm_util.h b/include/contrib/camm_util.h
index 6b150d3..bcfcc99 100644
--- a/include/contrib/camm_util.h
+++ b/include/contrib/camm_util.h
@@ -316,6 +316,15 @@ typedef struct {
 
 #ifndef ATHLON
 
+#ifdef SINGLE
+#ifndef ATL_SSE1
+#error ATL_SSE1 is needed for this kernel
+#endif
+#else
+#ifndef ATL_SSE2
+#error ATL_SSE2 is needed for this kernel
+#endif
+#endif
 
 #if defined(DREAL) || defined(DCPLX)
 #undef SSESUF
diff --git a/interfaces/lapack/F77/src/ilaenv.f b/interfaces/lapack/F77/src/ilaenv.f
index 4e4d350..2beb66e 100644
--- a/interfaces/lapack/F77/src/ilaenv.f
+++ b/interfaces/lapack/F77/src/ilaenv.f
@@ -111,6 +111,7 @@
       CHARACTER*3        C3
       CHARACTER*6        SUBNAM
       INTEGER            I, IC, IZ, NB, NBMIN, NX
+      INTEGER            MN, ATLNB
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          CHAR, ICHAR, INT, MIN, REAL
@@ -207,13 +208,10 @@
 *     ISPEC = 1:  block size
 *
 *     ATLAS sets default blocking factor to the internal blocking factor of
-*     GEMM.  In routines such as QR where unblocked code addresses a panel
-*     (as opposed to a NBxNB block as in POTF2), this will often be too large
-*     to provide the optimal performance due to the slowness of the panel
-*     factorization.  Asymptotically, however, you will want it at least
-*     this big.  Therefore, we choose this NB as default since how much smaller
-*     an NB is required is very problem size and system architecture
-*     dependent.  Note that the recursive routines provided by ATLAS
+*     GEMM.  For particular routines, we then examine input parameters to
+*     see if we should reduce it (for instance, to ensure that we don't
+*     spend too much time in the unblocked level-2 portion of the algorithm).
+*     Note that the recursive routines provided by ATLAS
 *     (presently, LU & Cholesky) do not need this value.
 *
       NB = 60
@@ -226,6 +224,179 @@
       ELSE IF (DCPLX) THEN
          CALL ATL_F77WRAP_ZGETNB(NB)
       ENDIF
+      ATLNB = NB
+*
+*     Find minimum dimension of problem: don't want NB bigger than that
+*
+      MN = N1
+      IF (N2 .GT. 0) THEN
+         IF (MN .GT. N2) MN = N2
+         IF (N3 .GT. 0) THEN
+            IF (MN .GT. N3) MN = N3
+            IF (N4 .GT. 0) THEN
+               IF (MN .GT. N4) MN = N4
+            END IF
+         END IF
+      END IF
+*
+*     Keep NB well below the smallest dimension MN.  If MN <= 3, NB = MN
+*     (unblocked code the whole way); else if NB*5 > MN, shrink NB as below.
+*
+      IF (MN .LE. 3) THEN
+         NB = MN
+      ELSE IF (NB*5 .GT. MN) THEN
+         IF (MN .LT. 16) THEN
+            IF (MN .LT. 8) THEN
+               NB = 2
+            ELSE
+               NB = 4
+            END IF
+         ELSE
+            NB = (MN / 4)
+            IF (NB .GT. 4) THEN
+               NB = (NB/4)*4
+            END IF
+         END IF
+      END IF
+      IF( C2.EQ.'GE' ) THEN
+         IF( C3.EQ.'TRF' ) THEN
+            NB = NB
+*
+*        QR requires extra flops for blocking, so restrain total NB
+*
+         ELSE IF( C3.EQ.'QRF' .OR. C3.EQ.'RQF' .OR. C3.EQ.'LQF' .OR.
+     $            C3.EQ.'QLF' ) THEN
+            IF (NB .GT. 80) THEN
+               IF( SNAME ) THEN
+                  NB = 60
+               ELSE
+                  NB = 40
+               END IF
+            END IF
+         ELSE IF( C3.EQ.'HRD' ) THEN
+            IF (NB .GT. 80) THEN
+               IF( SNAME ) THEN
+                  NB = 60
+               ELSE
+                  NB = 40
+               END IF
+            END IF
+         ELSE IF( C3.EQ.'BRD' ) THEN
+            IF (NB .GT. 80) THEN
+               IF( SNAME ) THEN
+                  NB = 60
+               ELSE
+                  NB = 40
+               END IF
+            END IF
+         ELSE IF( C3.EQ.'TRI' ) THEN
+               NB = NB
+         END IF
+      ELSE IF( C2.EQ.'PO' ) THEN
+         IF( C3.EQ.'TRF' ) THEN
+            NB = NB
+         END IF
+      ELSE IF( C2.EQ.'SY' ) THEN
+         IF( C3.EQ.'TRF' ) THEN
+            NB = NB
+         ELSE IF( SNAME .AND. C3.EQ.'TRD' ) THEN
+            IF (NB .GT. 80) THEN
+               IF( SNAME ) THEN
+                  NB = 60
+               ELSE
+                  NB = 40
+               END IF
+            END IF
+         ELSE IF( SNAME .AND. C3.EQ.'GST' ) THEN
+            NB = NB
+         END IF
+      ELSE IF( CNAME .AND. C2.EQ.'HE' ) THEN
+         IF( C3.EQ.'TRF' ) THEN
+            NB = NB
+         ELSE IF( C3.EQ.'TRD' ) THEN
+            IF (NB .GT. 80) THEN
+               IF( SNAME ) THEN
+                  NB = 60
+               ELSE
+                  NB = 40
+               END IF
+            END IF
+         ELSE IF( C3.EQ.'GST' ) THEN
+            NB = NB
+         END IF
+      ELSE IF( SNAME .AND. C2.EQ.'OR' ) THEN
+         IF( C3( 1:1 ).EQ.'G' ) THEN
+            IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR.
+     $          C4.EQ.'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR.
+     $          C4.EQ.'BR' ) THEN
+               IF (NB .GT. 80) NB = 60
+            END IF
+         ELSE IF( C3( 1:1 ).EQ.'M' ) THEN
+            IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR.
+     $          C4.EQ.'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR.
+     $          C4.EQ.'BR' ) THEN
+               IF (NB .GT. 80) NB = 60
+            END IF
+         END IF
+      ELSE IF( CNAME .AND. C2.EQ.'UN' ) THEN
+         IF( C3( 1:1 ).EQ.'G' ) THEN
+            IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR.
+     $          C4.EQ.'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR.
+     $          C4.EQ.'BR' ) THEN
+               IF (NB .GT. 80) NB = 40
+            END IF
+         ELSE IF( C3( 1:1 ).EQ.'M' ) THEN
+            IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR.
+     $          C4.EQ.'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR.
+     $          C4.EQ.'BR' ) THEN
+               IF (NB .GT. 80) NB = 40
+            END IF
+         END IF
+      ELSE IF( C2.EQ.'GB' ) THEN
+         IF( C3.EQ.'TRF' ) THEN
+            IF( SNAME ) THEN
+               NB = NB
+            ELSE
+               NB = NB
+            END IF
+         END IF
+      ELSE IF( C2.EQ.'PB' ) THEN
+         IF( C3.EQ.'TRF' ) THEN
+            IF( SNAME ) THEN
+               IF( N2.LE.64 ) THEN
+                  NB = NB
+               ELSE
+                  NB = NB
+               END IF
+            ELSE
+               IF( N2.LE.64 ) THEN
+                  NB = NB
+               ELSE
+                  NB = NB
+               END IF
+            END IF
+         END IF
+      ELSE IF( C2.EQ.'TR' ) THEN
+         IF( C3.EQ.'TRI' ) THEN
+            IF( SNAME ) THEN
+               NB = NB
+            ELSE
+               NB = NB
+            END IF
+         END IF
+      ELSE IF( C2.EQ.'LA' ) THEN
+         IF( C3.EQ.'UUM' ) THEN
+            IF( SNAME ) THEN
+               NB = NB
+            ELSE
+               NB = NB
+            END IF
+         END IF
+      ELSE IF( SNAME .AND. C2.EQ.'ST' ) THEN
+         IF( C3.EQ.'EBZ' ) THEN
+            NB = 1
+         END IF
+      END IF
 *
       ILAENV = NB
       RETURN
diff --git a/src/blas/gemm/ATL_cmmJIK.c b/src/blas/gemm/ATL_cmmJIK.c
index ed9ad75..e9231fa 100644
--- a/src/blas/gemm/ATL_cmmJIK.c
+++ b/src/blas/gemm/ATL_cmmJIK.c
@@ -190,7 +190,7 @@ int Mjoin(PATL,mmJIK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
          pA = ATL_AlignPtr(vB);
          if (TA == AtlasNoTrans)
             Mjoin(PATL,row2blkT2_a1)(M, K, A, lda, pA, alpha);
-         else Mjoin(PATL,col2blk_a1)(K, M, A, lda, pA, alpha);
+         else Mjoin(PATL,col2blk2_a1)(K, M, A, lda, pA, alpha);
 /*
  *       Can't write directly to C if alpha is not one
  */
diff --git a/tune/blas/gemm/CASES/ATL_dmm4x4x2_US.c b/tune/blas/gemm/CASES/ATL_dmm4x4x2_US.c
index bdffd5e..ffb64f0 100644
--- a/tune/blas/gemm/CASES/ATL_dmm4x4x2_US.c
+++ b/tune/blas/gemm/CASES/ATL_dmm4x4x2_US.c
@@ -104,7 +104,7 @@
    #define CSH 3
    #define CMUL(arg_) arg_
 #endif
-if 1
+#if 1
    #define prefR1(mem) prefetch mem, 0
    #define prefR2(mem) prefetch mem, 0
    #define prefW2(mem) prefetch mem, 2
diff --git a/tune/blas/gemm/CASES/ATL_dmm6x1x60_sse2.c b/tune/blas/gemm/CASES/ATL_dmm6x1x60_sse2.c
index 5c39f5b..5b1b522 100644
--- a/tune/blas/gemm/CASES/ATL_dmm6x1x60_sse2.c
+++ b/tune/blas/gemm/CASES/ATL_dmm6x1x60_sse2.c
@@ -40,6 +40,11 @@
 #if (NB/6)*6 != NB
    #error "NB must be multiple of 6!"
 #endif
+
+#ifndef ATL_Has3DNow
+   #define prefetchw prefetchnta
+#endif
+
 #
 #  Integer register usage shown be these defines
 #
diff --git a/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1.c b/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1.c
index afb6ccc..334a394 100644
--- a/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1.c
+++ b/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1.c
@@ -31,6 +31,7 @@
 #include "atlas_misc.h"
 #include "atlas_level2.h"
 #include "atlas_level1.h"
+#include "atlas_lvl2.h"
 
 #ifdef Conj_
    #define PEQ -=
diff --git a/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1a.c b/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1a.c
index 6329073..e4f0345 100644
--- a/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1a.c
+++ b/tune/blas/gemv/CASES/ATL_cgemvN_1x1_1a.c
@@ -31,6 +31,7 @@
 #include "atlas_misc.h"
 #include "atlas_level2.h"
 #include "atlas_level1.h"
+#include "atlas_lvl2.h"
 
 #ifdef Conj_
    #define PEQ -=
diff --git a/tune/blas/gemv/CASES/ATL_cgemvN_2x2_0.c b/tune/blas/gemv/CASES/ATL_cgemvN_2x2_0.c
index c351d1f..2ee4ba1 100644
--- a/tune/blas/gemv/CASES/ATL_cgemvN_2x2_0.c
+++ b/tune/blas/gemv/CASES/ATL_cgemvN_2x2_0.c
@@ -31,6 +31,7 @@
 #include "atlas_misc.h"
 #include "atlas_level2.h"
 #include "atlas_level1.h"
+#include "atlas_lvl2.h"
 
 #ifdef Conj_
    #define PEQ -=
diff --git a/tune/blas/gemv/CASES/ATL_cgemvN_4x2_1.c b/tune/blas/gemv/CASES/ATL_cgemvN_4x2_1.c
index fa407f3..99520d6 100644
--- a/tune/blas/gemv/CASES/ATL_cgemvN_4x2_1.c
+++ b/tune/blas/gemv/CASES/ATL_cgemvN_4x2_1.c
@@ -31,6 +31,7 @@
 #include "atlas_misc.h"
 #include "atlas_level2.h"
 #include "atlas_level1.h"
+#include "atlas_lvl2.h"
 #include "atlas_prefetch.h"
 
 #ifdef Conj_
diff --git a/tune/blas/gemv/CASES/ATL_cgemvN_mm.c b/tune/blas/gemv/CASES/ATL_cgemvN_mm.c
index cbae230..ed16aab 100644
--- a/tune/blas/gemv/CASES/ATL_cgemvN_mm.c
+++ b/tune/blas/gemv/CASES/ATL_cgemvN_mm.c
@@ -40,6 +40,7 @@
    #define MEQ -=
 #endif
 
+#include "atlas_lvl2.h"
 #include "atlas_lvl3.h"
 
 #ifdef Conj_
diff --git a/tune/blas/gemv/CASES/ATL_cgemvT_1x1_1.c b/tune/blas/gemv/CASES/ATL_cgemvT_1x1_1.c
index 12256b2..53268c1 100644
--- a/tune/blas/gemv/CASES/ATL_cgemvT_1x1_1.c
+++ b/tune/blas/gemv/CASES/ATL_cgemvT_1x1_1.c
@@ -31,6 +31,7 @@
 #include "atlas_misc.h"
 #include "atlas_level1.h"
 #include "atlas_level2.h"
+#include "atlas_lvl2.h"
 
 #ifdef Conj_
    #define PEQ -=
diff --git a/tune/blas/gemv/CASES/ATL_cgemvT_2x2_0.c b/tune/blas/gemv/CASES/ATL_cgemvT_2x2_0.c
index d617a8a..4cbf0fe 100644
--- a/tune/blas/gemv/CASES/ATL_cgemvT_2x2_0.c
+++ b/tune/blas/gemv/CASES/ATL_cgemvT_2x2_0.c
@@ -31,6 +31,7 @@
 #include "atlas_misc.h"
 #include "atlas_level1.h"
 #include "atlas_level2.h"
+#include "atlas_lvl2.h"
 #include "atlas_prefetch.h"
 
 #ifdef Conj_
diff --git a/tune/blas/gemv/CASES/ATL_cgemvT_2x4_1.c b/tune/blas/gemv/CASES/ATL_cgemvT_2x4_1.c
index 1a1d257..5631903 100644
--- a/tune/blas/gemv/CASES/ATL_cgemvT_2x4_1.c
+++ b/tune/blas/gemv/CASES/ATL_cgemvT_2x4_1.c
@@ -31,6 +31,7 @@
 #include "atlas_misc.h"
 #include "atlas_level1.h"
 #include "atlas_level2.h"
+#include "atlas_lvl2.h"
 #include "atlas_prefetch.h"
 
 #ifdef Conj_
diff --git a/tune/blas/gemv/CASES/ATL_cgemvT_mm.c b/tune/blas/gemv/CASES/ATL_cgemvT_mm.c
index 691b3a6..65cbbdc 100644
--- a/tune/blas/gemv/CASES/ATL_cgemvT_mm.c
+++ b/tune/blas/gemv/CASES/ATL_cgemvT_mm.c
@@ -31,6 +31,7 @@
 #include "atlas_misc.h"
 #include "atlas_level1.h"
 #include "atlas_level2.h"
+#include "atlas_lvl2.h"
 #include "atlas_lvl3.h"
 
 #ifdef Conj_
diff --git a/tune/blas/gemv/CASES/ATL_gemvN_dummy.c b/tune/blas/gemv/CASES/ATL_gemvN_dummy.c
index 8627ff7..0366cde 100644
--- a/tune/blas/gemv/CASES/ATL_gemvN_dummy.c
+++ b/tune/blas/gemv/CASES/ATL_gemvN_dummy.c
@@ -30,6 +30,7 @@
 
 #include "atlas_misc.h"
 #include "atlas_level2.h"
+#include "atlas_lvl2.h"
 #define gemv0 Mjoin(Mjoin(Mjoin(Mjoin(gemvN,NM),_x1),BNM),_y1)
 #define gemvC Mjoin(Mjoin(Mjoin(Mjoin(gemvC,NM),_x1),BNM),_y1)
 #define gemvNc Mjoin(Mjoin(Mjoin(Mjoin(gemvNc,NM),_x1),BNM),_y1)
diff --git a/tune/blas/gemv/CASES/ATL_gemvN_dummy2.c b/tune/blas/gemv/CASES/ATL_gemvN_dummy2.c
index 5bee7b4..0ed1d67 100644
--- a/tune/blas/gemv/CASES/ATL_gemvN_dummy2.c
+++ b/tune/blas/gemv/CASES/ATL_gemvN_dummy2.c
@@ -30,6 +30,7 @@
 
 #include "atlas_misc.h"
 #include "atlas_level2.h"
+#include "atlas_lvl2.h"
 #ifdef Conj_
    void Mjoin(Mjoin(Mjoin(Mjoin(Mjoin(PATL,gemvNc),NM),_x1),BNM),_y1)
 #else
diff --git a/tune/blas/gemv/CASES/ATL_gemvT_1x1_1.c b/tune/blas/gemv/CASES/ATL_gemvT_1x1_1.c
index d5b4ae2..a4e0c00 100644
--- a/tune/blas/gemv/CASES/ATL_gemvT_1x1_1.c
+++ b/tune/blas/gemv/CASES/ATL_gemvT_1x1_1.c
@@ -30,6 +30,7 @@
 
 #include "atlas_misc.h"
 #include "atlas_level1.h"
+#include "atlas_lvl2.h"
 #include "atlas_level2.h"
 
 #define gemv0 Mjoin(Mjoin(Mjoin(Mjoin(gemvT,NM),_x1),BNM),_y1)
diff --git a/tune/blas/gemv/CASES/ATL_gemvT_2x16_1.c b/tune/blas/gemv/CASES/ATL_gemvT_2x16_1.c
index 73c1535..9919bf4 100644
--- a/tune/blas/gemv/CASES/ATL_gemvT_2x16_1.c
+++ b/tune/blas/gemv/CASES/ATL_gemvT_2x16_1.c
@@ -31,6 +31,7 @@
 #include "atlas_misc.h"
 #include "atlas_level1.h"
 #include "atlas_level2.h"
+#include "atlas_lvl2.h"
 #include "atlas_prefetch.h"
 
 static void gemvT_Msmall(const int M, const int N, const TYPE *A, const int lda,
diff --git a/tune/blas/gemv/CASES/ATL_gemvT_2x8_0.c b/tune/blas/gemv/CASES/ATL_gemvT_2x8_0.c
index a822395..b953b57 100644
--- a/tune/blas/gemv/CASES/ATL_gemvT_2x8_0.c
+++ b/tune/blas/gemv/CASES/ATL_gemvT_2x8_0.c
@@ -31,6 +31,7 @@
 #include "atlas_misc.h"
 #include "atlas_level1.h"
 #include "atlas_level2.h"
+#include "atlas_lvl2.h"
 
 static void gemvT_Nsmall(const int M, const int N, const TYPE *A, const int lda,
                          const TYPE *X, const SCALAR beta, TYPE *Y)
diff --git a/tune/blas/gemv/CASES/ATL_gemvT_4x16_1.c b/tune/blas/gemv/CASES/ATL_gemvT_4x16_1.c
index 9b7625d..74780ba 100644
--- a/tune/blas/gemv/CASES/ATL_gemvT_4x16_1.c
+++ b/tune/blas/gemv/CASES/ATL_gemvT_4x16_1.c
@@ -31,6 +31,7 @@
 #include "atlas_misc.h"
 #include "atlas_level1.h"
 #include "atlas_level2.h"
+#include "atlas_lvl2.h"
 #include "atlas_prefetch.h"
 
 static void gemvT_Nsmall(const int M, const int N, const TYPE *A, const int lda,
diff --git a/tune/blas/gemv/CASES/ATL_gemvT_4x8_1.c b/tune/blas/gemv/CASES/ATL_gemvT_4x8_1.c
index add1ee3..de00f75 100644
--- a/tune/blas/gemv/CASES/ATL_gemvT_4x8_1.c
+++ b/tune/blas/gemv/CASES/ATL_gemvT_4x8_1.c
@@ -30,6 +30,7 @@
 
 #include "atlas_misc.h"
 #include "atlas_level1.h"
+#include "atlas_lvl2.h"
 #include "atlas_level2.h"
 
 static void gemvT_Nsmall(const int M, const int N, const TYPE *A, const int lda,
diff --git a/tune/blas/gemv/CASES/ATL_gemvT_dummy.c b/tune/blas/gemv/CASES/ATL_gemvT_dummy.c
index 48b51cb..e813faa 100644
--- a/tune/blas/gemv/CASES/ATL_gemvT_dummy.c
+++ b/tune/blas/gemv/CASES/ATL_gemvT_dummy.c
@@ -30,6 +30,7 @@
 
 #include "atlas_misc.h"
 #include "atlas_level2.h"
+#include "atlas_lvl2.h"
 #define gemv0 Mjoin(Mjoin(Mjoin(Mjoin(gemvT,NM),_x1),BNM),_y1)
 #define gemvC Mjoin(Mjoin(Mjoin(Mjoin(gemvC,NM),_x1),BNM),_y1)
 #define gemvNc Mjoin(Mjoin(Mjoin(Mjoin(gemvNc,NM),_x1),BNM),_y1)
diff --git a/tune/blas/gemv/CASES/ATL_gemvT_dummy2.c b/tune/blas/gemv/CASES/ATL_gemvT_dummy2.c
index 39bcee6..901dc3f 100644
--- a/tune/blas/gemv/CASES/ATL_gemvT_dummy2.c
+++ b/tune/blas/gemv/CASES/ATL_gemvT_dummy2.c
@@ -30,6 +30,7 @@
 
 #include "atlas_misc.h"
 #include "atlas_level2.h"
+#include "atlas_lvl2.h"
 #ifdef Conj_
    void Mjoin(Mjoin(Mjoin(Mjoin(Mjoin(PATL,gemvC),NM),_x1),BNM),_y1)
 #else
diff --git a/tune/blas/gemv/CASES/ATL_gemvT_mm.c b/tune/blas/gemv/CASES/ATL_gemvT_mm.c
index 8a9438f..a4af18d 100644
--- a/tune/blas/gemv/CASES/ATL_gemvT_mm.c
+++ b/tune/blas/gemv/CASES/ATL_gemvT_mm.c
@@ -31,6 +31,7 @@
 #include "atlas_misc.h"
 #include "atlas_level1.h"
 #include "atlas_level2.h"
+#include "atlas_lvl2.h"
 #include "atlas_lvl3.h"
 
 #define gemv0 Mjoin(Mjoin(Mjoin(Mjoin(gemvT,NM),_x1),BNM),_y1)
diff --git a/tune/blas/gemv/CASES/ATL_gemv_SSE.c b/tune/blas/gemv/CASES/ATL_gemv_SSE.c
index d74fe2f..233f743 100644
--- a/tune/blas/gemv/CASES/ATL_gemv_SSE.c
+++ b/tune/blas/gemv/CASES/ATL_gemv_SSE.c
@@ -32,6 +32,7 @@
 #include <stdlib.h>
 
 #include "atlas_misc.h"
+#include "atlas_lvl2.h"
 
 #include "camm_util.h"
 #ifndef ATL_GAS_x8632
diff --git a/tune/blas/level1/IAMAX/iamax_sse.c b/tune/blas/level1/IAMAX/iamax_sse.c
index 344ba82..16973fc 100644
--- a/tune/blas/level1/IAMAX/iamax_sse.c
+++ b/tune/blas/level1/IAMAX/iamax_sse.c
@@ -1,5 +1,9 @@
 #ifdef SREAL
 
+#ifndef ATL_SSE1
+#error ATL_SSE1 needed for this kernel
+#endif
+
 #ifdef ATL_GAS_x8632
    #define movq movl
    #define addq addl
@@ -318,6 +322,11 @@ FA_NEWMAX:
 	jmp	FA_INC
 #else
 
+#ifndef ATL_SSE2
+#error ATL_SSE2 needed for this kernel
+#endif
+
+
 #ifdef ATL_GAS_x8632
    #define movq movl
    #define addq addl