Sophie

Sophie

distrib > Mageia > 5 > x86_64 > media > core-release-src > by-pkgid > c95eef8fb630bc3e4e28644eaf2060a5 > files > 2

x11-driver-video-intel-2.99.917-14.mga5.src.rpm


This is a sync from 2.99.917..master

Currently topmost commit:

commit 44452825fb862c81957f53e04f739e2d96864f91
Author: Chris Wilson <chris@chris-wilson.co.uk>
Date:   Fri Mar 20 13:13:43 2015 +0000

    sna: Remove bogus assert(output->randr_output)


 configure.ac                       |   83 ++-
 src/backlight.c                    |    2 +-
 src/compat-api.h                   |   23 +
 src/i915_pciids.h                  |   32 +-
 src/intel_device.c                 |   20 +-
 src/intel_driver.h                 |    1 +
 src/intel_module.c                 |   11 +-
 src/intel_options.c                |   86 +++
 src/intel_options.h                |    2 +
 src/legacy/i810/i810_memory.c      |    6 +-
 src/sna/blt.c                      |  361 ++++++++++++
 src/sna/compiler.h                 |    2 +
 src/sna/gen2_render.c              |    6 +-
 src/sna/gen3_render.c              |   12 +-
 src/sna/gen4_render.c              |    8 +-
 src/sna/gen5_render.c              |    6 +-
 src/sna/gen6_common.h              |  110 ++--
 src/sna/gen6_render.c              |   22 +-
 src/sna/gen7_render.c              |   37 +-
 src/sna/gen8_render.c              |   34 +-
 src/sna/kgem.c                     |  138 +++--
 src/sna/kgem.h                     |   26 +-
 src/sna/sna.h                      |   57 +-
 src/sna/sna_accel.c                |  239 +++++---
 src/sna/sna_blt.c                  |   71 ++-
 src/sna/sna_composite.c            |  104 ++--
 src/sna/sna_display.c              | 1105 +++++++++++++++++++++++++++---------
 src/sna/sna_display_fake.c         |    3 +-
 src/sna/sna_dri2.c                 |  706 +++++++++++++----------
 src/sna/sna_driver.c               |  233 ++++----
 src/sna/sna_glyphs.c               |    2 +-
 src/sna/sna_present.c              |  383 ++++++++++---
 src/sna/sna_render.c               |    4 +
 src/sna/sna_render_inline.h        |    6 +
 src/sna/sna_trapezoids_imprecise.c |  340 +++++------
 src/sna/sna_trapezoids_mono.c      |   73 ++-
 src/sna/sna_trapezoids_precise.c   |  340 +++++------
 src/sna/sna_video.c                |   66 +++
 src/sna/sna_video.h                |    3 +
 src/sna/sna_video_overlay.c        |   12 +-
 src/sna/sna_video_sprite.c         |    9 +-
 src/sna/sna_video_textured.c       |    5 +-
 src/sna/xassert.h                  |   11 +
 src/uxa/i965_video.c               |    1 -
 src/uxa/intel.h                    |   16 +-
 src/uxa/intel_display.c            |  115 ++--
 src/uxa/intel_dri.c                |  484 +++++++++-------
 src/uxa/intel_driver.c             |   23 +-
 src/uxa/intel_present.c            |   24 +-
 test/Makefile.am                   |    2 +
 test/dri2-race.c                   |  274 ++++++++-
 test/dri2-speed.c                  |  296 ++++++++++
 test/dri2-test.c                   |   30 +-
 test/present-speed.c               |  351 ++++++++++++
 test/present-test.c                | 1065 ++++++++++++++++++++++++++++++++--
 tools/Makefile.am                  |   15 +
 tools/cursor.c                     |  108 ++++
 tools/dri3info.c                   |  139 +++++
 tools/virtual.c                    |   18 +-
 61 files changed, 5882 insertions(+), 1885 deletions(-)

diff --git a/configure.ac b/configure.ac
index 61bea43..9aadc73 100644
--- a/configure.ac
+++ b/configure.ac
@@ -195,17 +195,22 @@ AC_ARG_ENABLE(udev,
               [UDEV="$enableval"],
               [UDEV=auto])
 
+udev_msg=" disabled"
 if test "x$UDEV" != "xno"; then
 	PKG_CHECK_MODULES(UDEV, [libudev], [udev="yes"], [udev="no"])
+	AC_CHECK_HEADERS([sys/stat.h], [], [udev="no"])
 	if test "x$UDEV" = "xyes" -a "x$udev" != "xyes"; then
 		AC_MSG_ERROR([udev support requested but not found (libudev)])
 	fi
 	if test "x$udev" = "xyes"; then
 		AC_DEFINE(HAVE_UDEV,1,[Enable udev-based monitor hotplug detection])
+		udev_msg=" yes"
+	else
+		udev_msg=" no"
 	fi
 fi
 
-PKG_CHECK_MODULES(X11, [x11 xrender xrandr xext xfixes cairo cairo-xlib-xrender pixman-1 libpng], [x11="yes"], [x11="no"])
+PKG_CHECK_MODULES(X11, [x11 x11-xcb xcb-dri2 xrender xrandr xext xfixes cairo cairo-xlib-xrender pixman-1 libpng], [x11="yes"], [x11="no"])
 AM_CONDITIONAL(HAVE_X11, test "x$x11" = "xyes")
 
 cpuid="yes"
@@ -270,7 +275,7 @@ if test "x$shm" = "xyes"; then
 	AC_DEFINE([HAVE_MIT_SHM], 1, [Define to 1 if MIT-SHM is available])
 fi
 
-PKG_CHECK_MODULES(X11_DRI3, [xcb-dri3 xcb-sync xcb-present x11-xcb xshmfence x11 xrender xext libdrm], [x11_dri3="yes"], [x11_dri3="no"])
+PKG_CHECK_MODULES(X11_DRI3, [xcb-dri3 xcb-sync xcb-xfixes xcb-present x11-xcb xshmfence x11 xrender xext libdrm], [x11_dri3="yes"], [x11_dri3="no"])
 AM_CONDITIONAL(X11_DRI3, test "x$x11_dri3" = "xyes" -a "x$shm" = "xyes")
 AM_CONDITIONAL(X11_SHM, test "x$shm" = "xyes")
 
@@ -307,6 +312,8 @@ if test "x$tools" != "xno"; then
 		tools="no"
 	fi
 
+	PKG_CHECK_MODULES(TOOL_CURSOR, [xfixes x11 libpng], [cursor="yes"], [cursor="no"])
+
 	IVO_CFLAGS="$IVO_CFLAGS $extra_cflags"
 fi
 if test "x$tools" != "xno"; then
@@ -315,6 +322,7 @@ fi
 AC_MSG_CHECKING([whether to build additional tools])
 AC_MSG_RESULT([$tools])
 AM_CONDITIONAL(BUILD_TOOLS, test "x$tools" != "xno")
+AM_CONDITIONAL(BUILD_TOOL_CURSOR, test "x$cursor" = "xyes")
 
 # Define a configure option for an alternate module directory
 AC_ARG_WITH(xorg-module-dir,
@@ -339,10 +347,20 @@ AC_ARG_ENABLE(dri2,
 	      [DRI2=$enableval],
 	      [DRI2=yes])
 AC_ARG_ENABLE(dri3,
-	      AS_HELP_STRING([--enable-dri3],
-			     [Enable DRI3 support [[default=no]]]),
+	      AS_HELP_STRING([--disable-dri3],
+			     [Disable DRI3 support [[default=yes]]]),
 	      [DRI3=$enableval],
-	      [DRI3=no])
+	      [DRI3=yes])
+AC_ARG_WITH(default-dri,
+	    AS_HELP_STRING([--with-default-dri],
+			   [Select the default maximum DRI level [[default 2]]]),
+	      [DRI_DEFAULT=$withval],
+	      [DRI_DEFAULT=2])
+if test "x$DRI_DEFAULT" = "x0"; then
+	AC_DEFINE(DEFAULT_DRI_LEVEL, 0,[Default DRI level])
+else
+	AC_DEFINE(DEFAULT_DRI_LEVEL, ~0, [Default DRI level])
+fi
 
 AC_ARG_ENABLE(xvmc, AS_HELP_STRING([--disable-xvmc],
                                   [Disable XvMC support [[default=yes]]]),
@@ -398,6 +416,8 @@ AC_ARG_ENABLE(sna,
 	      [SNA="$enableval"],
 	      [SNA=auto])
 
+AC_CHECK_HEADERS([dev/wscons/wsconsio.h])
+
 if test "x$SNA" != "xno"; then
 	AC_DEFINE(USE_SNA, 1, [Enable SNA support])
 	AC_CHECK_HEADERS([sys/sysinfo.h], AC_CHECK_MEMBERS([struct sysinfo.totalram], [], [], [[#include <sys/sysinfo.h>]]))
@@ -426,6 +446,8 @@ fi
 
 PKG_CHECK_MODULES(XORG, [xorg-server >= $required_xorg_server_version xproto fontsproto pixman-1 >= $required_pixman_version $REQUIRED_MODULES])
 ABI_VERSION=`$PKG_CONFIG --variable=abi_videodrv xorg-server`
+XSERVER_VERSION=`$PKG_CONFIG --modversion xorg-server`
+PIXMAN_VERSION=`$PKG_CONFIG --modversion pixman-1`
 
 if test "x$ONLY_UMS" = "xyes"; then
 	UMS="yes"
@@ -519,7 +541,12 @@ AC_MSG_RESULT([$have_dri1])
 AM_CONDITIONAL(DRI1, test "x$have_dri1" != "xno")
 if test "x$have_dri1" != "xno"; then
         AC_DEFINE(HAVE_DRI1,1,[Enable DRI1 driver support])
-	dri_msg="$dri_msg DRI1"
+	str="DRI1"
+	if test "x$DRI_DEFAULT" = "x1"; then
+		AC_DEFINE(DEFAULT_DRI_LEVEL,1,[Default DRI level])
+		str="*$str"
+	fi
+	dri_msg="$dri_msg $str"
 else
         DRI1_CFLAGS=""
         DRI1_LIBS=""
@@ -576,7 +603,12 @@ AM_CONDITIONAL(DRI2, test "x$have_dri2" != "xno")
 AC_MSG_RESULT([$have_dri2])
 if test "x$have_dri2" != "xno"; then
         AC_DEFINE(HAVE_DRI2,1,[Enable DRI2 driver support])
-	dri_msg="$dri_msg DRI2"
+	str="DRI2"
+	if test "x$DRI_DEFAULT" = "x2"; then
+		AC_DEFINE(DEFAULT_DRI_LEVEL,2,[Default DRI level])
+		str="*$str"
+	fi
+	dri_msg="$dri_msg $str"
 else
 	if test "x$DRI" = "xyes" -a "x$DRI2" != "xno" -a "x$KMS" = "xyes"; then
 		AC_MSG_ERROR([DRI2 requested but prerequisites not found])
@@ -591,13 +623,21 @@ AM_CONDITIONAL(DRI3, test "x$have_dri3" != "xno")
 AC_MSG_RESULT([$have_dri3])
 if test "x$have_dri3" != "xno"; then
         AC_DEFINE(HAVE_DRI3,1,[Enable DRI3 driver support])
-	dri_msg="$dri_msg DRI3"
+	str="DRI3"
+	if test "x$DRI_DEFAULT" = "x3"; then
+		AC_DEFINE(DEFAULT_DRI_LEVEL,3,[Default DRI level])
+		str="*$str"
+	fi
+	dri_msg="$dri_msg $str"
 else
 	if test "x$DRI" = "xyes" -a "x$DRI3" != "xno" -a "x$KMS" = "xyes"; then
 		AC_MSG_ERROR([DRI3 requested but prerequisites not found])
 	fi
 fi
 
+AC_MSG_CHECKING([default DRI support])
+AC_MSG_RESULT([$DRI_DEFAULT])
+
 AC_CHECK_HEADERS([X11/extensions/dpmsconst.h])
 
 PRESENT="no"
@@ -711,27 +751,6 @@ if test "x$TEARFREE" = "xyes"; then
 	xp_msg="$xp_msg TearFree"
 fi
 
-AC_ARG_ENABLE(rendernode,
-	      AS_HELP_STRING([--enable-rendernode],
-			     [Enable use of render nodes (experimental) [default=no]]),
-	      [RENDERNODE="$enableval"],
-	      [RENDERNODE="no"])
-AM_CONDITIONAL(USE_RENDERNODE, test "x$RENDERNODE" = "xyes")
-if test "x$RENDERNODE" = "xyes"; then
-	AC_DEFINE(USE_RENDERNODE,1,[Assume "rendernode" support])
-	xp_msg="$xp_msg rendernode"
-fi
-
-AC_ARG_ENABLE(wc-mmap,
-	      AS_HELP_STRING([--enable-wc-mmap],
-			     [Enable use of WriteCombining mmaps [default=no]]),
-	      [WC_MMAP="$enableval"],
-	      [WC_MMAP="no"])
-if test "x$WC_MMAP" = "xyes"; then
-	AC_DEFINE(USE_WC_MMAP,1,[Enable use of WriteCombining mmaps])
-	xp_msg="$xp_msg mmap(wc)"
-fi
-
 AC_ARG_ENABLE(create2,
 	      AS_HELP_STRING([--enable-create2],
 			     [Enable use of create2 ioctl (experimental) [default=no]]),
@@ -855,7 +874,7 @@ AC_OUTPUT
 
 echo ""
 echo ""
-test -e `pwd $0`/README && cat `pwd $0`/README
+cat $srcdir/README
 
 accel_msg=""
 if test "x$SNA" != "xno"; then
@@ -895,13 +914,15 @@ fi
 
 echo ""
 echo "AC_PACKAGE_STRING will be compiled with:"
-echo "  Xorg Video ABI version: $ABI_VERSION"
+echo "  Xorg Video ABI version: $ABI_VERSION (xorg-server-$XSERVER_VERSION)"
+echo "  pixman version: pixman-1-$PIXMAN_VERSION"
 echo "  Acceleration backends:$accel_msg"
 echo "  Additional debugging support?$debug_msg"
 echo "  Support for Kernel Mode Setting? $KMS"
 echo "  Support for legacy User Mode Setting (for i810)? $UMS"
 echo "  Support for Direct Rendering Infrastructure:$dri_msg"
 echo "  Support for Xv motion compensation (XvMC and libXvMC):$xvmc_msg"
+echo "  Support for display hotplug notifications (udev):$udev_msg"
 echo "  Build additional tools and utilities?$tools_msg"
 if test -n "$xp_msg"; then
 echo "  Experimental support:$xp_msg"
diff --git a/src/backlight.c b/src/backlight.c
index 9f23986..5d63b2c 100644
--- a/src/backlight.c
+++ b/src/backlight.c
@@ -84,7 +84,7 @@ void backlight_init(struct backlight *b)
 	b->has_power = 0;
 }
 
-#ifdef __OpenBSD__
+#ifdef HAVE_DEV_WSCONS_WSCONSIO_H
 
 #include <dev/wscons/wsconsio.h>
 #include <xf86Priv.h>
diff --git a/src/compat-api.h b/src/compat-api.h
index d09e1fb..aa93bee 100644
--- a/src/compat-api.h
+++ b/src/compat-api.h
@@ -39,7 +39,13 @@
 
 #ifndef XF86_HAS_SCRN_CONV
 #define xf86ScreenToScrn(s) xf86Screens[(s)->myNum]
+#if XORG_VERSION_CURRENT < XORG_VERSION_NUMERIC(1,1,0,0,0)
 #define xf86ScrnToScreen(s) screenInfo.screens[(s)->scrnIndex]
+#else
+#define xf86ScrnToScreen(s) ((s)->pScreen)
+#endif
+#else
+#define xf86ScrnToScreen(s) ((s)->pScreen)
 #endif
 
 #ifndef XF86_SCRN_INTERFACE
@@ -131,6 +137,17 @@ region_rects(const RegionRec *r)
 	return r->data ? (const BoxRec *)(r->data + 1) :  &r->extents;
 }
 
+inline static void
+region_get_boxes(const RegionRec *r, const BoxRec **s, const BoxRec **e)
+{
+	int n;
+	if (r->data)
+		*s = region_boxptr(r), n = r->data->numRects;
+	else
+		*s = &r->extents, n = 1;
+	*e = *s + n;
+}
+
 #ifndef INCLUDE_LEGACY_REGION_DEFINES
 #define RegionCreate(r, s) REGION_CREATE(NULL, r, s)
 #define RegionBreak(r) REGION_BREAK(NULL, r)
@@ -223,4 +240,10 @@ static inline void FreePixmap(PixmapPtr pixmap)
 			  dstx, dsty)
 #endif
 
+#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,12,99,901,0)
+#define isGPU(S) (S)->is_gpu
+#else
+#define isGPU(S) 0
+#endif
+
 #endif
diff --git a/src/i915_pciids.h b/src/i915_pciids.h
index 180ad0e..f2e47fd 100644
--- a/src/i915_pciids.h
+++ b/src/i915_pciids.h
@@ -214,9 +214,9 @@
 	INTEL_VGA_DEVICE((((gt) - 1) << 4) | (id), info)
 
 #define _INTEL_BDW_M_IDS(gt, info) \
-	_INTEL_BDW_M(gt, 0x1602, info), /* ULT */ \
+	_INTEL_BDW_M(gt, 0x1602, info), /* Halo */ \
 	_INTEL_BDW_M(gt, 0x1606, info), /* ULT */ \
-	_INTEL_BDW_M(gt, 0x160B, info), /* Iris */ \
+	_INTEL_BDW_M(gt, 0x160B, info), /* ULT */ \
 	_INTEL_BDW_M(gt, 0x160E, info) /* ULX */
 
 #define _INTEL_BDW_D_IDS(gt, info) \
@@ -259,21 +259,31 @@
 	INTEL_VGA_DEVICE(0x22b2, info), \
 	INTEL_VGA_DEVICE(0x22b3, info)
 
-#define INTEL_SKL_IDS(info) \
-	INTEL_VGA_DEVICE(0x1916, info), /* ULT GT2 */ \
+#define INTEL_SKL_GT1_IDS(info)	\
 	INTEL_VGA_DEVICE(0x1906, info), /* ULT GT1 */ \
-	INTEL_VGA_DEVICE(0x1926, info), /* ULT GT3 */ \
-	INTEL_VGA_DEVICE(0x1921, info), /* ULT GT2F */ \
 	INTEL_VGA_DEVICE(0x190E, info), /* ULX GT1 */ \
+	INTEL_VGA_DEVICE(0x1902, info), /* DT  GT1 */ \
+	INTEL_VGA_DEVICE(0x190B, info), /* Halo GT1 */ \
+	INTEL_VGA_DEVICE(0x190A, info) /* SRV GT1 */
+
+#define INTEL_SKL_GT2_IDS(info)	\
+	INTEL_VGA_DEVICE(0x1916, info), /* ULT GT2 */ \
+	INTEL_VGA_DEVICE(0x1921, info), /* ULT GT2F */ \
 	INTEL_VGA_DEVICE(0x191E, info), /* ULX GT2 */ \
 	INTEL_VGA_DEVICE(0x1912, info), /* DT  GT2 */ \
-	INTEL_VGA_DEVICE(0x1902, info), /* DT  GT1 */ \
 	INTEL_VGA_DEVICE(0x191B, info), /* Halo GT2 */ \
-	INTEL_VGA_DEVICE(0x192B, info), /* Halo GT3 */ \
-	INTEL_VGA_DEVICE(0x190B, info), /* Halo GT1 */ \
 	INTEL_VGA_DEVICE(0x191A, info), /* SRV GT2 */ \
-	INTEL_VGA_DEVICE(0x192A, info), /* SRV GT3 */ \
-	INTEL_VGA_DEVICE(0x190A, info), /* SRV GT1 */ \
 	INTEL_VGA_DEVICE(0x191D, info)  /* WKS GT2 */
 
+#define INTEL_SKL_GT3_IDS(info) \
+	INTEL_VGA_DEVICE(0x1926, info), /* ULT GT3 */ \
+	INTEL_VGA_DEVICE(0x192B, info), /* Halo GT3 */ \
+	INTEL_VGA_DEVICE(0x192A, info) /* SRV GT3 */
+
+#define INTEL_SKL_IDS(info) \
+	INTEL_SKL_GT1_IDS(info), \
+	INTEL_SKL_GT2_IDS(info), \
+	INTEL_SKL_GT3_IDS(info)
+
+
 #endif /* _I915_PCIIDS_H */
diff --git a/src/intel_device.c b/src/intel_device.c
index 140e153..76b0831 100644
--- a/src/intel_device.c
+++ b/src/intel_device.c
@@ -461,9 +461,9 @@ static int is_render_node(int fd, struct stat *st)
 
 static char *find_render_node(int fd)
 {
-#if defined(USE_RENDERNODE)
 	struct stat master, render;
 	char buf[128];
+	int i;
 
 	/* Are we a render-node ourselves? */
 	if (is_render_node(fd, &master))
@@ -472,9 +472,17 @@ static char *find_render_node(int fd)
 	sprintf(buf, "/dev/dri/renderD%d", (int)((master.st_rdev | 0x80) & 0xbf));
 	if (stat(buf, &render) == 0 &&
 	    master.st_mode == render.st_mode &&
-	    render.st_rdev == ((master.st_rdev | 0x80) & 0xbf))
+	    render.st_rdev == (master.st_rdev | 0x80))
 		return strdup(buf);
-#endif
+
+	/* Misaligned card <-> renderD, do a full search */
+	for (i = 0; i < 16; i++) {
+		sprintf(buf, "/dev/dri/renderD%d", i + 128);
+		if (stat(buf, &render) == 0 &&
+		    master.st_mode == render.st_mode &&
+		    render.st_rdev == (master.st_rdev | 0x80))
+			return strdup(buf);
+	}
 
 	return NULL;
 }
@@ -672,6 +680,12 @@ struct intel_device *intel_get_device(ScrnInfoPtr scrn, int *fd)
 	return dev;
 }
 
+const char *intel_get_master_name(struct intel_device *dev)
+{
+	assert(dev && dev->master_node);
+	return dev->master_node;
+}
+
 const char *intel_get_client_name(struct intel_device *dev)
 {
 	assert(dev && dev->render_node);
diff --git a/src/intel_driver.h b/src/intel_driver.h
index 28ed1a0..fc9beaf 100644
--- a/src/intel_driver.h
+++ b/src/intel_driver.h
@@ -127,6 +127,7 @@ int intel_open_device(int entity_num,
 int __intel_peek_fd(ScrnInfoPtr scrn);
 struct intel_device *intel_get_device(ScrnInfoPtr scrn, int *fd);
 int intel_has_render_node(struct intel_device *dev);
+const char *intel_get_master_name(struct intel_device *dev);
 const char *intel_get_client_name(struct intel_device *dev);
 int intel_get_client_fd(struct intel_device *dev);
 int intel_get_device_id(struct intel_device *dev);
diff --git a/src/intel_module.c b/src/intel_module.c
index 102d52a..bb74422 100644
--- a/src/intel_module.c
+++ b/src/intel_module.c
@@ -582,10 +582,17 @@ intel_scrn_create(DriverPtr		driver,
 	case NOACCEL:
 #endif
 	case UXA:
-		  return intel_init_scrn(scrn);
+		return intel_init_scrn(scrn);
 #endif
 
-	default: break;
+	default:
+#if USE_SNA
+		return sna_init_scrn(scrn, entity_num);
+#elif USE_UXA
+		return intel_init_scrn(scrn);
+#else
+		break;
+#endif
 	}
 #endif
 
diff --git a/src/intel_options.c b/src/intel_options.c
index ff8541a..034b591 100644
--- a/src/intel_options.c
+++ b/src/intel_options.c
@@ -2,6 +2,10 @@
 #include "config.h"
 #endif
 
+#include <xorg-server.h>
+#include <xorgVersion.h>
+#include <xf86Parser.h>
+
 #include "intel_options.h"
 
 const OptionInfoRec intel_options[] = {
@@ -54,3 +58,85 @@ OptionInfoPtr intel_options_get(ScrnInfoPtr scrn)
 
 	return options;
 }
+
+Bool intel_option_cast_to_bool(OptionInfoPtr options, int id, Bool val)
+{
+#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,7,99,901,0)
+	xf86getBoolValue(&val, xf86GetOptValString(options, id));
+#endif
+	return val;
+}
+
+/* Compare option strings ignoring case and any '_', ' ' or '\t'; 0 if equal. */
+static int
+namecmp(const char *s1, const char *s2)
+{
+	int c1, c2;
+
+	if (!s1 || *s1 == 0)
+		return !(!s2 || *s2 == 0);
+	if (!s2)	/* a non-empty s1 never matches a missing s2 */
+		return 1;
+
+	while (*s1 == '_' || *s1 == ' ' || *s1 == '\t')
+		s1++;
+
+	while (*s2 == '_' || *s2 == ' ' || *s2 == '\t')
+		s2++;
+
+	/* <ctype.h> needs unsigned char values; plain char may be negative */
+	c1 = tolower((unsigned char)*s1);
+	c2 = tolower((unsigned char)*s2);
+	while (c1 == c2) {
+		if (c1 == '\0')
+			return 0;
+
+		s1++;
+		while (*s1 == '_' || *s1 == ' ' || *s1 == '\t')
+			s1++;
+
+		s2++;
+		while (*s2 == '_' || *s2 == ' ' || *s2 == '\t')
+			s2++;
+
+		c1 = tolower((unsigned char)*s1);
+		c2 = tolower((unsigned char)*s2);
+	}
+
+	return c1 - c2;
+}
+
+unsigned intel_option_cast_to_unsigned(OptionInfoPtr options, int id, unsigned val)
+{
+#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,7,99,901,0)
+	const char *str = xf86GetOptValString(options, id);
+#else
+	const char *str = NULL;
+#endif
+	unsigned v;
+
+	if (str == NULL || *str == '\0')
+		return val;
+
+	if (namecmp(str, "on") == 0)
+		return val;
+	if (namecmp(str, "true") == 0)
+		return val;
+	if (namecmp(str, "yes") == 0)
+		return val;
+
+	if (namecmp(str, "0") == 0)
+		return 0;
+	if (namecmp(str, "off") == 0)
+		return 0;
+	if (namecmp(str, "false") == 0)
+		return 0;
+	if (namecmp(str, "no") == 0)
+		return 0;
+
+	v = atoi(str);
+	if (v)
+		return v;
+
+	return val;
+}
diff --git a/src/intel_options.h b/src/intel_options.h
index 7e2cbd9..56ba279 100644
--- a/src/intel_options.h
+++ b/src/intel_options.h
@@ -51,5 +51,7 @@ enum intel_options {
 
 extern const OptionInfoRec intel_options[];
 OptionInfoPtr intel_options_get(ScrnInfoPtr scrn);
+unsigned intel_option_cast_to_unsigned(OptionInfoPtr, int id, unsigned val);
+Bool intel_option_cast_to_bool(OptionInfoPtr, int id, Bool val);
 
 #endif /* INTEL_OPTIONS_H */
diff --git a/src/legacy/i810/i810_memory.c b/src/legacy/i810/i810_memory.c
index c3de277..6f27483 100644
--- a/src/legacy/i810/i810_memory.c
+++ b/src/legacy/i810/i810_memory.c
@@ -76,7 +76,7 @@ I810AllocateGARTMemory(ScrnInfoPtr pScrn)
    unsigned long size = pScrn->videoRam * 1024UL;
    I810Ptr pI810 = I810PTR(pScrn);
    int key;
-   long tom = 0;
+   unsigned long tom = 0;
    unsigned long physical;
 
    if (!xf86AgpGARTSupported() || !xf86AcquireGART(pScrn->scrnIndex)) {
@@ -132,8 +132,8 @@ I810AllocateGARTMemory(ScrnInfoPtr pScrn)
     * Keep it 512K aligned for the sake of tiled regions.
     */
 
-   tom += 0x7ffff;
-   tom &= ~0x7ffff;
+   tom += 0x7ffffUL;
+   tom &= ~0x7ffffUL;
 
    if ((key = xf86AllocateGARTMemory(pScrn->scrnIndex, size, 1, NULL)) != -1) {
       pI810->DcacheOffset = tom;
diff --git a/src/sna/blt.c b/src/sna/blt.c
index b5bfee6..9df7b2b 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -30,6 +30,7 @@
 #endif
 
 #include "sna.h"
+#include <pixman.h>
 
 #if __x86_64__
 #define USE_SSE2 1
@@ -745,8 +746,130 @@ memcpy_from_tiled_x__swizzle_9_11(const void *src, void *dst, int bpp,
 	}
 }
 
+static fast_memcpy void
+memcpy_to_tiled_x__gen2(const void *src, void *dst, int bpp,
+			int32_t src_stride, int32_t dst_stride,
+			int16_t src_x, int16_t src_y,
+			int16_t dst_x, int16_t dst_y,
+			uint16_t width, uint16_t height)
+{
+	const unsigned tile_width = 128;
+	const unsigned tile_height = 16;
+	const unsigned tile_size = 2048;
+
+	const unsigned cpp = bpp / 8;
+	const unsigned tile_pixels = tile_width / cpp;
+	const unsigned tile_shift = ffs(tile_pixels) - 1;
+	const unsigned tile_mask = tile_pixels - 1;
+
+	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+	assert(src != dst);
+
+	if (src_x | src_y)
+		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+	assert(src_stride >= width * cpp);
+	src_stride -= width * cpp;
+
+	while (height--) {
+		unsigned w = width * cpp;
+		uint8_t *tile_row = dst;
+
+		tile_row += dst_y / tile_height * dst_stride * tile_height;
+		tile_row += (dst_y & (tile_height-1)) * tile_width;
+		if (dst_x) {
+			tile_row += (dst_x >> tile_shift) * tile_size;
+			if (dst_x & tile_mask) {
+				const unsigned x = (dst_x & tile_mask) * cpp;
+				const unsigned len = min(tile_width - x, w);
+				memcpy(tile_row + x, src, len);
+
+				tile_row += tile_size;
+				src = (const uint8_t *)src + len;
+				w -= len;
+			}
+		}
+		while (w >= tile_width) {
+			memcpy(tile_row, src, tile_width);
+
+			tile_row += tile_size;
+			src = (const uint8_t *)src + tile_width;
+			w -= tile_width;
+		}
+		memcpy(tile_row, src, w);
+		src = (const uint8_t *)src + src_stride + w;
+		dst_y++;
+	}
+}
+
+static fast_memcpy void
+memcpy_from_tiled_x__gen2(const void *src, void *dst, int bpp,
+			  int32_t src_stride, int32_t dst_stride,
+			  int16_t src_x, int16_t src_y,
+			  int16_t dst_x, int16_t dst_y,
+			  uint16_t width, uint16_t height)
+{
+	const unsigned tile_width = 128;
+	const unsigned tile_height = 16;
+	const unsigned tile_size = 2048;
+
+	const unsigned cpp = bpp / 8;
+	const unsigned tile_pixels = tile_width / cpp;
+	const unsigned tile_shift = ffs(tile_pixels) - 1;
+	const unsigned tile_mask = tile_pixels - 1;
+
+	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+	assert(src != dst);
+
+	if (dst_x | dst_y)
+		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
+	assert(dst_stride >= width * cpp);
+	dst_stride -= width * cpp;
+
+	while (height--) {
+		unsigned w = width * cpp;
+		const uint8_t *tile_row = src;
+
+		tile_row += src_y / tile_height * src_stride * tile_height;
+		tile_row += (src_y & (tile_height-1)) * tile_width;
+		if (src_x) {
+			tile_row += (src_x >> tile_shift) * tile_size;
+			if (src_x & tile_mask) {
+				const unsigned x = (src_x & tile_mask) * cpp;
+				const unsigned len = min(tile_width - x, w);
+				memcpy(dst, tile_row + x, len);
+
+				tile_row += tile_size;
+				dst = (uint8_t *)dst + len;
+				w -= len;
+			}
+		}
+		while (w >= tile_width) {
+			memcpy(dst, tile_row, tile_width);
+
+			tile_row += tile_size;
+			dst = (uint8_t *)dst + tile_width;
+			w -= tile_width;
+		}
+		memcpy(dst, tile_row, w);
+		dst = (uint8_t *)dst + dst_stride + w;
+		src_y++;
+	}
+}
+
 void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling)
 {
+	if (kgem->gen < 030) {
+		if (swizzling == I915_BIT_6_SWIZZLE_NONE) {
+			DBG(("%s: gen2, no swizzling\n", __FUNCTION__));
+			kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__gen2;
+			kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__gen2;
+		} else
+			DBG(("%s: no detiling with swizzle functions for gen2\n", __FUNCTION__));
+		return;
+	}
+
 	switch (swizzling) {
 	default:
 		DBG(("%s: unknown swizzling, %d\n", __FUNCTION__, swizzling));
@@ -1118,3 +1241,241 @@ memcpy_xor(const void *src, void *dst, int bpp,
 		}
 	}
 }
+
+#define BILINEAR_INTERPOLATION_BITS 4
+static force_inline int
+bilinear_weight(pixman_fixed_t x)
+{
+	return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
+		((1 << BILINEAR_INTERPOLATION_BITS) - 1);
+}
+
+#if BILINEAR_INTERPOLATION_BITS <= 4
+/* Inspired by Filter_32_opaque from Skia */
+static force_inline uint32_t
+bilinear_interpolation(uint32_t tl, uint32_t tr,
+		       uint32_t bl, uint32_t br,
+		       int distx, int disty)
+{
+	int distxy, distxiy, distixy, distixiy;
+	uint32_t lo, hi;
+
+	distx <<= (4 - BILINEAR_INTERPOLATION_BITS);
+	disty <<= (4 - BILINEAR_INTERPOLATION_BITS);
+
+	distxy = distx * disty;
+	distxiy = (distx << 4) - distxy;	/* distx * (16 - disty) */
+	distixy = (disty << 4) - distxy;	/* disty * (16 - distx) */
+	distixiy =
+		16 * 16 - (disty << 4) -
+		(distx << 4) + distxy; /* (16 - distx) * (16 - disty) */
+
+	lo = (tl & 0xff00ff) * distixiy;
+	hi = ((tl >> 8) & 0xff00ff) * distixiy;
+
+	lo += (tr & 0xff00ff) * distxiy;
+	hi += ((tr >> 8) & 0xff00ff) * distxiy;
+
+	lo += (bl & 0xff00ff) * distixy;
+	hi += ((bl >> 8) & 0xff00ff) * distixy;
+
+	lo += (br & 0xff00ff) * distxy;
+	hi += ((br >> 8) & 0xff00ff) * distxy;
+
+	return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff);
+}
+#elif SIZEOF_LONG > 4
+static force_inline uint32_t
+bilinear_interpolation(uint32_t tl, uint32_t tr,
+		       uint32_t bl, uint32_t br,
+		       int distx, int disty)
+{
+	uint64_t distxy, distxiy, distixy, distixiy;
+	uint64_t tl64, tr64, bl64, br64;
+	uint64_t f, r;
+
+	distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
+	disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
+
+	distxy = distx * disty;
+	distxiy = distx * (256 - disty);
+	distixy = (256 - distx) * disty;
+	distixiy = (256 - distx) * (256 - disty);
+
+	/* Alpha and Blue */
+	tl64 = tl & 0xff0000ff;
+	tr64 = tr & 0xff0000ff;
+	bl64 = bl & 0xff0000ff;
+	br64 = br & 0xff0000ff;
+
+	f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+	r = f & 0x0000ff0000ff0000ull;
+
+	/* Red and Green */
+	tl64 = tl;
+	tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
+
+	tr64 = tr;
+	tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
+
+	bl64 = bl;
+	bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
+
+	br64 = br;
+	br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
+
+	f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+	r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
+
+	return (uint32_t)(r >> 16);
+}
+#else
+static force_inline uint32_t
+bilinear_interpolation(uint32_t tl, uint32_t tr,
+		       uint32_t bl, uint32_t br,
+		       int distx, int disty)
+{
+	int distxy, distxiy, distixy, distixiy;
+	uint32_t f, r;
+
+	distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
+	disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
+
+	distxy = distx * disty;
+	distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
+	distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
+	distixiy =
+		256 * 256 - (disty << 8) -
+		(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
+
+	/* Blue */
+	r = ((tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy +
+	     (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy);
+
+	/* Green */
+	f = ((tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy +
+	     (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy);
+	r |= f & 0xff000000;
+
+	tl >>= 16;
+	tr >>= 16;
+	bl >>= 16;
+	br >>= 16;
+	r >>= 16;
+
+	/* Red */
+	f = ((tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy +
+	     (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy);
+	r |= f & 0x00ff0000;
+
+	/* Alpha */
+	f = ((tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy +
+	     (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy);
+	r |= f & 0xff000000;
+
+	return r;
+}
+#endif
+
+static inline uint32_t convert_pixel(const uint8_t *p, int x)
+{
+	return ((uint32_t *)p)[x];
+}
+
+fast void
+affine_blt(const void *src, void *dst, int bpp,
+	   int16_t src_x, int16_t src_y,
+	   int16_t src_width, int16_t src_height,
+	   int32_t src_stride,
+	   int16_t dst_x, int16_t dst_y,
+	   uint16_t dst_width, uint16_t dst_height,
+	   int32_t dst_stride,
+	   const struct pixman_f_transform *t)
+{
+	static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+	const pixman_fixed_t ux = pixman_double_to_fixed(t->m[0][0]);
+	const pixman_fixed_t uy = pixman_double_to_fixed(t->m[1][0]);
+	int i, j;
+
+	assert(bpp == 32);
+
+	for (j = 0; j < dst_height; j++) {
+		pixman_fixed_t x, y;
+		struct pixman_f_vector v;
+		uint32_t *b;
+
+		/* reference point is the center of the pixel */
+		v.v[0] = dst_x + 0.5;
+		v.v[1] = dst_y + j + 0.5;
+		v.v[2] = 1.0;
+
+		pixman_f_transform_point_3d(t, &v);
+
+		x = pixman_double_to_fixed(v.v[0]);
+		x += pixman_int_to_fixed(src_x - dst_x);
+		y = pixman_double_to_fixed(v.v[1]);
+		y +=  pixman_int_to_fixed(src_y - dst_y);
+
+		b = (uint32_t*)((uint8_t *)dst + (dst_y + j) * dst_stride + dst_x * bpp / 8);
+		for (i = 0; i < dst_width; i++) {
+			const uint8_t *row1;
+			const uint8_t *row2;
+			int x1, y1, x2, y2;
+			uint32_t tl, tr, bl, br;
+			int32_t fx, fy;
+
+			x1 = x - pixman_fixed_1/2;
+			y1 = y - pixman_fixed_1/2;
+
+			fx = bilinear_weight(x1);
+			fy = bilinear_weight(y1);
+
+			x1 = pixman_fixed_to_int(x1);
+			x2 = x1 + 1;
+			y1 = pixman_fixed_to_int(y1);
+			y2 = y1 + 1;
+
+			if (x1 >= src_width  || x2 < 0 ||
+			    y1 >= src_height || y2 < 0) {
+				b[i] = 0;
+				goto next;
+			}
+
+			if (y2 == 0) {
+				row1 = zero;
+			} else {
+				row1 = (uint8_t *)src + src_stride * y1;
+				row1 += bpp / 8 * x1;
+			}
+
+			if (y1 == src_height - 1) {
+				row2 = zero;
+			} else {
+				row2 = (uint8_t *)src + src_stride * y2;
+				row2 += bpp / 8 * x1;
+			}
+
+			if (x2 == 0) {
+				tl = 0;
+				bl = 0;
+			} else {
+				tl = convert_pixel(row1, 0);
+				bl = convert_pixel(row2, 0);
+			}
+
+			if (x1 == src_width - 1) {
+				tr = 0;
+				br = 0;
+			} else {
+				tr = convert_pixel(row1, 1);
+				br = convert_pixel(row2, 1);
+			}
+
+			b[i] = bilinear_interpolation(tl, tr, bl, br, fx, fy);
+
+next:
+			x += ux;
+			y += uy;
+		}
+	}
+}
diff --git a/src/sna/compiler.h b/src/sna/compiler.h
index ff41217..44d17db 100644
--- a/src/sna/compiler.h
+++ b/src/sna/compiler.h
@@ -39,6 +39,7 @@
 #define pure __attribute__((pure))
 #define tightly_packed __attribute__((__packed__))
 #define flatten __attribute__((flatten))
+#define nonnull __attribute__((nonnull))
 #define page_aligned __attribute__((aligned(4096)))
 #else
 #define likely(expr) (expr)
@@ -51,6 +52,7 @@
 #define pure
 #define tighly_packed
 #define flatten
+#define nonnull
 #define page_aligned
 #endif
 
diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 1104f46..12b741c 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -1572,12 +1572,12 @@ gen2_composite_picture(struct sna *sna,
 		if (channel->repeat &&
 		    (x >= 0 &&
 		     y >= 0 &&
-		     x + w < pixmap->drawable.width &&
-		     y + h < pixmap->drawable.height)) {
+		     x + w <= pixmap->drawable.width &&
+		     y + h <= pixmap->drawable.height)) {
 			struct sna_pixmap *priv = sna_pixmap(pixmap);
 			if (priv && priv->clear) {
 				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
-				return gen2_composite_solid_init(sna, channel, priv->clear_color);
+				return gen2_composite_solid_init(sna, channel, solid_color(picture->format, priv->clear_color));
 			}
 		}
 	} else
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 78289f0..c3134f4 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2251,7 +2251,7 @@ static int gen3_vertex_finish(struct sna *sna)
 		if (sna->render.vertex_reloc[0]) {
 			sna->kgem.batch[sna->render.vertex_reloc[0]] =
 				kgem_add_reloc(&sna->kgem, sna->render.vertex_reloc[0],
-					       bo, I915_GEM_DOMAIN_VERTEX << 16, 0);
+					       bo, I915_GEM_DOMAIN_VERTEX << 16 | KGEM_RELOC_FENCED, 0);
 
 			sna->render.vertex_reloc[0] = 0;
 		}
@@ -2345,7 +2345,7 @@ static void gen3_vertex_close(struct sna *sna)
 	DBG(("%s: reloc = %d\n", __FUNCTION__, sna->render.vertex_reloc[0]));
 	sna->kgem.batch[sna->render.vertex_reloc[0]] =
 		kgem_add_reloc(&sna->kgem, sna->render.vertex_reloc[0],
-			       bo, I915_GEM_DOMAIN_VERTEX << 16, delta);
+			       bo, I915_GEM_DOMAIN_VERTEX << 16 | KGEM_RELOC_FENCED, delta);
 	sna->render.vertex_reloc[0] = 0;
 
 	if (sna->render.vbo == NULL) {
@@ -3065,7 +3065,7 @@ gen3_composite_picture(struct sna *sna,
 
 	if (sna_picture_is_clear(picture, x, y, w, h, &color)) {
 		DBG(("%s: clear drawable [%08x]\n", __FUNCTION__, color));
-		return gen3_init_solid(channel, color_convert(color, picture->format, PICT_a8r8g8b8));
+		return gen3_init_solid(channel, solid_color(picture->format, color));
 	}
 
 	if (!gen3_check_repeat(picture))
@@ -3097,12 +3097,12 @@ gen3_composite_picture(struct sna *sna,
 		if (channel->repeat ||
 		    (x >= 0 &&
 		     y >= 0 &&
-		     x + w < pixmap->drawable.width &&
-		     y + h < pixmap->drawable.height)) {
+		     x + w <= pixmap->drawable.width &&
+		     y + h <= pixmap->drawable.height)) {
 			struct sna_pixmap *priv = sna_pixmap(pixmap);
 			if (priv && priv->clear) {
 				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
-				return gen3_init_solid(channel, priv->clear_color);
+				return gen3_init_solid(channel, solid_color(picture->format, priv->clear_color));
 			}
 		}
 	} else {
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 6c2d380..89cf07a 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -1585,12 +1585,14 @@ gen4_composite_picture(struct sna *sna,
 		if (channel->repeat &&
 		    (x >= 0 &&
 		     y >= 0 &&
-		     x + w < pixmap->drawable.width &&
-		     y + h < pixmap->drawable.height)) {
+		     x + w <= pixmap->drawable.width &&
+		     y + h <= pixmap->drawable.height)) {
 			struct sna_pixmap *priv = sna_pixmap(pixmap);
 			if (priv && priv->clear) {
 				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
-				return gen4_channel_init_solid(sna, channel, priv->clear_color);
+				return gen4_channel_init_solid(sna, channel,
+							       solid_color(picture->format,
+									   priv->clear_color));
 			}
 		}
 	} else
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 37cf1ff..0f9f673 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -1524,12 +1524,12 @@ gen5_composite_picture(struct sna *sna,
 		if (channel->repeat ||
 		    (x >= 0 &&
 		     y >= 0 &&
-		     x + w < pixmap->drawable.width &&
-		     y + h < pixmap->drawable.height)) {
+		     x + w <= pixmap->drawable.width &&
+		     y + h <= pixmap->drawable.height)) {
 			struct sna_pixmap *priv = sna_pixmap(pixmap);
 			if (priv && priv->clear) {
 				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
-				return gen4_channel_init_solid(sna, channel, priv->clear_color);
+				return gen4_channel_init_solid(sna, channel, solid_color(picture->format, priv->clear_color));
 			}
 		}
 	} else
diff --git a/src/sna/gen6_common.h b/src/sna/gen6_common.h
index 6668620..409bab3 100644
--- a/src/sna/gen6_common.h
+++ b/src/sna/gen6_common.h
@@ -30,8 +30,8 @@
 
 #include "sna.h"
 
-#define NO_RING_SWITCH 0
-#define PREFER_RENDER 0
+#define NO_RING_SWITCH(sna) (!(sna)->kgem.has_semaphores)
+#define PREFER_RENDER 0 /* -1 -> BLT, 1 -> RENDER */
 
 static inline bool is_uncached(struct sna *sna,
 			       struct kgem_bo *bo)
@@ -46,40 +46,22 @@ inline static bool can_switch_to_blt(struct sna *sna,
 	if (sna->kgem.ring != KGEM_RENDER)
 		return true;
 
-	if (NO_RING_SWITCH)
+	if (NO_RING_SWITCH(sna))
 		return false;
 
-	if (!sna->kgem.has_semaphores)
-		return false;
-
-	if (flags & COPY_LAST)
-		return true;
-
 	if (bo && RQ_IS_BLT(bo->rq))
 		return true;
 
 	if (sna->render_state.gt < 2)
 		return true;
 
-	return kgem_ring_is_idle(&sna->kgem, KGEM_BLT);
-}
-
-inline static bool can_switch_to_render(struct sna *sna,
-					struct kgem_bo *bo)
-{
-	if (sna->kgem.ring == KGEM_RENDER)
-		return true;
-
-	if (NO_RING_SWITCH)
+	if (bo && RQ_IS_RENDER(bo->rq))
 		return false;
 
-	if (!sna->kgem.has_semaphores)
-		return false;
-
-	if (bo && !RQ_IS_BLT(bo->rq) && !is_uncached(sna, bo))
+	if (flags & COPY_LAST)
 		return true;
 
-	return !kgem_ring_is_idle(&sna->kgem, KGEM_RENDER);
+	return kgem_ring_is_idle(&sna->kgem, KGEM_BLT);
 }
 
 static inline bool untiled_tlb_miss(struct kgem_bo *bo)
@@ -90,57 +72,89 @@ static inline bool untiled_tlb_miss(struct kgem_bo *bo)
 	return bo->tiling == I915_TILING_NONE && bo->pitch >= 4096;
 }
 
-static int prefer_blt_bo(struct sna *sna, struct kgem_bo *bo)
+static int prefer_blt_bo(struct sna *sna,
+			 struct kgem_bo *src, /* may be NULL when there is no source bo */
+			 struct kgem_bo *dst)
 {
+	assert(dst != NULL);
+	/* Non-zero result means "prefer the BLT"; the checks are ordered and the first match decides. */
 	if (PREFER_RENDER)
 		return PREFER_RENDER < 0;
 
-	if (bo->rq)
-		return RQ_IS_BLT(bo->rq);
+	if (dst->rq) /* dst is already tagged with a ring: follow it */
+		return RQ_IS_BLT(dst->rq);
 
 	if (sna->flags & SNA_POWERSAVE)
 		return true;
 
-	return bo->tiling == I915_TILING_NONE || is_uncached(sna, bo);
+	if (src) {
+		if (sna->render_state.gt > 1) /* with a source, gt above 1 never picks the BLT here */
+			return false;
+
+		if (src->rq) /* otherwise follow the ring src is tagged with */
+			return RQ_IS_BLT(src->rq);
+        } else {
+                if (sna->render_state.gt > 2) /* without a source the cutoff is gt above 2 */
+                        return false;
+        }
+
+	if (sna->render_state.gt < 2)
+		return true;
+
+	return dst->tiling == I915_TILING_NONE || is_uncached(sna, dst); /* as written, only reachable when src is NULL and gt == 2 */
 }
 
 inline static bool force_blt_ring(struct sna *sna)
 {
-	if (sna->flags & SNA_POWERSAVE)
-		return true;
-
 	if (sna->kgem.mode == KGEM_RENDER)
 		return false;
 
+	if (NO_RING_SWITCH(sna))
+		return sna->kgem.ring == KGEM_BLT;
+
+	if (sna->flags & SNA_POWERSAVE)
+		return true;
+
 	if (sna->render_state.gt < 2)
 		return true;
 
 	return false;
 }
 
-inline static bool prefer_blt_ring(struct sna *sna,
-				   struct kgem_bo *bo,
-				   unsigned flags)
+nonnull inline static bool
+prefer_blt_ring(struct sna *sna, struct kgem_bo *bo, unsigned flags)
 {
 	if (PREFER_RENDER)
 		return PREFER_RENDER < 0;
 
 	assert(!force_blt_ring(sna));
-	assert(!kgem_bo_is_render(bo));
+	assert(!kgem_bo_is_render(bo) || NO_RING_SWITCH(sna));
+
+	if (kgem_bo_is_blt(bo))
+		return true;
 
 	return can_switch_to_blt(sna, bo, flags);
 }
 
-inline static bool prefer_render_ring(struct sna *sna,
-				      struct kgem_bo *bo)
+nonnull inline static bool
+prefer_render_ring(struct sna *sna, struct kgem_bo *bo)
 {
+	if (sna->kgem.ring == KGEM_RENDER)
+		return true;
+
+	if (sna->kgem.ring != KGEM_NONE && NO_RING_SWITCH(sna))
+                return false;
+
+	if (kgem_bo_is_render(bo))
+		return true;
+
 	if (sna->flags & SNA_POWERSAVE)
 		return false;
 
-	if (sna->render_state.gt < 2)
-		return false;
+	if (!prefer_blt_bo(sna, NULL, bo))
+		return true;
 
-	return can_switch_to_render(sna, bo);
+	return !kgem_ring_is_idle(&sna->kgem, KGEM_RENDER);
 }
 
 inline static bool
@@ -156,22 +170,17 @@ prefer_blt_composite(struct sna *sna, struct sna_composite_op *tmp)
 	if (force_blt_ring(sna))
 		return true;
 
-	if (kgem_bo_is_render(tmp->dst.bo) ||
-	    kgem_bo_is_render(tmp->src.bo))
-		return false;
-
 	if (prefer_render_ring(sna, tmp->dst.bo))
 		return false;
 
 	if (!prefer_blt_ring(sna, tmp->dst.bo, 0))
 		return false;
 
-	return prefer_blt_bo(sna, tmp->dst.bo) || prefer_blt_bo(sna, tmp->src.bo);
+	return prefer_blt_bo(sna, tmp->src.bo, tmp->dst.bo);
 }
 
-static inline bool prefer_blt_fill(struct sna *sna,
-				   struct kgem_bo *bo,
-				   unsigned flags)
+nonnull static inline bool
+prefer_blt_fill(struct sna *sna, struct kgem_bo *bo, unsigned flags)
 {
 	if (PREFER_RENDER)
 		return PREFER_RENDER < 0;
@@ -183,9 +192,6 @@ static inline bool prefer_blt_fill(struct sna *sna,
 		return true;
 
 	if ((flags & (FILL_POINTS | FILL_SPANS)) == 0) {
-		if (kgem_bo_is_render(bo))
-			return false;
-
 		if (prefer_render_ring(sna, bo))
 			return false;
 
@@ -196,7 +202,7 @@ static inline bool prefer_blt_fill(struct sna *sna,
 		    return true;
 	}
 
-	return prefer_blt_bo(sna, bo);
+	return prefer_blt_bo(sna, NULL, bo);
 }
 
 void gen6_render_context_switch(struct kgem *kgem, int new_mode);
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 2504468..2afe5ee 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -1815,12 +1815,12 @@ gen6_composite_picture(struct sna *sna,
 		if (channel->repeat &&
 		    (x >= 0 &&
 		     y >= 0 &&
-		     x + w < pixmap->drawable.width &&
-		     y + h < pixmap->drawable.height)) {
+		     x + w <= pixmap->drawable.width &&
+		     y + h <= pixmap->drawable.height)) {
 			struct sna_pixmap *priv = sna_pixmap(pixmap);
 			if (priv && priv->clear) {
 				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
-				return gen4_channel_init_solid(sna, channel, priv->clear_color);
+				return gen4_channel_init_solid(sna, channel, solid_color(picture->format, priv->clear_color));
 			}
 		}
 	} else
@@ -1984,6 +1984,10 @@ try_blt(struct sna *sna,
 	bo = __sna_drawable_peek_bo(dst->pDrawable);
 	if (bo == NULL)
 		return true;
+
+	if (untiled_tlb_miss(bo))
+		return true;
+
 	if (bo->rq)
 		return RQ_IS_BLT(bo->rq);
 
@@ -1991,11 +1995,11 @@ try_blt(struct sna *sna,
 		return true;
 
 	if (src->pDrawable) {
-		bo = __sna_drawable_peek_bo(src->pDrawable);
-		if (bo == NULL)
+		struct kgem_bo *s = __sna_drawable_peek_bo(src->pDrawable);
+		if (s == NULL)
 			return true;
 
-		if (prefer_blt_bo(sna, bo))
+		if (prefer_blt_bo(sna, s, bo))
 			return true;
 	}
 
@@ -2690,13 +2694,17 @@ static inline bool prefer_blt_copy(struct sna *sna,
 	    kgem_bo_is_render(src_bo))
 		return false;
 
+	if (flags & COPY_LAST &&
+            can_switch_to_blt(sna, dst_bo, flags))
+		return true;
+
 	if (prefer_render_ring(sna, dst_bo))
 		return false;
 
 	if (!prefer_blt_ring(sna, dst_bo, flags))
 		return false;
 
-	return prefer_blt_bo(sna, src_bo) || prefer_blt_bo(sna, dst_bo);
+	return prefer_blt_bo(sna, src_bo, dst_bo);
 }
 
 static bool
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 2ecfd64..e8e14b5 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -60,8 +60,6 @@
 #define NO_FILL_ONE 0
 #define NO_FILL_CLEAR 0
 
-#define NO_RING_SWITCH 0
-
 #define USE_8_PIXEL_DISPATCH 1
 #define USE_16_PIXEL_DISPATCH 1
 #define USE_32_PIXEL_DISPATCH 0
@@ -149,7 +147,7 @@ static const struct gt_info hsw_gt1_info = {
 	.max_vs_threads = 70,
 	.max_gs_threads = 70,
 	.max_wm_threads =
-		(102 - 1) << HSW_PS_MAX_THREADS_SHIFT |
+		(70 - 1) << HSW_PS_MAX_THREADS_SHIFT |
 		1 << HSW_PS_SAMPLE_MASK_SHIFT,
 	.urb = { 128, 640, 256, 8 },
 	.gt = 1,
@@ -2048,12 +2046,13 @@ gen7_composite_picture(struct sna *sna,
 		if (channel->repeat ||
 		    (x >= 0 &&
 		     y >= 0 &&
-		     x + w < pixmap->drawable.width &&
-		     y + h < pixmap->drawable.height)) {
+		     x + w <= pixmap->drawable.width &&
+		     y + h <= pixmap->drawable.height)) {
 			struct sna_pixmap *priv = sna_pixmap(pixmap);
 			if (priv && priv->clear) {
 				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
-				return gen4_channel_init_solid(sna, channel, priv->clear_color);
+				return gen4_channel_init_solid(sna, channel,
+							       solid_color(picture->format, priv->clear_color));
 			}
 		}
 	} else
@@ -2204,6 +2203,10 @@ try_blt(struct sna *sna,
 	bo = __sna_drawable_peek_bo(dst->pDrawable);
 	if (bo == NULL)
 		return true;
+
+	if (untiled_tlb_miss(bo))
+		return true;
+
 	if (bo->rq)
 		return RQ_IS_BLT(bo->rq);
 
@@ -2211,11 +2214,11 @@ try_blt(struct sna *sna,
 		return true;
 
 	if (src->pDrawable) {
-		bo = __sna_drawable_peek_bo(src->pDrawable);
-		if (bo == NULL)
+		struct kgem_bo *s = __sna_drawable_peek_bo(src->pDrawable);
+		if (s == NULL)
 			return true;
 
-		if (prefer_blt_bo(sna, bo))
+		if (prefer_blt_bo(sna, s, bo))
 			return true;
 	}
 
@@ -2878,9 +2881,6 @@ prefer_blt_copy(struct sna *sna,
 
 	assert((flags & COPY_SYNC) == 0);
 
-	if (src_bo == dst_bo && can_switch_to_blt(sna, dst_bo, flags))
-		return true;
-
 	if (untiled_tlb_miss(src_bo) ||
 	    untiled_tlb_miss(dst_bo))
 		return true;
@@ -2888,17 +2888,26 @@ prefer_blt_copy(struct sna *sna,
 	if (force_blt_ring(sna))
 		return true;
 
+        if (sna->render_state.gt < 3 &&
+            src_bo == dst_bo &&
+            can_switch_to_blt(sna, dst_bo, flags))
+		return true;
+
 	if (kgem_bo_is_render(dst_bo) ||
 	    kgem_bo_is_render(src_bo))
 		return false;
 
+	if (flags & COPY_LAST &&
+            can_switch_to_blt(sna, dst_bo, flags))
+		return true;
+
 	if (prefer_render_ring(sna, dst_bo))
 		return false;
 
 	if (!prefer_blt_ring(sna, dst_bo, flags))
 		return false;
 
-	return prefer_blt_bo(sna, src_bo) || prefer_blt_bo(sna, dst_bo);
+	return prefer_blt_bo(sna, src_bo, dst_bo);
 }
 
 static bool
@@ -2946,7 +2955,7 @@ fallback_blt:
 		     &extents)) {
 		bool big = too_large(extents.x2-extents.x1, extents.y2-extents.y1);
 
-		if ((big || can_switch_to_blt(sna, dst_bo, flags)) &&
+		if ((big || !prefer_render_ring(sna, dst_bo)) &&
 		    sna_blt_copy_boxes(sna, alu,
 				       src_bo, src_dx, src_dy,
 				       dst_bo, dst_dx, dst_dy,
diff --git a/src/sna/gen8_render.c b/src/sna/gen8_render.c
index 6eb1145..b5a4895 100644
--- a/src/sna/gen8_render.c
+++ b/src/sna/gen8_render.c
@@ -1876,12 +1876,12 @@ gen8_composite_picture(struct sna *sna,
 		if (channel->repeat ||
 		    (x >= 0 &&
 		     y >= 0 &&
-		     x + w < pixmap->drawable.width &&
-		     y + h < pixmap->drawable.height)) {
+		     x + w <= pixmap->drawable.width &&
+		     y + h <= pixmap->drawable.height)) {
 			struct sna_pixmap *priv = sna_pixmap(pixmap);
 			if (priv && priv->clear) {
 				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
-				return gen4_channel_init_solid(sna, channel, priv->clear_color);
+				return gen4_channel_init_solid(sna, channel, solid_color(picture->format, priv->clear_color));
 			}
 		}
 	} else
@@ -2021,6 +2021,10 @@ try_blt(struct sna *sna,
 	bo = __sna_drawable_peek_bo(dst->pDrawable);
 	if (bo == NULL)
 		return true;
+
+	if (untiled_tlb_miss(bo))
+		return true;
+
 	if (bo->rq)
 		return RQ_IS_BLT(bo->rq);
 
@@ -2028,12 +2032,12 @@ try_blt(struct sna *sna,
 		return true;
 
 	if (src->pDrawable) {
-		bo = __sna_drawable_peek_bo(src->pDrawable);
-		if (bo == NULL)
+		struct kgem_bo *s = __sna_drawable_peek_bo(src->pDrawable);
+		if (s == NULL)
 			return true;
 
-		if (prefer_blt_bo(sna, bo))
-			return RQ_IS_BLT(bo->rq);
+		if (prefer_blt_bo(sna, s, bo))
+			return true;
 	}
 
 	if (sna->kgem.ring == KGEM_BLT) {
@@ -2700,9 +2704,6 @@ prefer_blt_copy(struct sna *sna,
 
 	assert((flags & COPY_SYNC) == 0);
 
-	if (src_bo == dst_bo && can_switch_to_blt(sna, dst_bo, flags))
-		return true;
-
 	if (untiled_tlb_miss(src_bo) ||
 	    untiled_tlb_miss(dst_bo))
 		return true;
@@ -2710,17 +2711,26 @@ prefer_blt_copy(struct sna *sna,
 	if (force_blt_ring(sna))
 		return true;
 
+        if (sna->render_state.gt < 3 &&
+            src_bo == dst_bo &&
+            can_switch_to_blt(sna, dst_bo, flags))
+		return true;
+
 	if (kgem_bo_is_render(dst_bo) ||
 	    kgem_bo_is_render(src_bo))
 		return false;
 
+	if (flags & COPY_LAST &&
+            can_switch_to_blt(sna, dst_bo, flags))
+		return true;
+
 	if (prefer_render_ring(sna, dst_bo))
 		return false;
 
 	if (!prefer_blt_ring(sna, dst_bo, flags))
 		return false;
 
-	return prefer_blt_bo(sna, src_bo) || prefer_blt_bo(sna, dst_bo);
+	return prefer_blt_bo(sna, src_bo, dst_bo);
 }
 
 static bool
@@ -2770,7 +2780,7 @@ fallback_blt:
 		     &extents)) {
 		bool big = too_large(extents.x2-extents.x1, extents.y2-extents.y1);
 
-		if ((big || can_switch_to_blt(sna, dst_bo, flags)) &&
+		if ((big || !prefer_render_ring(sna, dst_bo)) &&
 		    sna_blt_copy_boxes(sna, alu,
 				       src_bo, src_dx, src_dy,
 				       dst_bo, dst_dx, dst_dy,
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 78ed540..6f16cba 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -96,11 +96,6 @@ search_snoop_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags);
 #define SHOW_BATCH_BEFORE 0
 #define SHOW_BATCH_AFTER 0
 
-#if !USE_WC_MMAP
-#undef DBG_NO_WC_MMAP
-#define DBG_NO_WC_MMAP 1
-#endif
-
 #if 0
 #define ASSERT_IDLE(kgem__, handle__) assert(!__kgem_busy(kgem__, handle__))
 #define ASSERT_MAYBE_IDLE(kgem__, handle__, expect__) assert(!(expect__) || !__kgem_busy(kgem__, handle__))
@@ -187,6 +182,15 @@ struct local_i915_gem_caching {
 #define LOCAL_IOCTL_I915_GEM_SET_CACHING DRM_IOW(DRM_COMMAND_BASE + LOCAL_I915_GEM_SET_CACHING, struct local_i915_gem_caching)
 #define LOCAL_IOCTL_I915_GEM_GET_CACHING DRM_IOW(DRM_COMMAND_BASE + LOCAL_I915_GEM_GET_CACHING, struct local_i915_gem_caching)
 
+struct local_i915_gem_mmap {
+	uint32_t handle;	/* GEM handle of the object to map (caller passes bo->handle) */
+	uint32_t pad;
+	uint64_t offset;	/* NOTE(review): presumably the start offset within the object; the visible caller passes 0 */
+	uint64_t size;	/* length requested; the visible caller passes bytes(bo) */
+	uint64_t addr_ptr;	/* out: read back by the caller as the address of the new mapping */
+};
+#define LOCAL_IOCTL_I915_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MMAP, struct local_i915_gem_mmap)
+
 struct local_i915_gem_mmap2 {
 	uint32_t handle;
 	uint32_t pad;
@@ -519,15 +523,15 @@ retry_wc:
 
 static void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
 {
-	struct drm_i915_gem_mmap mmap_arg;
+	struct local_i915_gem_mmap arg;
 	int err;
 
 retry:
-	VG_CLEAR(mmap_arg);
-	mmap_arg.handle = bo->handle;
-	mmap_arg.offset = 0;
-	mmap_arg.size = bytes(bo);
-	if ((err = do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg))) {
+	VG_CLEAR(arg);
+	arg.handle = bo->handle;
+	arg.offset = 0;
+	arg.size = bytes(bo);
+	if ((err = do_ioctl(kgem->fd, LOCAL_IOCTL_I915_GEM_MMAP, &arg))) {
 		assert(err != EINVAL);
 
 		if (__kgem_throttle_retire(kgem, 0))
@@ -541,10 +545,10 @@ retry:
 		return NULL;
 	}
 
-	VG(VALGRIND_MAKE_MEM_DEFINED(mmap_arg.addr_ptr, bytes(bo)));
+	VG(VALGRIND_MAKE_MEM_DEFINED(arg.addr_ptr, bytes(bo)));
 
 	DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle));
-	return bo->map__cpu = (void *)(uintptr_t)mmap_arg.addr_ptr;
+	return bo->map__cpu = (void *)(uintptr_t)arg.addr_ptr;
 }
 
 static int gem_write(int fd, uint32_t handle,
@@ -925,11 +929,11 @@ total_ram_size(void)
 #ifdef HAVE_STRUCT_SYSINFO_TOTALRAM
 	struct sysinfo info;
 	if (sysinfo(&info) == 0)
-		return info.totalram * info.mem_unit;
+		return (size_t)info.totalram * info.mem_unit;
 #endif
 
 #ifdef _SC_PHYS_PAGES
-	 return sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGE_SIZE);
+	 return (size_t)sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGE_SIZE);
 #endif
 
 	return 0;
@@ -1285,6 +1289,7 @@ static bool kgem_init_pinned_batches(struct kgem *kgem)
 {
 	int count[2] = { 16, 4 };
 	int size[2] = { 1, 4 };
+	int ret = 0;
 	int n, i;
 
 	if (kgem->wedged)
@@ -1311,7 +1316,8 @@ static bool kgem_init_pinned_batches(struct kgem *kgem)
 			}
 
 			pin.alignment = 0;
-			if (do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_PIN, &pin)) {
+			ret = do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_PIN, &pin);
+			if (ret) {
 				gem_close(kgem->fd, pin.handle);
 				free(bo);
 				goto err;
@@ -1333,6 +1339,14 @@ err:
 		}
 	}
 
+	/* If we fail to pin some memory for 830gm/845g, we need to disable
+	 * acceleration as otherwise the machine will eventually fail. However,
+	 * the kernel started arbitrarily rejecting PIN, so hope for the best
+	 * if the ioctl no longer works.
+	 */
+	if (ret != -ENODEV && kgem->gen == 020)
+		return false;
+
 	/* For simplicity populate the lists with a single unpinned bo */
 	for (n = 0; n < ARRAY_SIZE(count); n++) {
 		struct kgem_bo *bo;
@@ -1340,18 +1354,18 @@ err:
 
 		handle = gem_create(kgem->fd, size[n]);
 		if (handle == 0)
-			break;
+			return false;
 
 		bo = __kgem_bo_alloc(handle, size[n]);
 		if (bo == NULL) {
 			gem_close(kgem->fd, handle);
-			break;
+			return false;
 		}
 
 		debug_alloc__bo(kgem, bo);
 		list_add(&bo->list, &kgem->pinned_batches[n]);
 	}
-	return false;
+	return true;
 }
 
 static void kgem_init_swizzling(struct kgem *kgem)
@@ -1620,7 +1634,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
 	if (!kgem->has_relaxed_delta && kgem->batch_size > 4*1024)
 		kgem->batch_size = 4*1024;
 
-	if (!kgem_init_pinned_batches(kgem) && gen == 020) {
+	if (!kgem_init_pinned_batches(kgem)) {
 		xf86DrvMsg(kgem_get_screen_index(kgem), X_WARNING,
 			   "Unable to reserve memory for GPU, disabling acceleration.\n");
 		__kgem_set_wedged(kgem);
@@ -1651,6 +1665,9 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
 	     (long long)aperture.aper_size,
 	     (long long)aperture.aper_available_size));
 
+	/* clamp aperture to uint32_t for simplicity */
+	if (aperture.aper_size > 0xc0000000)
+		aperture.aper_size = 0xc0000000;
 	kgem->aperture_total = aperture.aper_size;
 	kgem->aperture_high = aperture.aper_size * 3/4;
 	kgem->aperture_low = aperture.aper_size * 1/3;
@@ -1659,7 +1676,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
 		kgem->aperture_high /= 2;
 		kgem->aperture_low /= 2;
 	}
-	DBG(("%s: aperture low=%d [%d], high=%d [%d]\n", __FUNCTION__,
+	DBG(("%s: aperture low=%u [%u], high=%u [%u]\n", __FUNCTION__,
 	     kgem->aperture_low, kgem->aperture_low / (1024*1024),
 	     kgem->aperture_high, kgem->aperture_high / (1024*1024)));
 
@@ -1697,7 +1714,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
 		     __FUNCTION__));
 		totalram = kgem->aperture_total;
 	}
-	DBG(("%s: total ram=%ld\n", __FUNCTION__, (long)totalram));
+	DBG(("%s: total ram=%lld\n", __FUNCTION__, (long long)totalram));
 	if (kgem->max_object_size > totalram / 2)
 		kgem->max_object_size = totalram / 2;
 	if (kgem->max_gpu_size > totalram / 4)
@@ -1749,11 +1766,11 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
 	if (DBG_NO_CPU)
 		kgem->max_cpu_size = 0;
 
-	DBG(("%s: maximum object size=%d\n",
+	DBG(("%s: maximum object size=%u\n",
 	     __FUNCTION__, kgem->max_object_size));
-	DBG(("%s: large object thresold=%d\n",
+	DBG(("%s: large object thresold=%u\n",
 	     __FUNCTION__, kgem->large_object_size));
-	DBG(("%s: max object sizes (gpu=%d, cpu=%d, tile upload=%d, copy=%d)\n",
+	DBG(("%s: max object sizes (gpu=%u, cpu=%u, tile upload=%u, copy=%u)\n",
 	     __FUNCTION__,
 	     kgem->max_gpu_size, kgem->max_cpu_size,
 	     kgem->max_upload_tile_size, kgem->max_copy_tile_size));
@@ -2656,6 +2673,34 @@ static bool kgem_retire__flushing(struct kgem *kgem)
 	return retired;
 }
 
+static bool __kgem_bo_flush(struct kgem *kgem, struct kgem_bo *bo)
+{
+	struct drm_i915_gem_busy busy;
+	/* Returns true only when bo is reported busy and has been added to kgem->flushing. */
+	if (!bo->needs_flush)
+		return false;
+
+	bo->needs_flush = false; /* cleared up front; set again below if the bo is still busy */
+	/* NOTE(review): the ioctl result is discarded, so presumably the preset busy.busy (busy unless wedged) stands if the call fails -- confirm. */
+	VG_CLEAR(busy);
+	busy.handle = bo->handle;
+	busy.busy = !kgem->wedged;
+	(void)do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_BUSY, &busy);
+	DBG(("%s: handle=%d, busy=%d, wedged=%d\n",
+	     __FUNCTION__, bo->handle, busy.busy, kgem->wedged));
+
+	if (busy.busy == 0)
+		return false;
+	/* Still busy: queue on the flushing list and use kgem itself as the placeholder request. */
+	DBG(("%s: moving %d to flushing\n",
+	     __FUNCTION__, bo->handle));
+	list_add(&bo->request, &kgem->flushing);
+	bo->rq = MAKE_REQUEST(kgem, !!(busy.busy & ~0x1ffff)); /* ring tag is 1 if any bit above bit 16 is set, else 0 */
+	bo->needs_flush = true;
+	kgem->need_retire = true;
+	return true;
+}
+
 static bool __kgem_retire_rq(struct kgem *kgem, struct kgem_request *rq)
 {
 	bool retired = false;
@@ -2680,14 +2725,10 @@ static bool __kgem_retire_rq(struct kgem *kgem, struct kgem_request *rq)
 
 		list_del(&bo->request);
 
-		if (bo->needs_flush)
-			bo->needs_flush = __kgem_busy(kgem, bo->handle);
-		if (bo->needs_flush) {
-			DBG(("%s: moving %d to flushing\n",
+		if (unlikely(__kgem_bo_flush(kgem, bo))) {
+			assert(bo != rq->bo);
+			DBG(("%s: movied %d to flushing\n",
 			     __FUNCTION__, bo->handle));
-			list_add(&bo->request, &kgem->flushing);
-			bo->rq = MAKE_REQUEST(kgem, RQ_RING(bo->rq));
-			kgem->need_retire = true;
 			continue;
 		}
 
@@ -2845,16 +2886,18 @@ bool __kgem_ring_is_idle(struct kgem *kgem, int ring)
 	return true;
 }
 
-void __kgem_retire_requests_upto(struct kgem *kgem, struct kgem_bo *bo)
+bool __kgem_retire_requests_upto(struct kgem *kgem, struct kgem_bo *bo)
 {
 	struct kgem_request *rq = bo->rq, *tmp;
-	struct list *requests = &kgem->requests[RQ_RING(rq) == I915_EXEC_BLT];
+	struct list *requests = &kgem->requests[RQ_RING(rq) == KGEM_BLT];
+
+	DBG(("%s(handle=%d)\n", __FUNCTION__, bo->handle));
 
 	rq = RQ(rq);
 	assert(rq != &kgem->static_request);
 	if (rq == (struct kgem_request *)kgem) {
 		__kgem_bo_clear_busy(bo);
-		return;
+		return false;
 	}
 
 	do {
@@ -2862,6 +2905,11 @@ void __kgem_retire_requests_upto(struct kgem *kgem, struct kgem_bo *bo)
 		assert(tmp->ring == rq->ring);
 		__kgem_retire_rq(kgem, tmp);
 	} while (tmp != rq);
+
+	assert(bo->needs_flush || bo->rq == NULL);
+	assert(bo->needs_flush || list_is_empty(&bo->request));
+	assert(bo->needs_flush || bo->domain == DOMAIN_NONE);
+	return bo->rq;
 }
 
 #if 0
@@ -3210,7 +3258,9 @@ kgem_batch_write(struct kgem *kgem,
 	char *ptr;
 	int ret;
 
-	ASSERT_IDLE(kgem, bo->handle);
+	assert(bo->exec == NULL);
+	assert(bo->rq == NULL);
+	assert(!__kgem_busy(kgem, bo->handle));
 
 #if DBG_NO_EXEC
 	{
@@ -3619,7 +3669,8 @@ void _kgem_submit(struct kgem *kgem)
 		kgem->exec[i].relocs_ptr = (uintptr_t)kgem->reloc;
 		kgem->exec[i].alignment = 0;
 		kgem->exec[i].offset = rq->bo->presumed_offset;
-		kgem->exec[i].flags = 0;
+		/* Make sure the kernel releases any fence, ignored if gen4+ */
+		kgem->exec[i].flags = EXEC_OBJECT_NEEDS_FENCE;
 		kgem->exec[i].rsvd1 = 0;
 		kgem->exec[i].rsvd2 = 0;
 
@@ -4950,6 +5001,9 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 			}
 		}
 
+		if (flags & CREATE_CACHED)
+			return NULL;
+
 		bo = __kgem_bo_create_as_display(kgem, size, tiling, pitch);
 		if (bo)
 			return bo;
@@ -6216,8 +6270,8 @@ static void *__kgem_bo_map__gtt_or_wc(struct kgem *kgem, struct kgem_bo *bo)
 	kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo));
 
 	if (bo->tiling || !kgem->has_wc_mmap) {
-		assert(num_pages(bo) <= kgem->aperture_mappable / 2);
 		assert(kgem->gen != 021 || bo->tiling != I915_TILING_Y);
+		warn_unless(num_pages(bo) <= kgem->aperture_mappable / 2);
 
 		ptr = bo->map__gtt;
 		if (ptr == NULL)
@@ -6319,14 +6373,16 @@ void *kgem_bo_map__wc(struct kgem *kgem, struct kgem_bo *bo)
 	     bo->handle, (long)bo->presumed_offset, bo->tiling, bo->map__gtt, bo->map__cpu, bo->domain));
 
 	assert(bo->proxy == NULL);
-	assert(bo->exec == NULL);
 	assert(list_is_empty(&bo->list));
 	assert_tiling(kgem, bo);
 	assert(!bo->purged || bo->reusable);
 
 	if (bo->map__wc)
 		return bo->map__wc;
+	if (!kgem->has_wc_mmap)
+		return NULL;
 
+	kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo));
 	return __kgem_bo_map__wc(kgem, bo);
 }
 
@@ -6411,8 +6467,8 @@ struct kgem_bo *kgem_create_map(struct kgem *kgem,
 	first_page = (uintptr_t)ptr;
 	last_page = first_page + size + PAGE_SIZE - 1;
 
-	first_page &= ~(PAGE_SIZE-1);
-	last_page &= ~(PAGE_SIZE-1);
+	first_page &= ~(uintptr_t)(PAGE_SIZE-1);
+	last_page &= ~(uintptr_t)(PAGE_SIZE-1);
 	assert(last_page > first_page);
 
 	handle = gem_userptr(kgem->fd,
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 2267bac..f98d3b6 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -42,6 +42,7 @@ struct kgem_bo {
 #define RQ(rq) ((struct kgem_request *)((uintptr_t)(rq) & ~3))
 #define RQ_RING(rq) ((uintptr_t)(rq) & 3)
 #define RQ_IS_BLT(rq) (RQ_RING(rq) == KGEM_BLT)
+#define RQ_IS_RENDER(rq) (RQ_RING(rq) == KGEM_RENDER)
 #define MAKE_REQUEST(rq, ring) ((struct kgem_request *)((uintptr_t)(rq) | (ring)))
 
 	struct drm_i915_gem_exec_object2 *exec;
@@ -626,7 +627,7 @@ static inline bool kgem_bo_is_busy(struct kgem_bo *bo)
 	return bo->rq;
 }
 
-void __kgem_retire_requests_upto(struct kgem *kgem, struct kgem_bo *bo);
+bool __kgem_retire_requests_upto(struct kgem *kgem, struct kgem_bo *bo);
 static inline bool __kgem_bo_is_busy(struct kgem *kgem, struct kgem_bo *bo)
 {
 	DBG(("%s: handle=%d, domain: %d exec? %d, rq? %d\n", __FUNCTION__,
@@ -636,14 +637,13 @@ static inline bool __kgem_bo_is_busy(struct kgem *kgem, struct kgem_bo *bo)
 	if (bo->exec)
 		return true;
 
-	if (bo->rq && !__kgem_busy(kgem, bo->handle)) {
-		__kgem_retire_requests_upto(kgem, bo);
-		assert(list_is_empty(&bo->request));
-		assert(bo->rq == NULL);
-		assert(bo->domain == DOMAIN_NONE);
-	}
+	if (bo->rq == NULL)
+		return false;
+
+	if (__kgem_busy(kgem, bo->handle))
+		return true;
 
-	return kgem_bo_is_busy(bo);
+	return __kgem_retire_requests_upto(kgem, bo);
 }
 
 static inline bool kgem_bo_is_render(struct kgem_bo *bo)
@@ -651,7 +651,15 @@ static inline bool kgem_bo_is_render(struct kgem_bo *bo)
 	DBG(("%s: handle=%d, rq? %d [%d]\n", __FUNCTION__,
 	     bo->handle, bo->rq != NULL, (int)RQ_RING(bo->rq)));
 	assert(bo->refcnt);
-	return bo->rq && RQ_RING(bo->rq) == I915_EXEC_RENDER;
+	return bo->rq && RQ_RING(bo->rq) != KGEM_BLT;
+}
+
+static inline bool kgem_bo_is_blt(struct kgem_bo *bo)
+{
+	DBG(("%s: handle=%d, rq? %d [%d]\n", __FUNCTION__, /* [%d] prints the ring tag, matching kgem_bo_is_render() */
+	     bo->handle, bo->rq != NULL, (int)RQ_RING(bo->rq)));
+	assert(bo->refcnt);
+	return RQ_RING(bo->rq) == KGEM_BLT; /* true when the low two bits of bo->rq carry the BLT ring tag */
+}
 
 static inline void kgem_bo_mark_unreusable(struct kgem_bo *bo)
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 18425e3..a498484 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -284,7 +284,10 @@ struct sna {
 		struct kgem_bo *shadow;
 		unsigned front_active;
 		unsigned shadow_active;
+		unsigned rr_active;
 		unsigned flip_active;
+		unsigned hidden;
+		bool shadow_enabled;
 		bool dirty;
 
 		int max_crtc_width, max_crtc_height;
@@ -353,6 +356,8 @@ struct sna {
 		bool available;
 		bool open;
 #if HAVE_PRESENT
+		struct list vblank_queue;
+		uint64_t unflip;
 #endif
 	} present;
 
@@ -461,6 +466,11 @@ to_sna_from_screen(ScreenPtr screen)
 	return to_sna(xf86ScreenToScrn(screen));
 }
 
+pure static inline ScreenPtr to_screen_from_sna(struct sna *sna)
+{
+	return xf86ScrnToScreen(sna->scrn);
+}
+
 pure static inline struct sna *
 to_sna_from_pixmap(PixmapPtr pixmap)
 {
@@ -498,12 +508,11 @@ to_sna_from_kgem(struct kgem *kgem)
 extern xf86CrtcPtr sna_covering_crtc(struct sna *sna,
 				     const BoxRec *box,
 				     xf86CrtcPtr desired);
+extern xf86CrtcPtr sna_primary_crtc(struct sna *sna);
 
 extern bool sna_wait_for_scanline(struct sna *sna, PixmapPtr pixmap,
 				  xf86CrtcPtr crtc, const BoxRec *clip);
 
-xf86CrtcPtr sna_mode_first_crtc(struct sna *sna);
-
 const struct ust_msc {
 	uint64_t msc;
 	int tv_sec;
@@ -536,6 +545,11 @@ static inline uint64_t ust64(int tv_sec, int tv_usec)
 	return (uint64_t)tv_sec * 1000000 + tv_usec;
 }
 
+static inline uint64_t swap_ust(const struct ust_msc *swap)
+{
+	return ust64(swap->tv_sec, swap->tv_usec);
+}
+
 #if HAVE_DRI2
 bool sna_dri2_open(struct sna *sna, ScreenPtr pScreen);
 void sna_dri2_page_flip_handler(struct sna *sna, struct drm_event_vblank *event);
@@ -576,6 +590,7 @@ static inline void sna_present_vblank_handler(struct drm_event_vblank *event) {
 
 extern bool sna_crtc_set_sprite_rotation(xf86CrtcPtr crtc, uint32_t rotation);
 extern int sna_crtc_to_pipe(xf86CrtcPtr crtc);
+extern int sna_crtc_to_pipe__safe(xf86CrtcPtr crtc);
 extern uint32_t sna_crtc_to_sprite(xf86CrtcPtr crtc);
 extern uint32_t sna_crtc_id(xf86CrtcPtr crtc);
 extern bool sna_crtc_is_on(xf86CrtcPtr crtc);
@@ -998,8 +1013,7 @@ static inline uint32_t pixmap_size(PixmapPtr pixmap)
 
 bool sna_accel_init(ScreenPtr sreen, struct sna *sna);
 void sna_accel_create(struct sna *sna);
-void sna_accel_block_handler(struct sna *sna, struct timeval **tv);
-void sna_accel_wakeup_handler(struct sna *sna);
+void sna_accel_block(struct sna *sna, struct timeval **tv);
 void sna_accel_watch_flush(struct sna *sna, int enable);
 void sna_accel_flush(struct sna *sna);
 void sna_accel_enter(struct sna *sna);
@@ -1127,6 +1141,16 @@ memcpy_blt(const void *src, void *dst, int bpp,
 	   uint16_t width, uint16_t height);
 
 void
+affine_blt(const void *src, void *dst, int bpp,
+	   int16_t src_x, int16_t src_y,
+	   int16_t src_width, int16_t src_height,
+	   int32_t src_stride,
+	   int16_t dst_x, int16_t dst_y,
+	   uint16_t dst_width, uint16_t dst_height,
+	   int32_t dst_stride,
+	   const struct pixman_f_transform *t);
+
+void
 memmove_box(const void *src, void *dst,
 	    int bpp, int32_t stride,
 	    const BoxRec *box,
@@ -1182,6 +1206,31 @@ box_intersect(BoxPtr a, const BoxRec *b)
 	return true;
 }
 
+const BoxRec *
+__find_clip_box_for_y(const BoxRec *begin, const BoxRec *end, int16_t y);
+inline static const BoxRec *
+find_clip_box_for_y(const BoxRec *begin, const BoxRec *end, int16_t y)
+{
+	/* Special case for incremental trapezoid clipping */
+	if (begin == end)
+		return end;
+
+	/* Quick test if scanline is within range of clip boxes */
+	if (begin->y2 > y) {
+		assert(end == begin + 1 ||
+		       __find_clip_box_for_y(begin, end, y) == begin);
+		return begin;
+	}
+	if (y >= end[-1].y2) {
+		assert(end == begin + 1 ||
+		       __find_clip_box_for_y(begin, end, y) == end);
+		return end;
+	}
+
+	/* Otherwise bisect to find the first box crossing y */
+	return __find_clip_box_for_y(begin, end, y);
+}
+
 unsigned sna_cpu_detect(void);
 char *sna_cpu_features_to_string(unsigned features, char *line);
 
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index baf5f60..eda4c33 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -527,10 +527,10 @@ sna_pixmap_alloc_cpu(struct sna *sna,
 		DBG(("%s: allocating CPU buffer (%dx%d)\n", __FUNCTION__,
 		     pixmap->drawable.width, pixmap->drawable.height));
 
-		hint = 0;
-		if ((flags & MOVE_ASYNC_HINT) == 0 &&
-		    ((flags & MOVE_READ) == 0 || (priv->gpu_damage && !priv->clear && !sna->kgem.has_llc)))
-			hint = CREATE_CPU_MAP | CREATE_INACTIVE | CREATE_NO_THROTTLE;
+		hint = CREATE_CPU_MAP | CREATE_INACTIVE | CREATE_NO_THROTTLE;
+		if ((flags & MOVE_ASYNC_HINT) ||
+		    (priv->gpu_damage && !priv->clear && kgem_bo_is_busy(priv->gpu_bo) && sna->kgem.can_blt_cpu))
+			hint = 0;
 
 		priv->cpu_bo = kgem_create_cpu_2d(&sna->kgem,
 						  pixmap->drawable.width,
@@ -1311,7 +1311,7 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 
 	if (unlikely((sna->render.prefer_gpu & PREFER_GPU_RENDER) == 0))
 		flags &= ~KGEM_CAN_CREATE_GPU;
-	if (wedged(sna))
+	if (wedged(sna) && usage != SNA_CREATE_FB)
 		flags &= ~KGEM_CAN_CREATE_GTT;
 
 	DBG(("%s: usage=%d, flags=%x\n", __FUNCTION__, usage, flags));
@@ -1420,7 +1420,7 @@ static void __sna_free_pixmap(struct sna *sna,
 		sna_accel_watch_flush(sna, -1);
 
 	if (priv->header) {
-		assert(pixmap->drawable.pScreen == sna->scrn->pScreen);
+		assert(pixmap->drawable.pScreen == to_screen_from_sna(sna));
 		assert(!priv->shm);
 		pixmap->devPrivate.ptr = sna->freed_pixmap;
 		sna->freed_pixmap = pixmap;
@@ -1557,6 +1557,11 @@ static inline bool has_coherent_ptr(struct sna *sna, struct sna_pixmap *priv, un
 		return true;
 	}
 
+	if (priv->pixmap->devPrivate.ptr == MAP(priv->gpu_bo->map__wc)) {
+		assert(priv->mapped == MAPPED_GTT);
+		return true;
+	}
+
 	return false;
 }
 
@@ -1577,6 +1582,16 @@ static inline bool pixmap_inplace(struct sna *sna,
 		return false;
 
 	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo)) {
+		if (priv->clear) {
+			DBG(("%s: no, clear GPU bo is busy\n", __FUNCTION__));
+			return false;
+		}
+
+		if (flags & MOVE_ASYNC_HINT) {
+			DBG(("%s: no, async hint and GPU bo is busy\n", __FUNCTION__));
+			return false;
+		}
+
 		if ((flags & (MOVE_WRITE | MOVE_READ)) == (MOVE_WRITE | MOVE_READ)) {
 			DBG(("%s: no, GPU bo is busy\n", __FUNCTION__));
 			return false;
@@ -1861,7 +1876,9 @@ sna_pixmap_undo_cow(struct sna *sna, struct sna_pixmap *priv, unsigned flags)
 	assert(priv->gpu_bo == cow->bo);
 	assert(cow->refcnt);
 
-	if (flags && (flags & MOVE_WRITE) == 0 && IS_COW_OWNER(priv->cow))
+	if (flags && /* flags == 0 => force decouple */
+	    (flags & MOVE_WRITE) == 0 &&
+	    (((flags & __MOVE_FORCE) == 0) || IS_COW_OWNER(priv->cow)))
 		return true;
 
 	if (!IS_COW_OWNER(priv->cow))
@@ -2267,6 +2284,7 @@ skip_inplace_map:
 	    (flags & MOVE_WRITE ? (void *)priv->gpu_bo : (void *)priv->gpu_damage) && priv->cpu_damage == NULL &&
 	    priv->gpu_bo->tiling == I915_TILING_NONE &&
 	    (flags & MOVE_READ || kgem_bo_can_map__cpu(&sna->kgem, priv->gpu_bo, flags & MOVE_WRITE)) &&
+	    (!priv->clear || !kgem_bo_is_busy(priv->gpu_bo)) &&
 	    ((flags & (MOVE_WRITE | MOVE_ASYNC_HINT)) == 0 ||
 	     (!priv->cow && !priv->move_to_gpu && !__kgem_bo_is_busy(&sna->kgem, priv->gpu_bo)))) {
 		void *ptr;
@@ -2330,7 +2348,9 @@ skip_inplace_map:
 			     pixmap->devKind, pixmap->devKind * pixmap->drawable.height));
 
 			if (priv->cpu_bo) {
+				kgem_bo_undo(&sna->kgem, priv->cpu_bo);
 				if ((flags & MOVE_ASYNC_HINT || priv->cpu_bo->exec) &&
+				    sna->kgem.can_blt_cpu &&
 				    sna->render.fill_one(sna,
 							  pixmap, priv->cpu_bo, priv->clear_color,
 							  0, 0,
@@ -2531,6 +2551,9 @@ static bool cpu_clear_boxes(struct sna *sna,
 {
 	struct sna_fill_op fill;
 
+	if (!sna->kgem.can_blt_cpu)
+		return false;
+
 	if (!sna_fill_init_blt(&fill, sna,
 			       pixmap, priv->cpu_bo,
 			       GXcopy, priv->clear_color,
@@ -3209,13 +3232,14 @@ __sna_pixmap_for_gpu(struct sna *sna, PixmapPtr pixmap, unsigned flags)
 {
 	struct sna_pixmap *priv;
 
+	assert(flags & (MOVE_READ | MOVE_WRITE | __MOVE_FORCE));
 	if ((flags & __MOVE_FORCE) == 0 && wedged(sna))
 		return NULL;
 
 	priv = sna_pixmap(pixmap);
 	if (priv == NULL) {
 		DBG(("%s: not attached\n", __FUNCTION__));
-		if ((flags & __MOVE_DRI) == 0)
+		if ((flags & (__MOVE_DRI | __MOVE_SCANOUT)) == 0)
 			return NULL;
 
 		if (pixmap->usage_hint == -1) {
@@ -3287,12 +3311,14 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl
 	if (priv->cow) {
 		unsigned cow = flags & (MOVE_READ | MOVE_WRITE | __MOVE_FORCE);
 
+		assert(cow);
+
 		if ((flags & MOVE_READ) == 0) {
 			if (priv->gpu_damage) {
 				r.extents = *box;
 				r.data = NULL;
 				if (!region_subsumes_damage(&r, priv->gpu_damage))
-					cow |= MOVE_READ;
+					cow |= MOVE_READ | __MOVE_FORCE;
 			}
 		} else {
 			if (priv->cpu_damage) {
@@ -3303,13 +3329,11 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl
 			}
 		}
 
-		if (cow) {
-			if (!sna_pixmap_undo_cow(sna, priv, cow))
-				return NULL;
+		if (!sna_pixmap_undo_cow(sna, priv, cow))
+			return NULL;
 
-			if (priv->gpu_bo == NULL)
-				sna_damage_destroy(&priv->gpu_damage);
-		}
+		if (priv->gpu_bo == NULL)
+			sna_damage_destroy(&priv->gpu_damage);
 	}
 
 	if (sna_damage_is_all(&priv->gpu_damage,
@@ -3527,7 +3551,8 @@ sna_drawable_use_bo(DrawablePtr drawable, unsigned flags, const BoxRec *box,
 	}
 
 	if (priv->cow) {
-		unsigned cow = MOVE_WRITE | MOVE_READ;
+		unsigned cow = MOVE_WRITE | MOVE_READ | __MOVE_FORCE;
+		assert(cow);
 
 		if (flags & IGNORE_DAMAGE) {
 			if (priv->gpu_damage) {
@@ -4121,15 +4146,14 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 
 	if (priv->cow) {
 		unsigned cow = flags & (MOVE_READ | MOVE_WRITE | __MOVE_FORCE);
+		assert(cow);
 		if (flags & MOVE_READ && priv->cpu_damage)
 			cow |= MOVE_WRITE;
-		if (cow) {
-			if (!sna_pixmap_undo_cow(sna, priv, cow))
-				return NULL;
+		if (!sna_pixmap_undo_cow(sna, priv, cow))
+			return NULL;
 
-			if (priv->gpu_bo == NULL)
-				sna_damage_destroy(&priv->gpu_damage);
-		}
+		if (priv->gpu_bo == NULL)
+			sna_damage_destroy(&priv->gpu_damage);
 	}
 
 	if (sna_damage_is_all(&priv->gpu_damage,
@@ -4864,6 +4888,7 @@ try_upload__inplace(PixmapPtr pixmap, RegionRec *region,
 	pixmap->devPrivate.ptr = dst;
 	pixmap->devKind = priv->gpu_bo->pitch;
 	priv->mapped = dst == MAP(priv->gpu_bo->map__cpu) ? MAPPED_CPU : MAPPED_GTT;
+	priv->cpu &= priv->mapped == MAPPED_CPU;
 	assert(has_coherent_ptr(sna, priv, MOVE_WRITE));
 
 	box = region_rects(region);
@@ -6098,6 +6123,9 @@ sna_copy_boxes__inplace(struct sna *sna, RegionPtr region, int alu,
 
 	kgem_bo_sync__cpu_full(&sna->kgem, src_priv->gpu_bo, FORCE_FULL_SYNC);
 
+	if (sigtrap_get())
+		return false;
+
 	box = region_rects(region);
 	n = region_num_rects(region);
 	if (src_priv->gpu_bo->tiling) {
@@ -6137,6 +6165,8 @@ sna_copy_boxes__inplace(struct sna *sna, RegionPtr region, int alu,
 		}
 	}
 
+	sigtrap_put();
+
 	return true;
 
 upload_inplace:
@@ -6234,6 +6264,9 @@ upload_inplace:
 
 	assert(has_coherent_ptr(sna, src_priv, MOVE_READ));
 
+	if (sigtrap_get())
+		return false;
+
 	box = region_rects(region);
 	n = region_num_rects(region);
 	if (dst_priv->gpu_bo->tiling) {
@@ -6265,15 +6298,19 @@ upload_inplace:
 		} while (--n);
 
 		if (!dst_priv->shm) {
-			assert(ptr == MAP(dst_priv->gpu_bo->map__cpu));
 			dst_pixmap->devPrivate.ptr = ptr;
 			dst_pixmap->devKind = dst_priv->gpu_bo->pitch;
-			dst_priv->mapped = MAPPED_CPU;
+			if (ptr == MAP(dst_priv->gpu_bo->map__cpu)) {
+				dst_priv->mapped = MAPPED_CPU;
+				dst_priv->cpu = true;
+			} else
+				dst_priv->mapped = MAPPED_GTT;
 			assert_pixmap_map(dst_pixmap, dst_priv);
-			dst_priv->cpu = true;
 		}
 	}
 
+	sigtrap_put();
+
 	return true;
 }
 
@@ -6931,7 +6968,8 @@ sna_do_copy(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 
 	/* Short cut for unmapped windows */
 	if (dst->type == DRAWABLE_WINDOW && !((WindowPtr)dst)->realized) {
-		DBG(("%s: unmapped\n", __FUNCTION__));
+		DBG(("%s: unmapped/unrealized dst (pixmap=%ld)\n",
+		     __FUNCTION__, get_window_pixmap((WindowPtr)dst)));
 		return NULL;
 	}
 
@@ -7136,30 +7174,21 @@ sna_copy_area(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 			   copy, 0, NULL);
 }
 
-static const BoxRec *
-find_clip_box_for_y(const BoxRec *begin, const BoxRec *end, int16_t y)
+const BoxRec *
+__find_clip_box_for_y(const BoxRec *begin, const BoxRec *end, int16_t y)
 {
-    const BoxRec *mid;
-
-    if (end == begin)
-	return end;
-
-    if (end - begin == 1) {
+	assert(end - begin > 1);
+	do {
+		const BoxRec *mid = begin + (end - begin) / 2;
+		if (mid->y2 > y)
+			end = mid;
+		else
+			begin = mid;
+	} while (end > begin + 1);
 	if (begin->y2 > y)
-	    return begin;
+		return begin;
 	else
-	    return end;
-    }
-
-    mid = begin + (end - begin) / 2;
-    if (mid->y2 > y)
-	/* If no box is found in [begin, mid], the function
-	 * will return @mid, which is then known to be the
-	 * correct answer.
-	 */
-	return find_clip_box_for_y(begin, mid, y);
-    else
-	return find_clip_box_for_y(mid, end, y);
+		return end;
 }
 
 struct sna_fill_spans {
@@ -11785,14 +11814,29 @@ sna_poly_fill_rect_blt(DrawablePtr drawable,
 				if (nbox > ARRAY_SIZE(boxes))
 					nbox = ARRAY_SIZE(boxes);
 				n -= nbox;
-				do {
+				while (nbox >= 2) {
+					b[0].x1 = rect[0].x + dx;
+					b[0].y1 = rect[0].y + dy;
+					b[0].x2 = b[0].x1 + rect[0].width;
+					b[0].y2 = b[0].y1 + rect[0].height;
+
+					b[1].x1 = rect[1].x + dx;
+					b[1].y1 = rect[1].y + dy;
+					b[1].x2 = b[1].x1 + rect[1].width;
+					b[1].y2 = b[1].y1 + rect[1].height;
+
+					b += 2;
+					rect += 2;
+					nbox -= 2;
+				}
+				if (nbox) {
 					b->x1 = rect->x + dx;
 					b->y1 = rect->y + dy;
 					b->x2 = b->x1 + rect->width;
 					b->y2 = b->y1 + rect->height;
 					b++;
 					rect++;
-				} while (--nbox);
+				}
 				fill.boxes(sna, &fill, boxes, b-boxes);
 				b = boxes;
 			} while (n);
@@ -11802,14 +11846,29 @@ sna_poly_fill_rect_blt(DrawablePtr drawable,
 				if (nbox > ARRAY_SIZE(boxes))
 					nbox = ARRAY_SIZE(boxes);
 				n -= nbox;
-				do {
+				while (nbox >= 2) {
+					b[0].x1 = rect[0].x;
+					b[0].y1 = rect[0].y;
+					b[0].x2 = b[0].x1 + rect[0].width;
+					b[0].y2 = b[0].y1 + rect[0].height;
+
+					b[1].x1 = rect[1].x;
+					b[1].y1 = rect[1].y;
+					b[1].x2 = b[1].x1 + rect[1].width;
+					b[1].y2 = b[1].y1 + rect[1].height;
+
+					b += 2;
+					rect += 2;
+					nbox -= 2;
+				}
+				if (nbox) {
 					b->x1 = rect->x;
 					b->y1 = rect->y;
 					b->x2 = b->x1 + rect->width;
 					b->y2 = b->y1 + rect->height;
 					b++;
 					rect++;
-				} while (--nbox);
+				}
 				fill.boxes(sna, &fill, boxes, b-boxes);
 				b = boxes;
 			} while (n);
@@ -16789,7 +16848,8 @@ sna_get_image__inplace(PixmapPtr pixmap,
 		break;
 	}
 
-	if (!kgem_bo_can_map__cpu(&sna->kgem, priv->gpu_bo, FORCE_FULL_SYNC))
+	if ((flags & MOVE_INPLACE_HINT) == 0 &&
+	    !kgem_bo_can_map__cpu(&sna->kgem, priv->gpu_bo, FORCE_FULL_SYNC))
 		return false;
 
 	if (idle && __kgem_bo_is_busy(&sna->kgem, priv->gpu_bo))
@@ -16801,11 +16861,19 @@ sna_get_image__inplace(PixmapPtr pixmap,
 	assert(sna_damage_contains_box(&priv->gpu_damage, &region->extents) == PIXMAN_REGION_IN);
 	assert(sna_damage_contains_box(&priv->cpu_damage, &region->extents) == PIXMAN_REGION_OUT);
 
-	src = kgem_bo_map__cpu(&sna->kgem, priv->gpu_bo);
-	if (src == NULL)
-		return false;
+	if (kgem_bo_can_map__cpu(&sna->kgem, priv->gpu_bo, FORCE_FULL_SYNC)) {
+		src = kgem_bo_map__cpu(&sna->kgem, priv->gpu_bo);
+		if (src == NULL)
+			return false;
 
-	kgem_bo_sync__cpu_full(&sna->kgem, priv->gpu_bo, FORCE_FULL_SYNC);
+		kgem_bo_sync__cpu_full(&sna->kgem, priv->gpu_bo, FORCE_FULL_SYNC);
+	} else {
+		src = kgem_bo_map__wc(&sna->kgem, priv->gpu_bo);
+		if (src == NULL)
+			return false;
+
+		kgem_bo_sync__gtt(&sna->kgem, priv->gpu_bo);
+	}
 
 	if (sigtrap_get())
 		return false;
@@ -16833,12 +16901,11 @@ sna_get_image__inplace(PixmapPtr pixmap,
 			   region->extents.x2 - region->extents.x1,
 			   region->extents.y2 - region->extents.y1);
 		if (!priv->shm) {
-			assert(src == MAP(priv->gpu_bo->map__cpu));
 			pixmap->devPrivate.ptr = src;
 			pixmap->devKind = priv->gpu_bo->pitch;
-			priv->mapped = MAPPED_CPU;
+			priv->mapped = src == MAP(priv->gpu_bo->map__cpu) ? MAPPED_CPU : MAPPED_GTT;
 			assert_pixmap_map(pixmap, priv);
-			priv->cpu = true;
+			priv->cpu &= priv->mapped == MAPPED_CPU;
 		}
 	}
 
@@ -17199,6 +17266,7 @@ static struct sna_pixmap *sna_accel_scanout(struct sna *sna)
 
 	assert(sna->vblank_interval);
 	assert(sna->front);
+	assert(!sna->mode.hidden);
 
 	priv = sna_pixmap(sna->front);
 	if (priv->gpu_bo == NULL)
@@ -17217,7 +17285,7 @@ static void sna_accel_disarm_timer(struct sna *sna, int id)
 static bool has_offload_slaves(struct sna *sna)
 {
 #if HAS_PIXMAP_SHARING
-	ScreenPtr screen = sna->scrn->pScreen;
+	ScreenPtr screen = to_screen_from_sna(sna);
 	PixmapDirtyUpdatePtr dirty;
 
 	xorg_list_for_each_entry(dirty, &screen->pixmap_dirty_list, ent) {
@@ -17365,7 +17433,7 @@ static bool sna_accel_do_expire(struct sna *sna)
 static void sna_accel_post_damage(struct sna *sna)
 {
 #if HAS_PIXMAP_SHARING
-	ScreenPtr screen = sna->scrn->pScreen;
+	ScreenPtr screen = to_screen_from_sna(sna);
 	PixmapDirtyUpdatePtr dirty;
 	bool flush = false;
 
@@ -17793,17 +17861,32 @@ static bool sna_option_accel_none(struct sna *sna)
 	if (xf86ReturnOptValBool(sna->Options, OPTION_ACCEL_DISABLE, FALSE))
 		return true;
 
+	if (sna->kgem.gen >= 0120)
+		return true;
+
+	if (!intel_option_cast_to_bool(sna->Options,
+				       OPTION_ACCEL_METHOD,
+				       !IS_DEFAULT_ACCEL_METHOD(NOACCEL)))
+		return false;
+
+#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,7,99,901,0)
 	s = xf86GetOptValString(sna->Options, OPTION_ACCEL_METHOD);
 	if (s == NULL)
 		return IS_DEFAULT_ACCEL_METHOD(NOACCEL);
 
 	return strcasecmp(s, "none") == 0;
+#else
+	return IS_DEFAULT_ACCEL_METHOD(NOACCEL);
+#endif
 }
 
 static bool sna_option_accel_blt(struct sna *sna)
 {
 	const char *s;
 
+	if (sna->kgem.gen >= 0110)
+		return true;
+
 	s = xf86GetOptValString(sna->Options, OPTION_ACCEL_METHOD);
 	if (s == NULL)
 		return false;
@@ -17892,21 +17975,21 @@ bool sna_accel_init(ScreenPtr screen, struct sna *sna)
 		backend = "disabled";
 		sna->kgem.wedged = true;
 		sna_render_mark_wedged(sna);
-	} else if (sna_option_accel_blt(sna) || sna->info->gen >= 0110)
+	} else if (sna_option_accel_blt(sna))
 		(void)backend;
-	else if (sna->info->gen >= 0100)
+	else if (sna->kgem.gen >= 0100)
 		backend = gen8_render_init(sna, backend);
-	else if (sna->info->gen >= 070)
+	else if (sna->kgem.gen >= 070)
 		backend = gen7_render_init(sna, backend);
-	else if (sna->info->gen >= 060)
+	else if (sna->kgem.gen >= 060)
 		backend = gen6_render_init(sna, backend);
-	else if (sna->info->gen >= 050)
+	else if (sna->kgem.gen >= 050)
 		backend = gen5_render_init(sna, backend);
-	else if (sna->info->gen >= 040)
+	else if (sna->kgem.gen >= 040)
 		backend = gen4_render_init(sna, backend);
-	else if (sna->info->gen >= 030)
+	else if (sna->kgem.gen >= 030)
 		backend = gen3_render_init(sna, backend);
-	else if (sna->info->gen >= 020)
+	else if (sna->kgem.gen >= 020)
 		backend = gen2_render_init(sna, backend);
 
 	DBG(("%s(backend=%s, prefer_gpu=%x)\n",
@@ -18003,7 +18086,7 @@ void sna_accel_close(struct sna *sna)
 	kgem_cleanup_cache(&sna->kgem);
 }
 
-void sna_accel_block_handler(struct sna *sna, struct timeval **tv)
+void sna_accel_block(struct sna *sna, struct timeval **tv)
 {
 	sigtrap_assert_inactive();
 
@@ -18083,22 +18166,6 @@ set_tv:
 	}
 }
 
-void sna_accel_wakeup_handler(struct sna *sna)
-{
-	DBG(("%s: nbatch=%d, need_retire=%d, need_purge=%d\n", __FUNCTION__,
-	     sna->kgem.nbatch, sna->kgem.need_retire, sna->kgem.need_purge));
-
-	if (!sna->kgem.nbatch)
-		return;
-
-	if (kgem_is_idle(&sna->kgem)) {
-		DBG(("%s: GPU idle, flushing\n", __FUNCTION__));
-		_kgem_submit(&sna->kgem);
-	}
-
-	sigtrap_assert_inactive();
-}
-
 void sna_accel_free(struct sna *sna)
 {
 	DBG(("%s\n", __FUNCTION__));
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index de8f6ec..59b8141 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -237,18 +237,12 @@ static bool sna_blt_fill_init(struct sna *sna,
 	return true;
 }
 
-noinline static void sna_blt_fill_begin(struct sna *sna,
-					const struct sna_blt_state *blt)
+noinline static void __sna_blt_fill_begin(struct sna *sna,
+					  const struct sna_blt_state *blt)
 {
 	struct kgem *kgem = &sna->kgem;
 	uint32_t *b;
 
-	if (kgem->nreloc) {
-		_kgem_submit(kgem);
-		_kgem_set_mode(kgem, KGEM_BLT);
-		assert(kgem->nbatch == 0);
-	}
-
 	assert(kgem->mode == KGEM_BLT);
 	b = kgem->batch + kgem->nbatch;
 	if (sna->kgem.gen >= 0100) {
@@ -293,6 +287,20 @@ noinline static void sna_blt_fill_begin(struct sna *sna,
 	}
 }
 
+inline static void sna_blt_fill_begin(struct sna *sna,
+				      const struct sna_blt_state *blt)
+{
+	struct kgem *kgem = &sna->kgem;
+
+	if (kgem->nreloc) {
+		_kgem_submit(kgem);
+		_kgem_set_mode(kgem, KGEM_BLT);
+		assert(kgem->nbatch == 0);
+	}
+
+	__sna_blt_fill_begin(sna, blt);
+}
+
 inline static void sna_blt_fill_one(struct sna *sna,
 				    const struct sna_blt_state *blt,
 				    int16_t x, int16_t y,
@@ -912,8 +920,27 @@ sna_composite_mask_is_opaque(PicturePtr mask)
 		return is_solid(mask) && is_white(mask);
 	else if (!PICT_FORMAT_A(mask->format))
 		return true;
-	else
-		return is_solid(mask) && is_opaque_solid(mask);
+	else if (mask->pSourcePict) {
+		PictSolidFill *fill = (PictSolidFill *) mask->pSourcePict;
+		return (fill->color >> 24) == 0xff;
+	} else {
+		struct sna_pixmap *priv;
+		assert(mask->pDrawable);
+
+		if (mask->pDrawable->width  == 1 &&
+		    mask->pDrawable->height == 1 &&
+		    mask->repeat)
+			return pixel_is_opaque(get_pixel(mask), mask->format);
+
+		if (mask->transform)
+			return false;
+
+		priv = sna_pixmap_from_drawable(mask->pDrawable);
+		if (priv == NULL || !priv->clear)
+			return false;
+
+		return pixel_is_opaque(priv->clear_color, mask->format);
+	}
 }
 
 fastcall
@@ -2597,24 +2624,20 @@ clear:
 				op = PictOpSrc;
 			if (op == PictOpOver) {
 				color = over(get_solid_color(src, PICT_a8r8g8b8),
-					     color_convert(sna_pixmap(tmp->dst.pixmap)->clear_color,
-							   dst->format, PICT_a8r8g8b8));
+					     solid_color(dst->format, sna_pixmap(tmp->dst.pixmap)->clear_color));
 				op = PictOpSrc;
 				DBG(("%s: precomputing solid OVER (%08x, %08x) -> %08x\n",
 				     __FUNCTION__, get_solid_color(src, PICT_a8r8g8b8),
-				     color_convert(sna_pixmap(tmp->dst.pixmap)->clear_color,
-						   dst->format, PICT_a8r8g8b8),
+				     solid_color(dst->format, sna_pixmap(tmp->dst.pixmap)->clear_color),
 				     color));
 			}
 			if (op == PictOpAdd) {
 				color = add(get_solid_color(src, PICT_a8r8g8b8),
-					    color_convert(sna_pixmap(tmp->dst.pixmap)->clear_color,
-							  dst->format, PICT_a8r8g8b8));
+					    solid_color(dst->format, sna_pixmap(tmp->dst.pixmap)->clear_color));
 				op = PictOpSrc;
 				DBG(("%s: precomputing solid ADD (%08x, %08x) -> %08x\n",
 				     __FUNCTION__, get_solid_color(src, PICT_a8r8g8b8),
-				     color_convert(sna_pixmap(tmp->dst.pixmap)->clear_color,
-						   dst->format, PICT_a8r8g8b8),
+				     solid_color(dst->format, sna_pixmap(tmp->dst.pixmap)->clear_color),
 				     color));
 			}
 		}
@@ -2720,8 +2743,8 @@ fill:
 	if (is_clear(src_pixmap)) {
 		if (src->repeat ||
 		    (x >= 0 && y >= 0 &&
-		     x + width  < src_pixmap->drawable.width &&
-		     y + height < src_pixmap->drawable.height)) {
+		     x + width  <= src_pixmap->drawable.width &&
+		     y + height <= src_pixmap->drawable.height)) {
 			color = color_convert(sna_pixmap(src_pixmap)->clear_color,
 					      src->format, tmp->dst.format);
 			goto fill;
@@ -3062,7 +3085,7 @@ static void sna_blt_fill_op_blt(struct sna *sna,
 	if (sna->blt_state.fill_bo != op->base.u.blt.bo[0]->unique_id) {
 		const struct sna_blt_state *blt = &op->base.u.blt;
 
-		sna_blt_fill_begin(sna, blt);
+		__sna_blt_fill_begin(sna, blt);
 
 		sna->blt_state.fill_bo = blt->bo[0]->unique_id;
 		sna->blt_state.fill_pixel = blt->pixel;
@@ -3079,7 +3102,7 @@ fastcall static void sna_blt_fill_op_box(struct sna *sna,
 	if (sna->blt_state.fill_bo != op->base.u.blt.bo[0]->unique_id) {
 		const struct sna_blt_state *blt = &op->base.u.blt;
 
-		sna_blt_fill_begin(sna, blt);
+		__sna_blt_fill_begin(sna, blt);
 
 		sna->blt_state.fill_bo = blt->bo[0]->unique_id;
 		sna->blt_state.fill_pixel = blt->pixel;
@@ -3097,7 +3120,7 @@ fastcall static void sna_blt_fill_op_boxes(struct sna *sna,
 	if (sna->blt_state.fill_bo != op->base.u.blt.bo[0]->unique_id) {
 		const struct sna_blt_state *blt = &op->base.u.blt;
 
-		sna_blt_fill_begin(sna, blt);
+		__sna_blt_fill_begin(sna, blt);
 
 		sna->blt_state.fill_bo = blt->bo[0]->unique_id;
 		sna->blt_state.fill_pixel = blt->pixel;
@@ -3132,7 +3155,7 @@ fastcall static void sna_blt_fill_op_points(struct sna *sna,
 	DBG(("%s: %08x x %d\n", __FUNCTION__, blt->pixel, n));
 
 	if (sna->blt_state.fill_bo != op->base.u.blt.bo[0]->unique_id) {
-		sna_blt_fill_begin(sna, blt);
+		__sna_blt_fill_begin(sna, blt);
 
 		sna->blt_state.fill_bo = blt->bo[0]->unique_id;
 		sna->blt_state.fill_pixel = blt->pixel;
diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index f01f020..c6de9d5 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -653,8 +653,9 @@ sna_composite(CARD8 op,
 	RegionRec region;
 	int dx, dy;
 
-	DBG(("%s(%d src=%ld+(%d, %d), mask=%ld+(%d, %d), dst=%ld+(%d, %d)+(%d, %d), size=(%d, %d)\n",
-	     __FUNCTION__, op,
+	DBG(("%s(pixmap=%ld, op=%d, src=%ld+(%d, %d), mask=%ld+(%d, %d), dst=%ld+(%d, %d)+(%d, %d), size=(%d, %d)\n",
+	     __FUNCTION__,
+	     pixmap->drawable.serialNumber, op,
 	     get_picture_id(src), src_x, src_y,
 	     get_picture_id(mask), mask_x, mask_y,
 	     get_picture_id(dst), dst_x, dst_y,
@@ -673,13 +674,6 @@ sna_composite(CARD8 op,
 			src = sna->clear;
 	}
 
-	if (mask && sna_composite_mask_is_opaque(mask)) {
-		DBG(("%s: removing opaque %smask\n",
-		     __FUNCTION__,
-		     mask->componentAlpha && PICT_FORMAT_RGB(mask->format) ? "CA " : ""));
-		mask = NULL;
-	}
-
 	if (!sna_compute_composite_region(&region,
 					  src, mask, dst,
 					  src_x,  src_y,
@@ -688,6 +682,13 @@ sna_composite(CARD8 op,
 					  width,  height))
 		return;
 
+	if (mask && sna_composite_mask_is_opaque(mask)) {
+		DBG(("%s: removing opaque %smask\n",
+		     __FUNCTION__,
+		     mask->componentAlpha && PICT_FORMAT_RGB(mask->format) ? "CA " : ""));
+		mask = NULL;
+	}
+
 	if (NO_COMPOSITE)
 		goto fallback;
 
@@ -797,8 +798,10 @@ sna_composite_rectangles(CARD8		 op,
 	int i, num_boxes;
 	unsigned hint;
 
-	DBG(("%s(op=%d, %08x x %d [(%d, %d)x(%d, %d) ...])\n",
-	     __FUNCTION__, op,
+	DBG(("%s(pixmap=%ld, op=%d, %08x x %d [(%d, %d)x(%d, %d) ...])\n",
+	     __FUNCTION__,
+	     get_drawable_pixmap(dst->pDrawable)->drawable.serialNumber,
+	     op,
 	     (color->alpha >> 8 << 24) |
 	     (color->red   >> 8 << 16) |
 	     (color->green >> 8 << 8) |
@@ -814,38 +817,40 @@ sna_composite_rectangles(CARD8		 op,
 		return;
 	}
 
-	if ((color->red|color->green|color->blue|color->alpha) <= 0x00ff) {
-		switch (op) {
-		case PictOpOver:
-		case PictOpOutReverse:
-		case PictOpAdd:
-			return;
-		case  PictOpInReverse:
-		case  PictOpSrc:
-			op = PictOpClear;
-			break;
-		case  PictOpAtopReverse:
-			op = PictOpOut;
-			break;
-		case  PictOpXor:
-			op = PictOpOverReverse;
-			break;
-		}
-	}
 	if (color->alpha <= 0x00ff) {
-		switch (op) {
-		case PictOpOver:
-		case PictOpOutReverse:
-			return;
-		case  PictOpInReverse:
-			op = PictOpClear;
-			break;
-		case  PictOpAtopReverse:
-			op = PictOpOut;
-			break;
-		case  PictOpXor:
-			op = PictOpOverReverse;
-			break;
+		if (PICT_FORMAT_TYPE(dst->format) == PICT_TYPE_A ||
+		    (color->red|color->green|color->blue) <= 0x00ff) {
+			switch (op) {
+			case PictOpOver:
+			case PictOpOutReverse:
+			case PictOpAdd:
+				return;
+			case  PictOpInReverse:
+			case  PictOpSrc:
+				op = PictOpClear;
+				break;
+			case  PictOpAtopReverse:
+				op = PictOpOut;
+				break;
+			case  PictOpXor:
+				op = PictOpOverReverse;
+				break;
+			}
+		} else {
+			switch (op) {
+			case PictOpOver:
+			case PictOpOutReverse:
+				return;
+			case  PictOpInReverse:
+				op = PictOpClear;
+				break;
+			case  PictOpAtopReverse:
+				op = PictOpOut;
+				break;
+			case  PictOpXor:
+				op = PictOpOverReverse;
+				break;
+			}
 		}
 	} else if (color->alpha >= 0xff00) {
 		switch (op) {
@@ -863,11 +868,16 @@ sna_composite_rectangles(CARD8		 op,
 		case  PictOpXor:
 			op = PictOpOut;
 			break;
+		case PictOpAdd:
+			if (PICT_FORMAT_TYPE(dst->format) == PICT_TYPE_A ||
+			    (color->red&color->green&color->blue) >= 0xff00)
+				op = PictOpSrc;
+			break;
 		}
 	}
 
 	/* Avoid reducing overlapping translucent rectangles */
-	if (op == PictOpOver &&
+	if ((op == PictOpOver || op == PictOpAdd) &&
 	    num_rects == 1 &&
 	    sna_drawable_is_clear(dst->pDrawable))
 		op = PictOpSrc;
@@ -979,6 +989,9 @@ sna_composite_rectangles(CARD8		 op,
 			bool ok;
 
 			if (op == PictOpClear) {
+				if (priv->clear_color == 0)
+					goto done;
+
 				ok = sna_get_pixel_from_rgba(&pixel,
 							     0, 0, 0, 0,
 							     dst->format);
@@ -990,8 +1003,11 @@ sna_composite_rectangles(CARD8		 op,
 							     color->alpha,
 							     dst->format);
 			}
-			if (ok && priv->clear_color == pixel)
+			if (ok && priv->clear_color == pixel) {
+				DBG(("%s: matches current clear, skipping\n",
+				     __FUNCTION__));
 				goto done;
+			}
 		}
 
 		if (region.data == NULL) {
diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
index 4b218b7..3ea3436 100644
--- a/src/sna/sna_display.c
+++ b/src/sna/sna_display.c
@@ -88,6 +88,8 @@ union compat_mode_get_connector{
 #define DEFAULT_DPI 96
 #endif
 
+#define OUTPUT_STATUS_CACHE_MS 15000
+
 #define DRM_MODE_PAGE_FLIP_ASYNC 0x02
 
 #define DRM_CLIENT_CAP_UNIVERSAL_PLANES 2
@@ -114,19 +116,34 @@ struct local_mode_obj_get_properties {
 
 extern XF86ConfigPtr xf86configptr;
 
+struct sna_cursor {
+	struct sna_cursor *next;
+	uint32_t *image;
+	bool transformed;
+	Rotation rotation;
+	int ref;
+	int size;
+	int last_width;
+	int last_height;
+	unsigned handle;
+	unsigned serial;
+	unsigned alloc;
+};
+
 struct sna_crtc {
 	xf86CrtcPtr base;
 	struct drm_mode_modeinfo kmode;
-	int dpms_mode;
 	PixmapPtr slave_pixmap;
 	DamagePtr slave_damage;
-	struct kgem_bo *bo, *shadow_bo, *client_bo;
+	struct kgem_bo *bo, *shadow_bo, *client_bo, *cache_bo;
 	struct sna_cursor *cursor;
 	unsigned int last_cursor_size;
 	uint32_t offset;
 	bool shadow;
 	bool fallback_shadow;
 	bool transform;
+	bool cursor_transform;
+	bool hwcursor;
 	bool flip_pending;
 	uint8_t id;
 	uint8_t pipe;
@@ -188,6 +205,9 @@ struct sna_output {
 	struct backlight backlight;
 	int backlight_active_level;
 
+	uint32_t last_detect;
+	uint32_t status;
+
 	int num_modes;
 	struct drm_mode_modeinfo *modes;
 
@@ -281,6 +301,14 @@ int sna_crtc_to_pipe(xf86CrtcPtr crtc)
 	return to_sna_crtc(crtc)->pipe;
 }
 
+int sna_crtc_to_pipe__safe(xf86CrtcPtr crtc)
+{
+	if (to_sna_crtc(crtc))
+		return sna_crtc_to_pipe(crtc);
+	else
+		return sna_crtc_to_pipe(sna_primary_crtc(to_sna(crtc->scrn)));
+}
+
 uint32_t sna_crtc_to_sprite(xf86CrtcPtr crtc)
 {
 	assert(to_sna_crtc(crtc));
@@ -299,34 +327,48 @@ bool sna_crtc_is_transformed(xf86CrtcPtr crtc)
 	return to_sna_crtc(crtc)->transform;
 }
 
-static inline uint64_t msc64(struct sna_crtc *sna_crtc, uint32_t seq)
+static inline bool msc64(struct sna_crtc *sna_crtc, uint32_t seq, uint64_t *msc)
 {
+	bool record = true;
 	if (seq < sna_crtc->last_seq) {
 		if (sna_crtc->last_seq - seq > 0x40000000) {
 			sna_crtc->wrap_seq++;
 			DBG(("%s: pipe=%d wrapped; was %u, now %u, wraps=%u\n",
 			     __FUNCTION__, sna_crtc->pipe,
 			     sna_crtc->last_seq, seq, sna_crtc->wrap_seq));
-		} else  {
-			ERR(("%s: pipe=%d msc went backwards; was %u, now %u\n",
+		} else {
+			DBG(("%s: pipe=%d msc went backwards; was %u, now %u; ignoring for last_swap\n",
 			     __FUNCTION__, sna_crtc->pipe, sna_crtc->last_seq, seq));
-			seq = sna_crtc->last_seq;
+
+			record = false;
 		}
 	}
-	sna_crtc->last_seq = seq;
-	return (uint64_t)sna_crtc->wrap_seq << 32 | seq;
+	*msc = (uint64_t)sna_crtc->wrap_seq << 32 | seq;
+	return record;
 }
 
 uint64_t sna_crtc_record_swap(xf86CrtcPtr crtc,
 			      int tv_sec, int tv_usec, unsigned seq)
 {
 	struct sna_crtc *sna_crtc = to_sna_crtc(crtc);
+	uint64_t msc;
+
 	assert(sna_crtc);
-	DBG(("%s: recording last swap on pipe=%d, frame %d, time %d.%06d\n",
-	     __FUNCTION__, sna_crtc->pipe, seq, tv_sec, tv_usec));
-	sna_crtc->swap.tv_sec = tv_sec;
-	sna_crtc->swap.tv_usec = tv_usec;
-	return sna_crtc->swap.msc = msc64(sna_crtc, seq);
+
+	if (msc64(sna_crtc, seq, &msc)) {
+		DBG(("%s: recording last swap on pipe=%d, frame %d [%08llx], time %d.%06d\n",
+		     __FUNCTION__, sna_crtc->pipe, seq, (long long)msc,
+		     tv_sec, tv_usec));
+		sna_crtc->swap.tv_sec = tv_sec;
+		sna_crtc->swap.tv_usec = tv_usec;
+		sna_crtc->swap.msc = msc;
+	} else {
+		DBG(("%s: swap event on pipe=%d, frame %d [%08llx], time %d.%06d\n",
+		     __FUNCTION__, sna_crtc->pipe, seq, (long long)msc,
+		     tv_sec, tv_usec));
+	}
+
+	return msc;
 }
 
 const struct ust_msc *sna_crtc_last_swap(xf86CrtcPtr crtc)
@@ -342,15 +384,6 @@ const struct ust_msc *sna_crtc_last_swap(xf86CrtcPtr crtc)
 	}
 }
 
-xf86CrtcPtr sna_mode_first_crtc(struct sna *sna)
-{
-	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
-	if (sna->mode.num_real_crtc)
-		return config->crtc[0];
-	else
-		return NULL;
-}
-
 #ifndef NDEBUG
 static void gem_close(int fd, uint32_t handle);
 static void assert_scanout(struct kgem *kgem, struct kgem_bo *bo,
@@ -497,8 +530,6 @@ sna_backlight_uevent(int fd, void *closure)
 		if (sna_output->dpms_mode != DPMSModeOn)
 			continue;
 
-		assert(output->randr_output);
-
 		val = backlight_get(&sna_output->backlight);
 		if (val < 0)
 			continue;
@@ -523,6 +554,7 @@ sna_backlight_uevent(int fd, void *closure)
 					       TRUE, FALSE);
 		}
 	}
+	DBG(("%s: complete\n", __FUNCTION__));
 }
 
 static void sna_backlight_pre_init(struct sna *sna)
@@ -570,6 +602,7 @@ static void sna_backlight_drain_uevents(struct sna *sna)
 	if (sna->mode.backlight_monitor == NULL)
 		return;
 
+	DBG(("%s()\n", __FUNCTION__));
 	sna_backlight_uevent(udev_monitor_get_fd(sna->mode.backlight_monitor),
 			     sna);
 }
@@ -781,6 +814,7 @@ mode_from_kmode(ScrnInfoPtr scrn,
 	mode->VTotal = kmode->vtotal;
 	mode->VScan = kmode->vscan;
 
+	mode->VRefresh = kmode->vrefresh;
 	mode->Flags = kmode->flags;
 	mode->name = get_kmode_name(kmode);
 
@@ -814,6 +848,7 @@ mode_to_kmode(struct drm_mode_modeinfo *kmode, DisplayModePtr mode)
 	kmode->vtotal = mode->VTotal;
 	kmode->vscan = mode->VScan;
 
+	kmode->vrefresh = mode->VRefresh;
 	kmode->flags = mode->Flags;
 	if (mode->name)
 		strncpy(kmode->name, mode->name, DRM_DISPLAY_MODE_LEN);
@@ -827,8 +862,7 @@ sna_crtc_force_outputs_on(xf86CrtcPtr crtc)
 	int i;
 
 	assert(to_sna_crtc(crtc));
-	DBG(("%s(pipe=%d), currently? %d\n", __FUNCTION__,
-	     to_sna_crtc(crtc)->pipe, to_sna_crtc(crtc)->dpms_mode));
+	DBG(("%s(pipe=%d)\n", __FUNCTION__, to_sna_crtc(crtc)->pipe));
 
 	/* DPMS handling by the kernel is inconsistent, so after setting a
 	 * mode on an output presume that we intend for it to be on, or that
@@ -846,7 +880,6 @@ sna_crtc_force_outputs_on(xf86CrtcPtr crtc)
 		output->funcs->dpms(output, DPMSModeOn);
 	}
 
-	to_sna_crtc(crtc)->dpms_mode = DPMSModeOn;
 #if XF86_CRTC_VERSION >= 3
 	crtc->active = TRUE;
 #endif
@@ -859,8 +892,7 @@ sna_crtc_force_outputs_off(xf86CrtcPtr crtc)
 	int i;
 
 	assert(to_sna_crtc(crtc));
-	DBG(("%s(pipe=%d), currently? %d\n", __FUNCTION__,
-	     to_sna_crtc(crtc)->pipe, to_sna_crtc(crtc)->dpms_mode));
+	DBG(("%s(pipe=%d)\n", __FUNCTION__, to_sna_crtc(crtc)->pipe));
 
 	/* DPMS handling by the kernel is inconsistent, so after setting a
 	 * mode on an output presume that we intend for it to be on, or that
@@ -877,8 +909,6 @@ sna_crtc_force_outputs_off(xf86CrtcPtr crtc)
 
 		output->funcs->dpms(output, DPMSModeOff);
 	}
-
-	to_sna_crtc(crtc)->dpms_mode = DPMSModeOff;
 }
 
 static unsigned
@@ -1099,7 +1129,6 @@ static bool wait_for_shadow(struct sna *sna,
 			    unsigned flags)
 {
 	PixmapPtr pixmap = priv->pixmap;
-	DamagePtr damage;
 	struct kgem_bo *bo, *tmp;
 	int flip_active;
 	bool ret = true;
@@ -1154,9 +1183,7 @@ static bool wait_for_shadow(struct sna *sna,
 	}
 
 	assert(sna->mode.shadow_active);
-
-	damage = sna->mode.shadow_damage;
-	sna->mode.shadow_damage = NULL;
+	sna->mode.shadow_enabled = false;
 
 	flip_active = sna->mode.flip_active;
 	if (flip_active) {
@@ -1208,6 +1235,7 @@ static bool wait_for_shadow(struct sna *sna,
 			bo = sna->mode.shadow;
 		}
 	}
+	sna->mode.shadow_enabled = true;
 
 	if (bo->refcnt > 1) {
 		bo = kgem_create_2d(&sna->kgem,
@@ -1230,8 +1258,6 @@ static bool wait_for_shadow(struct sna *sna,
 			bo = sna->mode.shadow;
 	}
 
-	sna->mode.shadow_damage = damage;
-
 	RegionSubtract(&sna->mode.shadow_region,
 		       &sna->mode.shadow_region,
 		       &sna->mode.shadow_cancel);
@@ -1358,22 +1384,38 @@ bool sna_pixmap_discard_shadow_damage(struct sna_pixmap *priv,
 	return RegionNil(&sna->mode.shadow_region);
 }
 
+static void sna_mode_damage(DamagePtr damage, RegionPtr region, void *closure)
+{
+	/* Throw away the rectangle list, keeping only the region's extents */
+	region = DamageRegion(damage);
+	if (region->data) {
+		RegionRec dup;
+
+		dup = *region;
+		RegionUninit(&dup);
+
+		region->data = NULL;
+	}
+}
+
 static bool sna_mode_enable_shadow(struct sna *sna)
 {
-	ScreenPtr screen = sna->scrn->pScreen;
+	ScreenPtr screen = to_screen_from_sna(sna);
 
 	DBG(("%s\n", __FUNCTION__));
 	assert(sna->mode.shadow == NULL);
 	assert(sna->mode.shadow_damage == NULL);
 	assert(sna->mode.shadow_active == 0);
+	assert(!sna->mode.shadow_enabled);
 
-	sna->mode.shadow_damage = DamageCreate(NULL, NULL,
-					       DamageReportNone, TRUE,
-					       screen, screen);
+	sna->mode.shadow_damage = DamageCreate(sna_mode_damage, NULL,
+					       DamageReportRawRegion,
+					       TRUE, screen, sna);
 	if (!sna->mode.shadow_damage)
 		return false;
 
 	DamageRegister(&sna->front->drawable, sna->mode.shadow_damage);
+	sna->mode.shadow_enabled = true;
 	return true;
 }
 
@@ -1381,8 +1423,10 @@ static void sna_mode_disable_shadow(struct sna *sna)
 {
 	struct sna_pixmap *priv;
 
-	if (!sna->mode.shadow_damage)
+	if (!sna->mode.shadow_damage) {
+		assert(!sna->mode.shadow_enabled);
 		return;
+	}
 
 	DBG(("%s\n", __FUNCTION__));
 
@@ -1393,6 +1437,7 @@ static void sna_mode_disable_shadow(struct sna *sna)
 	DamageUnregister(&sna->front->drawable, sna->mode.shadow_damage);
 	DamageDestroy(sna->mode.shadow_damage);
 	sna->mode.shadow_damage = NULL;
+	sna->mode.shadow_enabled = false;
 
 	if (sna->mode.shadow) {
 		kgem_bo_destroy(&sna->kgem, sna->mode.shadow);
@@ -1443,9 +1488,12 @@ static bool sna_crtc_enable_shadow(struct sna *sna, struct sna_crtc *crtc)
 	if (crtc->slave_pixmap) {
 		assert(crtc->slave_damage == NULL);
 
+		DBG(("%s: enabling PRIME slave tracking on CRTC %d [pipe=%d], pixmap=%ld\n",
+		     __FUNCTION__, crtc->id, crtc->pipe, crtc->slave_pixmap->drawable.serialNumber));
 		crtc->slave_damage = DamageCreate(sna_crtc_slave_damage, NULL,
 						  DamageReportRawRegion, TRUE,
-						  sna->scrn->pScreen, crtc);
+						  to_screen_from_sna(sna),
+						  crtc);
 		if (crtc->slave_damage == NULL) {
 			if (!--sna->mode.shadow_active)
 				sna_mode_disable_shadow(sna);
@@ -1465,6 +1513,8 @@ static void sna_crtc_disable_override(struct sna *sna, struct sna_crtc *crtc)
 	if (crtc->client_bo == NULL)
 		return;
 
+	assert(crtc->client_bo->refcnt > crtc->client_bo->active_scanout);
+
 	if (!crtc->transform) {
 		DrawableRec tmp;
 
@@ -1517,14 +1567,22 @@ __sna_crtc_disable(struct sna *sna, struct sna_crtc *sna_crtc)
 	sna_crtc_disable_shadow(sna, sna_crtc);
 
 	if (sna_crtc->bo) {
+		DBG(("%s: releasing handle=%d from scanout, active=%d\n",
+		     __FUNCTION__,sna_crtc->bo->handle, sna_crtc->bo->active_scanout-1));
 		assert(sna_crtc->bo->active_scanout);
 		assert(sna_crtc->bo->refcnt >= sna_crtc->bo->active_scanout);
 		sna_crtc->bo->active_scanout--;
 		kgem_bo_destroy(&sna->kgem, sna_crtc->bo);
 		sna_crtc->bo = NULL;
 
-		assert(sna->mode.front_active);
-		sna->mode.front_active--;
+		if (sna->mode.hidden) {
+			sna->mode.hidden--;
+			assert(sna->mode.hidden);
+			assert(sna->mode.front_active == 0);
+		} else {
+			assert(sna->mode.front_active);
+			sna->mode.front_active--;
+		}
 		sna->mode.dirty = true;
 	}
 
@@ -1532,13 +1590,19 @@ __sna_crtc_disable(struct sna *sna, struct sna_crtc *sna_crtc)
 		kgem_bo_destroy(&sna->kgem, sna_crtc->shadow_bo);
 		sna_crtc->shadow_bo = NULL;
 	}
-	sna_crtc->transform = false;
+	if (sna_crtc->transform) {
+		assert(sna->mode.rr_active);
+		sna->mode.rr_active--;
+		sna_crtc->transform = false;
+	}
 
+	sna_crtc->cursor_transform = false;
+	sna_crtc->hwcursor = true;
 	assert(!sna_crtc->shadow);
 }
 
 static void
-sna_crtc_disable(xf86CrtcPtr crtc)
+sna_crtc_disable(xf86CrtcPtr crtc, bool force)
 {
 	struct sna *sna = to_sna(crtc->scrn);
 	struct sna_crtc *sna_crtc = to_sna_crtc(crtc);
@@ -1547,11 +1611,13 @@ sna_crtc_disable(xf86CrtcPtr crtc)
 	if (sna_crtc == NULL)
 		return;
 
-	DBG(("%s: disabling crtc [%d, pipe=%d]\n", __FUNCTION__,
-	     sna_crtc->id, sna_crtc->pipe));
+	if (!force && sna_crtc->bo == NULL)
+		return;
+
+	DBG(("%s: disabling crtc [%d, pipe=%d], force?=%d\n", __FUNCTION__,
+	     sna_crtc->id, sna_crtc->pipe, force));
 
 	sna_crtc_force_outputs_off(crtc);
-	assert(sna_crtc->dpms_mode == DPMSModeOff);
 
 	memset(&arg, 0, sizeof(arg));
 	arg.crtc_id = sna_crtc->id;
@@ -1579,7 +1645,7 @@ static void update_flush_interval(struct sna *sna)
 			continue;
 		}
 
-		if (to_sna_crtc(crtc)->dpms_mode != DPMSModeOn) {
+		if (to_sna_crtc(crtc)->bo == NULL) {
 			DBG(("%s: CRTC:%d (pipe %d) turned off\n",
 			     __FUNCTION__,i, to_sna_crtc(crtc)->pipe));
 			continue;
@@ -1642,7 +1708,7 @@ void sna_copy_fbcon(struct sna *sna)
 	int dx, dy;
 	int i;
 
-	if (wedged(sna))
+	if (wedged(sna) || isGPU(sna->scrn))
 		return;
 
 	DBG(("%s\n", __FUNCTION__));
@@ -1726,7 +1792,7 @@ void sna_copy_fbcon(struct sna *sna)
 	kgem_bo_destroy(&sna->kgem, bo);
 
 #if ABI_VIDEODRV_VERSION >= SET_ABI_VERSION(10, 0)
-	sna->scrn->pScreen->canDoBGNoneRoot = ok;
+	to_screen_from_sna(sna)->canDoBGNoneRoot = ok;
 #endif
 }
 
@@ -1736,7 +1802,6 @@ static bool use_shadow(struct sna *sna, xf86CrtcPtr crtc)
 	PictTransform crtc_to_fb;
 	struct pict_f_transform f_crtc_to_fb, f_fb_to_crtc;
 	unsigned pitch_limit;
-	struct sna_pixmap *priv;
 	BoxRec b;
 
 	assert(sna->scrn->virtualX && sna->scrn->virtualY);
@@ -1765,27 +1830,31 @@ static bool use_shadow(struct sna *sna, xf86CrtcPtr crtc)
 		return true;
 	}
 
-	priv = sna_pixmap_force_to_gpu(sna->front, MOVE_READ | __MOVE_SCANOUT);
-	if (priv == NULL)
-		return true; /* maybe we can create a bo for the scanout? */
-
-	if (sna->kgem.gen == 071)
-		pitch_limit = priv->gpu_bo->tiling ? 16 * 1024 : 32 * 1024;
-	else if ((sna->kgem.gen >> 3) > 4)
-		pitch_limit = 32 * 1024;
-	else if ((sna->kgem.gen >> 3) == 4)
-		pitch_limit = priv->gpu_bo->tiling ? 16 * 1024 : 32 * 1024;
-	else if ((sna->kgem.gen >> 3) == 3)
-		pitch_limit = priv->gpu_bo->tiling ? 8 * 1024 : 16 * 1024;
-	else
-		pitch_limit = 8 * 1024;
-	DBG(("%s: gpu bo handle=%d tiling=%d pitch=%d, limit=%d\n", __FUNCTION__, priv->gpu_bo->handle, priv->gpu_bo->tiling, priv->gpu_bo->pitch, pitch_limit));
-	if (priv->gpu_bo->pitch > pitch_limit)
-		return true;
+	if (!isGPU(sna->scrn)) {
+		struct sna_pixmap *priv;
 
-	if (priv->gpu_bo->tiling && sna->flags & SNA_LINEAR_FB) {
-		DBG(("%s: gpu bo is tiled, need linear, forcing shadow\n", __FUNCTION__));
-		return true;
+		priv = sna_pixmap_force_to_gpu(sna->front, MOVE_READ | __MOVE_SCANOUT);
+		if (priv == NULL)
+			return true; /* maybe we can create a bo for the scanout? */
+
+		if (sna->kgem.gen == 071)
+			pitch_limit = priv->gpu_bo->tiling ? 16 * 1024 : 32 * 1024;
+		else if ((sna->kgem.gen >> 3) > 4)
+			pitch_limit = 32 * 1024;
+		else if ((sna->kgem.gen >> 3) == 4)
+			pitch_limit = priv->gpu_bo->tiling ? 16 * 1024 : 32 * 1024;
+		else if ((sna->kgem.gen >> 3) == 3)
+			pitch_limit = priv->gpu_bo->tiling ? 8 * 1024 : 16 * 1024;
+		else
+			pitch_limit = 8 * 1024;
+		DBG(("%s: gpu bo handle=%d tiling=%d pitch=%d, limit=%d\n", __FUNCTION__, priv->gpu_bo->handle, priv->gpu_bo->tiling, priv->gpu_bo->pitch, pitch_limit));
+		if (priv->gpu_bo->pitch > pitch_limit)
+			return true;
+
+		if (priv->gpu_bo->tiling && sna->flags & SNA_LINEAR_FB) {
+			DBG(("%s: gpu bo is tiled, need linear, forcing shadow\n", __FUNCTION__));
+			return true;
+		}
 	}
 
 	transform = NULL;
@@ -1919,10 +1988,15 @@ static struct kgem_bo *sna_crtc_attach(xf86CrtcPtr crtc)
 	struct sna *sna = to_sna(scrn);
 	struct kgem_bo *bo;
 
-	sna_crtc->transform = false;
+	if (sna_crtc->transform) {
+		assert(sna->mode.rr_active);
+		sna_crtc->transform = false;
+		sna->mode.rr_active--;
+	}
 	sna_crtc->rotation = RR_Rotate_0;
 
 	if (use_shadow(sna, crtc)) {
+		PixmapPtr front;
 		unsigned long tiled_limit;
 		int tiling;
 
@@ -1977,8 +2051,8 @@ force_shadow:
 			return NULL;
 		}
 
-		if (__sna_pixmap_get_bo(sna->front) && !crtc->transformPresent) {
-			DrawableRec tmp;
+		front = sna_crtc->slave_pixmap ?: sna->front;
+		if (__sna_pixmap_get_bo(front) && !crtc->transformPresent) {
 			BoxRec b;
 
 			b.x1 = crtc->x;
@@ -1986,21 +2060,47 @@ force_shadow:
 			b.x2 = crtc->x + crtc->mode.HDisplay;
 			b.y2 = crtc->y + crtc->mode.VDisplay;
 
-			DBG(("%s: copying onto shadow CRTC: (%d, %d)x(%d, %d), handle=%d\n",
-			     __FUNCTION__,
-			     b.x1, b.y1,
-			     b.x2, b.y2,
-			     bo->handle));
-
-			tmp.width = crtc->mode.HDisplay;
-			tmp.height = crtc->mode.VDisplay;
-			tmp.depth = sna->front->drawable.depth;
-			tmp.bitsPerPixel = sna->front->drawable.bitsPerPixel;
+			if (b.x1 < 0)
+				b.x1 = 0;
+			if (b.y1 < 0)
+				b.y1 = 0;
+			if (b.x2 > scrn->virtualX)
+				b.x2 = scrn->virtualX;
+			if (b.y2 > scrn->virtualY)
+				b.y2 = scrn->virtualY;
+			if (b.x2 - b.x1 < crtc->mode.HDisplay ||
+			    b.y2 - b.y1 < crtc->mode.VDisplay) {
+				bool ok = false;
+				if (!wedged(sna))
+					ok = sna->render.fill_one(sna, front, bo, 0,
+								  0, 0, crtc->mode.HDisplay, crtc->mode.VDisplay,
+								  GXclear);
+				if (!ok) {
+					void *ptr = kgem_bo_map__gtt(&sna->kgem, bo);
+					if (ptr)
+						memset(ptr, 0, bo->pitch * crtc->mode.VDisplay);
+				}
+			}
+			if (b.y2 > b.y1 && b.x2 > b.x1) {
+				DrawableRec tmp;
 
-			(void)sna->render.copy_boxes(sna, GXcopy,
-						     &sna->front->drawable, __sna_pixmap_get_bo(sna->front), 0, 0,
-						     &tmp, bo, -b.x1, -b.y1,
-						     &b, 1, 0);
+				DBG(("%s: copying onto shadow CRTC: (%d, %d)x(%d, %d) [fb=%dx%d], handle=%d\n",
+				     __FUNCTION__,
+				     b.x1, b.y1,
+				     b.x2-b.x1, b.y2-b.y1,
+				     scrn->virtualX, scrn->virtualY,
+				     bo->handle));
+
+				tmp.width = crtc->mode.HDisplay;
+				tmp.height = crtc->mode.VDisplay;
+				tmp.depth = front->drawable.depth;
+				tmp.bitsPerPixel = front->drawable.bitsPerPixel;
+
+				(void)sna->render.copy_boxes(sna, GXcopy,
+							     &front->drawable, __sna_pixmap_get_bo(front), 0, 0,
+							     &tmp, bo, -crtc->x, -crtc->y,
+							     &b, 1, 0);
+			}
 		}
 
 		sna_crtc->shadow_bo_width = crtc->mode.HDisplay;
@@ -2008,6 +2108,7 @@ force_shadow:
 		sna_crtc->shadow_bo = bo;
 out_shadow:
 		sna_crtc->transform = true;
+		sna->mode.rr_active++;
 		return kgem_bo_reference(bo);
 	} else {
 		if (sna_crtc->shadow_bo) {
@@ -2093,6 +2194,8 @@ out_shadow:
 					goto force_shadow;
 				}
 
+				assert(__sna_pixmap_get_bo(sna->front) == NULL ||
+				       __sna_pixmap_get_bo(sna->front)->pitch == shadow->pitch);
 				sna->mode.shadow = shadow;
 				set_shadow(sna, &region);
 			}
@@ -2107,6 +2210,37 @@ out_shadow:
 	}
 }
 
+#define SCALING_EPSILON (1./256)
+
+static bool
+is_affine(const struct pixman_f_transform *t)
+{
+	return (fabs(t->m[2][0]) < SCALING_EPSILON &&
+		fabs(t->m[2][1]) < SCALING_EPSILON);
+}
+
+static double determinant(const struct pixman_f_transform *t)
+{
+	return t->m[0][0]*t->m[1][1] - t->m[1][0]*t->m[0][1];
+}
+
+static bool
+affine_is_pixel_exact(const struct pixman_f_transform *t)
+{
+	double det = t->m[2][2] * determinant(t);
+	if (fabs (det * det - 1.0) < SCALING_EPSILON) {
+		if (fabs(t->m[0][1]) < SCALING_EPSILON &&
+		    fabs(t->m[1][0]) < SCALING_EPSILON)
+			return true;
+
+		if (fabs(t->m[0][0]) < SCALING_EPSILON &&
+		    fabs(t->m[1][1]) < SCALING_EPSILON)
+			return true;
+	}
+
+	return false;
+}
+
 static void sna_crtc_randr(xf86CrtcPtr crtc)
 {
 	struct sna_crtc *sna_crtc = to_sna_crtc(crtc);
@@ -2152,6 +2286,16 @@ static void sna_crtc_randr(xf86CrtcPtr crtc)
 	} else
 		crtc->transform_in_use = sna_crtc->rotation != RR_Rotate_0;
 
+	if (needs_transform) {
+		sna_crtc->hwcursor = is_affine(&f_fb_to_crtc);
+		sna_crtc->cursor_transform =
+			sna_crtc->hwcursor &&
+			!affine_is_pixel_exact(&f_fb_to_crtc);
+	} else {
+		sna_crtc->hwcursor = true;
+		sna_crtc->cursor_transform = false;
+	}
+
 	crtc->crtc_to_framebuffer = crtc_to_fb;
 	crtc->f_crtc_to_framebuffer = f_crtc_to_fb;
 	crtc->f_framebuffer_to_crtc = f_fb_to_crtc;
@@ -2184,7 +2328,7 @@ static void sna_crtc_randr(xf86CrtcPtr crtc)
 static void
 sna_crtc_damage(xf86CrtcPtr crtc)
 {
-	ScreenPtr screen = crtc->scrn->pScreen;
+	ScreenPtr screen = xf86ScrnToScreen(crtc->scrn);
 	struct sna *sna = to_sna(crtc->scrn);
 	RegionRec region, *damage;
 
@@ -2200,6 +2344,12 @@ sna_crtc_damage(xf86CrtcPtr crtc)
 	if (region.extents.y2 > screen->height)
 		region.extents.y2 = screen->height;
 
+	if (region.extents.x2 <= region.extents.x1 ||
+	    region.extents.y2 <= region.extents.y1) {
+		DBG(("%s: crtc not damaged, all-clipped\n", __FUNCTION__));
+		return;
+	}
+
 	DBG(("%s: marking crtc %d as completely damaged (%d, %d), (%d, %d)\n",
 	     __FUNCTION__, to_sna_crtc(crtc)->id,
 	     region.extents.x1, region.extents.y1,
@@ -2268,11 +2418,18 @@ __sna_crtc_set_mode(xf86CrtcPtr crtc)
 	struct kgem_bo *saved_bo, *bo;
 	uint32_t saved_offset;
 	bool saved_transform;
+	bool saved_hwcursor;
+	bool saved_cursor_transform;
 
-	DBG(("%s\n", __FUNCTION__));
+	DBG(("%s: CRTC=%d, pipe=%d, hidden?=%d\n", __FUNCTION__,
+	     sna_crtc->id, sna_crtc->pipe, sna->mode.hidden));
+	if (sna->mode.hidden)
+		return TRUE;
 
 	saved_bo = sna_crtc->bo;
 	saved_transform = sna_crtc->transform;
+	saved_cursor_transform = sna_crtc->cursor_transform;
+	saved_hwcursor = sna_crtc->hwcursor;
 	saved_offset = sna_crtc->offset;
 
 	sna_crtc->fallback_shadow = false;
@@ -2305,6 +2462,9 @@ retry: /* Attach per-crtc pixmap or direct */
 	}
 
 	bo->active_scanout++;
+	DBG(("%s: marking handle=%d as active=%d (removing %d from scanout, active=%d)\n",
+	     __FUNCTION__, bo->handle, bo->active_scanout,
+	     saved_bo ? saved_bo->handle : 0, saved_bo ? saved_bo->active_scanout - 1: -1));
 	if (saved_bo) {
 		assert(saved_bo->active_scanout);
 		assert(saved_bo->refcnt >= saved_bo->active_scanout);
@@ -2315,15 +2475,31 @@ retry: /* Attach per-crtc pixmap or direct */
 	sna_crtc_randr(crtc);
 	if (sna_crtc->transform)
 		sna_crtc_damage(crtc);
+	if (sna_crtc->cursor &&  /* Reload cursor if RandR may have changed */
+	    (!sna_crtc->hwcursor ||
+	     saved_cursor_transform || sna_crtc->cursor_transform ||
+	     sna_crtc->cursor->rotation != crtc->rotation))
+		sna_crtc_disable_cursor(sna, sna_crtc);
+
+	assert(!sna->mode.hidden);
 	sna->mode.front_active += saved_bo == NULL;
 	sna->mode.dirty = true;
-	DBG(("%s: front_active=%d\n", __FUNCTION__, sna->mode.front_active));
+	DBG(("%s: handle=%d, scanout_active=%d, front_active=%d\n",
+	     __FUNCTION__, bo->handle, bo->active_scanout, sna->mode.front_active));
 
 	return TRUE;
 
 error:
 	sna_crtc->offset = saved_offset;
+	if (sna_crtc->transform) {
+		assert(sna->mode.rr_active);
+		sna->mode.rr_active--;
+	}
+	if (saved_transform)
+		sna->mode.rr_active++;
 	sna_crtc->transform = saved_transform;
+	sna_crtc->cursor_transform = saved_cursor_transform;
+	sna_crtc->hwcursor = saved_hwcursor;
 	sna_crtc->bo = saved_bo;
 	sna_mode_discover(sna);
 	return FALSE;
@@ -2372,17 +2548,10 @@ sna_crtc_set_mode_major(xf86CrtcPtr crtc, DisplayModePtr mode,
 static void
 sna_crtc_dpms(xf86CrtcPtr crtc, int mode)
 {
-	struct sna_crtc *priv = to_sna_crtc(crtc);
-
 	DBG(("%s(pipe %d, dpms mode -> %d):= active=%d\n",
-	     __FUNCTION__, priv->pipe, mode, mode == DPMSModeOn));
-	if (priv->dpms_mode == mode)
-		return;
-
-	assert(priv);
-	priv->dpms_mode = mode;
+	     __FUNCTION__, to_sna_crtc(crtc)->pipe, mode, mode == DPMSModeOn));
 
-	if (mode == DPMSModeOn && crtc->enabled && priv->bo == NULL) {
+	if (mode == DPMSModeOn && crtc->enabled) {
 		if (__sna_crtc_set_mode(crtc))
 			update_flush_interval(to_sna(crtc->scrn));
 		else
@@ -2390,7 +2559,7 @@ sna_crtc_dpms(xf86CrtcPtr crtc, int mode)
 	}
 
 	if (mode != DPMSModeOn)
-		sna_crtc_disable(crtc);
+		sna_crtc_disable(crtc, false);
 }
 
 void sna_mode_adjust_frame(struct sna *sna, int x, int y)
@@ -2720,7 +2889,6 @@ sna_crtc_add(ScrnInfoPtr scrn, int id)
 		return false;
 
 	sna_crtc->id = id;
-	sna_crtc->dpms_mode = -1;
 
 	VG_CLEAR(get_pipe);
 	get_pipe.pipe = 0;
@@ -2804,6 +2972,7 @@ sna_output_detect(xf86OutputPtr output)
 	struct sna *sna = to_sna(output->scrn);
 	struct sna_output *sna_output = output->driver_private;
 	union compat_mode_get_connector compat_conn;
+	uint32_t now;
 
 	DBG(("%s(%s:%d)\n", __FUNCTION__, output->name, sna_output->id));
 
@@ -2812,6 +2981,16 @@ sna_output_detect(xf86OutputPtr output)
 		return XF86OutputStatusDisconnected;
 	}
 
+	/* Cache detections for 15s or until a hotplug event */
+	now = GetTimeInMillis();
+	if (sna_output->last_detect != 0 &&
+	    (int32_t)(now - sna_output->last_detect) <= OUTPUT_STATUS_CACHE_MS) {
+		DBG(("%s(%s) reporting cached status (since %dms): %d\n",
+		     __FUNCTION__, output->name, now - sna_output->last_detect,
+		     sna_output->status));
+		return sna_output->status;
+	}
+
 	VG_CLEAR(compat_conn);
 	compat_conn.conn.connector_id = sna_output->id;
 	sna_output->num_modes = compat_conn.conn.count_modes = 0; /* reprobe */
@@ -2854,15 +3033,20 @@ sna_output_detect(xf86OutputPtr output)
 	DBG(("%s(%s): found %d modes, connection status=%d\n",
 	     __FUNCTION__, output->name, sna_output->num_modes, compat_conn.conn.connection));
 
+	sna_output->last_detect = now;
 	switch (compat_conn.conn.connection) {
 	case DRM_MODE_CONNECTED:
-		return XF86OutputStatusConnected;
+		sna_output->status = XF86OutputStatusConnected;
+		break;
 	case DRM_MODE_DISCONNECTED:
-		return XF86OutputStatusDisconnected;
+		sna_output->status = XF86OutputStatusDisconnected;
+		break;
 	default:
 	case DRM_MODE_UNKNOWNCONNECTION:
-		return XF86OutputStatusUnknown;
+		sna_output->status = XF86OutputStatusUnknown;
+		break;
 	}
+	return sna_output->status;
 }
 
 static Bool
@@ -2949,10 +3133,12 @@ sna_output_attach_edid(xf86OutputPtr output)
 
 		VG(memset(raw, 0, blob.length));
 		blob.data = (uintptr_t)raw;
-		if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETPROPBLOB, &blob))
-			goto done;
 	}
 
+	if (blob.length != sna_output->edid_len &&
+	    drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETPROPBLOB, &blob))
+		goto done;
+
 	if (old &&
 	    blob.length == sna_output->edid_len &&
 	    memcmp(old, raw, blob.length) == 0) {
@@ -2983,18 +3169,53 @@ done:
 	}
 }
 
+static bool duplicate_mode(DisplayModePtr modes, DisplayModePtr m)
+{
+	if (m == NULL)
+		return false;
+
+	while (modes) {
+		if (xf86ModesEqual(modes, m))
+			return true;
+
+		modes = modes->next;
+	}
+
+	return false;
+}
+
 static DisplayModePtr
-default_modes(void)
+default_modes(DisplayModePtr preferred)
 {
+	DisplayModePtr modes;
+
 #if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
-	return xf86GetDefaultModes();
+	modes = xf86GetDefaultModes();
 #else
-	return xf86GetDefaultModes(0, 0);
+	modes = xf86GetDefaultModes(0, 0);
 #endif
+
+#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,4,99,901,0)
+	if (preferred) {
+		DisplayModePtr m;
+
+		/* Add a half-resolution mode useful for large panels */
+		m = xf86GTFMode(preferred->HDisplay/2,
+				preferred->VDisplay/2,
+				xf86ModeVRefresh(preferred),
+				FALSE, FALSE);
+		if (!duplicate_mode(modes, m))
+			modes = xf86ModesAdd(modes, m);
+		else
+			free(m);
+	}
+#endif
+
+	return modes;
 }
 
 static DisplayModePtr
-sna_output_panel_edid(xf86OutputPtr output, DisplayModePtr modes)
+sna_output_add_default_modes(xf86OutputPtr output, DisplayModePtr modes)
 {
 	xf86MonPtr mon = output->MonInfo;
 	DisplayModePtr i, m, preferred = NULL;
@@ -3015,7 +3236,7 @@ sna_output_panel_edid(xf86OutputPtr output, DisplayModePtr modes)
 	max_vrefresh = max(max_vrefresh, 60.0);
 	max_vrefresh *= (1 + SYNC_TOLERANCE);
 
-	m = default_modes();
+	m = default_modes(preferred);
 	xf86ValidateModesSize(output->scrn, m, max_x, max_y, 0);
 
 	for (i = m; i; i = i->next) {
@@ -3117,7 +3338,7 @@ sna_output_get_modes(xf86OutputPtr output)
 	}
 
 	if (sna_output->add_default_modes)
-		Modes = sna_output_panel_edid(output, Modes);
+		Modes = sna_output_add_default_modes(output, Modes);
 
 	return Modes;
 }
@@ -3239,14 +3460,14 @@ sna_output_create_ranged_atom(xf86OutputPtr output, Atom *atom,
 	err = RRConfigureOutputProperty(output->randr_output, *atom, FALSE,
 					TRUE, immutable, 2, atom_range);
 	if (err != 0)
-		xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
+		xf86DrvMsg(output->scrn->scrnIndex, X_WARNING,
 			   "RRConfigureOutputProperty error, %d\n", err);
 
 	err = RRChangeOutputProperty(output->randr_output, *atom, XA_INTEGER,
 				     32, PropModeReplace, 1, &value,
 				     FALSE, FALSE);
 	if (err != 0)
-		xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
+		xf86DrvMsg(output->scrn->scrnIndex, X_WARNING,
 			   "RRChangeOutputProperty error, %d\n", err);
 }
 
@@ -3303,7 +3524,7 @@ sna_output_create_resources(xf86OutputPtr output)
 							p->kprop->flags & DRM_MODE_PROP_IMMUTABLE ? TRUE : FALSE,
 							p->num_atoms - 1, (INT32 *)&p->atoms[1]);
 			if (err != 0) {
-				xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
+				xf86DrvMsg(output->scrn->scrnIndex, X_WARNING,
 					   "RRConfigureOutputProperty error, %d\n", err);
 			}
 
@@ -3315,7 +3536,7 @@ sna_output_create_resources(xf86OutputPtr output)
 						     XA_ATOM, 32, PropModeReplace, 1, &p->atoms[j+1],
 						     FALSE, FALSE);
 			if (err != 0) {
-				xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
+				xf86DrvMsg(output->scrn->scrnIndex, X_WARNING,
 					   "RRChangeOutputProperty error, %d\n", err);
 			}
 		}
@@ -3385,18 +3606,19 @@ sna_output_set_property(xf86OutputPtr output, Atom property,
 			if (value->type != XA_INTEGER || value->format != 32 ||
 			    value->size != 1)
 				return FALSE;
-			val = *(uint32_t *)value->data;
 
+			val = *(uint32_t *)value->data;
 			drmModeConnectorSetProperty(sna->kgem.fd, sna_output->id,
 						    p->kprop->prop_id, (uint64_t)val);
 			return TRUE;
 		} else if (p->kprop->flags & DRM_MODE_PROP_ENUM) {
-			Atom	atom;
-			const char	*name;
-			int		j;
+			Atom atom;
+			const char *name;
+			int j;
 
 			if (value->type != XA_ATOM || value->format != 32 || value->size != 1)
 				return FALSE;
+
 			memcpy(&atom, value->data, 4);
 			name = NameForAtom(atom);
 			if (name == NULL)
@@ -3421,11 +3643,32 @@ sna_output_set_property(xf86OutputPtr output, Atom property,
 	return TRUE;
 }
 
+static void update_properties(struct sna *sna, struct sna_output *output)
+{
+	union compat_mode_get_connector compat_conn;
+	struct drm_mode_modeinfo dummy;
+
+	VG_CLEAR(compat_conn);
+
+	compat_conn.conn.connector_id = output->id;
+	compat_conn.conn.count_props = output->num_props;
+	compat_conn.conn.props_ptr = (uintptr_t)output->prop_ids;
+	compat_conn.conn.prop_values_ptr = (uintptr_t)output->prop_values;
+	compat_conn.conn.count_modes = 1; /* skip detect */
+	compat_conn.conn.modes_ptr = (uintptr_t)&dummy;
+
+	(void)drmIoctl(sna->kgem.fd,
+		       DRM_IOCTL_MODE_GETCONNECTOR,
+		       &compat_conn.conn);
+
+	assert(compat_conn.conn.count_props == output->num_props);
+}
+
 static Bool
 sna_output_get_property(xf86OutputPtr output, Atom property)
 {
 	struct sna_output *sna_output = output->driver_private;
-	int err;
+	int err, i, j;
 
 	if (property == backlight_atom || property == backlight_deprecated_atom) {
 		INT32 val;
@@ -3449,7 +3692,7 @@ sna_output_get_property(xf86OutputPtr output, Atom property)
 					     XA_INTEGER, 32, PropModeReplace, 1, &val,
 					     FALSE, FALSE);
 		if (err != 0) {
-			xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
+			xf86DrvMsg(output->scrn->scrnIndex, X_WARNING,
 				   "RRChangeOutputProperty error, %d\n", err);
 			return FALSE;
 		}
@@ -3457,6 +3700,48 @@ sna_output_get_property(xf86OutputPtr output, Atom property)
 		return TRUE;
 	}
 
+	/* Otherwise report the stored value of the matching KMS property. */
+	for (i = 0; i < sna_output->num_props; i++) {
+		struct sna_property *p = &sna_output->props[i];
+
+		/* atoms[0] names the property; skip entries that are not it. */
+		if (p->atoms == NULL || p->atoms[0] != property)
+			continue;
+
+		/* The refresh from the kernel is disabled by the constant 0. */
+		if (0&&output->scrn->vtSema)
+			update_properties(to_sna(output->scrn), sna_output);
+
+		err = 0;
+		if (p->kprop->flags & DRM_MODE_PROP_RANGE) {
+			err = RRChangeOutputProperty(output->randr_output,
+						     property, XA_INTEGER, 32,
+						     PropModeReplace, 1,
+						     &sna_output->prop_values[i],
+						     FALSE, FALSE);
+		} else if (p->kprop->flags & DRM_MODE_PROP_ENUM) {
+			for (j = 0; j < p->kprop->count_enums; j++) {
+				if (p->kprop->enums[j].value == sna_output->prop_values[i])
+					break;
+			}
+			/* Enum j is published as atoms[j+1]. If the stored
+			 * value matches no enum, j+1 would index past the
+			 * last enum atom, so leave the property untouched.
+			 */
+			if (j < p->kprop->count_enums)
+				err = RRChangeOutputProperty(output->randr_output,
+							     property, XA_ATOM, 32,
+							     PropModeReplace, 1,
+							     &p->atoms[j+1],
+							     FALSE, FALSE);
+		}
+
+		if (err != 0)
+			xf86DrvMsg(output->scrn->scrnIndex, X_WARNING,
+				   "RRChangeOutputProperty error, %d\n", err);
+		return TRUE;
+	}
+
 	return FALSE;
 }
 
@@ -3572,14 +3849,20 @@ gather_encoders(struct sna *sna, uint32_t id, int count,
 	struct drm_mode_get_encoder enc;
 	uint32_t *ids = NULL;
 
+	DBG(("%s(%d): expected count=%d\n", __FUNCTION__, id, count));
+
 	VG_CLEAR(compat_conn);
 	memset(out, 0, sizeof(*out));
 
 	do {
-		free(ids);
-		ids = malloc(sizeof(*ids) * count);
-		if (ids == 0)
+		uint32_t *nids;
+
+		nids = realloc(ids, sizeof(*ids) * count);
+		if (nids == NULL) {
+			free(ids);
 			return false;
+		}
+		ids = nids;
 
 		compat_conn.conn.connector_id = id;
 		compat_conn.conn.count_props = 0;
@@ -3599,6 +3882,7 @@ gather_encoders(struct sna *sna, uint32_t id, int count,
 		count = compat_conn.conn.count_encoders;
 	} while (1);
 
+	DBG(("%s(%d): gathering %d encoders\n", __FUNCTION__, id, count));
 	for (count = 0; count < compat_conn.conn.count_encoders; count++) {
 		enc.encoder_id = ids[count];
 		if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETENCODER, &enc)) {
@@ -3606,6 +3890,8 @@ gather_encoders(struct sna *sna, uint32_t id, int count,
 			count = 0;
 			break;
 		}
+		DBG(("%s(%d): encoder=%d, possible_crtcs=%x, possible_clones=%x\n",
+		     __FUNCTION__, id, enc.encoder_id, enc.possible_crtcs, enc.possible_clones));
 		out->possible_crtcs |= enc.possible_crtcs;
 		out->possible_clones |= enc.possible_clones;
 
@@ -3765,6 +4051,7 @@ sna_output_add(struct sna *sna, unsigned id, unsigned serial)
 		return -1;
 	}
 	assert(compat_conn.conn.connector_id == id);
+	DBG(("%s(%d): has %d associated encoders\n", __FUNCTION__, id, compat_conn.conn.count_encoders));
 
 	if (compat_conn.conn.connector_type < ARRAY_SIZE(output_names))
 		output_name = output_names[compat_conn.conn.connector_type];
@@ -3976,6 +4263,7 @@ reset:
 				goto cleanup;
 		}
 
+		RROutputChanged(output->randr_output, TRUE);
 		sna_output_create_resources(output);
 		RRPostPendingProperties(output->randr_output);
 
@@ -4080,6 +4368,8 @@ static bool disable_unused_crtc(struct sna *sna)
 	bool update = false;
 	int o, c;
 
+	DBG(("%s\n", __FUNCTION__));
+
 	for (c = 0; c < sna->mode.num_real_crtc; c++) {
 		xf86CrtcPtr crtc = config->crtc[c];
 
@@ -4128,8 +4418,9 @@ void sna_mode_discover(struct sna *sna)
 	if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETRESOURCES, &res))
 		return;
 
-	DBG(("%s: now %d (was %d) connectors\n", __FUNCTION__,
-	     res.count_connectors, sna->mode.num_real_output));
+	DBG(("%s: now %d (was %d) connectors, %d encoders, %d crtc\n", __FUNCTION__,
+	     res.count_connectors, sna->mode.num_real_output,
+	     res.count_encoders, res.count_crtcs));
 	if (res.count_connectors > 32)
 		return;
 
@@ -4161,16 +4452,21 @@ void sna_mode_discover(struct sna *sna)
 
 	for (i = 0; i < sna->mode.num_real_output; i++) {
 		xf86OutputPtr output = config->output[i];
+		struct sna_output *sna_output = to_sna_output(output);
 
-		if (to_sna_output(output)->id == 0)
+		if (sna_output->id == 0)
 			continue;
 
-		if (to_sna_output(output)->serial == serial)
+		sna_output->last_detect = 0;
+		if (sna_output->serial == serial) {
+			if (sna_output_detect(output) != output->status)
+				RROutputChanged(output->randr_output, TRUE);
 			continue;
+		}
 
 		DBG(("%s: removing output %s (id=%d), serial=%u [now %u]\n",
-		     __FUNCTION__, output->name, to_sna_output(output)->id,
-		    to_sna_output(output)->serial, serial));
+		     __FUNCTION__, output->name, sna_output->id,
+		    sna_output->serial, serial));
 
 		xf86DrvMsg(sna->scrn->scrnIndex, X_INFO,
 			   "%s output %s\n",
@@ -4180,8 +4476,9 @@ void sna_mode_discover(struct sna *sna)
 			sna_output_del(output);
 			i--;
 		} else {
-			to_sna_output(output)->id = 0;
+			sna_output->id = 0;
 			output->crtc = NULL;
+			RROutputChanged(output->randr_output, TRUE);
 		}
 		changed |= 2;
 	}
@@ -4200,6 +4497,8 @@ void sna_mode_discover(struct sna *sna)
 
 		xf86RandR12TellChanged(screen);
 	}
+
+	RRTellChanged(screen);
 }
 
 static void copy_front(struct sna *sna, PixmapPtr old, PixmapPtr new)
@@ -4208,7 +4507,7 @@ static void copy_front(struct sna *sna, PixmapPtr old, PixmapPtr new)
 
 	DBG(("%s\n", __FUNCTION__));
 
-	if (wedged(sna))
+	if (wedged(sna) || isGPU(sna->scrn))
 		return;
 
 	old_priv = sna_pixmap_force_to_gpu(old, MOVE_READ);
@@ -4220,12 +4519,19 @@ static void copy_front(struct sna *sna, PixmapPtr old, PixmapPtr new)
 		return;
 
 	if (old_priv->clear) {
-		(void)sna->render.fill_one(sna, new, new_priv->gpu_bo,
-					   old_priv->clear_color,
-					   0, 0,
-					   new->drawable.width,
-					   new->drawable.height,
-					   GXcopy);
+		bool ok = false;
+		if (!wedged(sna))
+			ok = sna->render.fill_one(sna, new, new_priv->gpu_bo,
+						  old_priv->clear_color,
+						  0, 0,
+						  new->drawable.width,
+						  new->drawable.height,
+						  GXcopy);
+		if (!ok) {
+			void *ptr = kgem_bo_map__gtt(&sna->kgem, new_priv->gpu_bo);
+			if (ptr)
+				memset(ptr, 0, new_priv->gpu_bo->pitch*new->drawable.height);
+		}
 		new_priv->clear = true;
 		new_priv->clear_color = old_priv->clear_color;
 	} else {
@@ -4281,11 +4587,18 @@ static void copy_front(struct sna *sna, PixmapPtr old, PixmapPtr new)
 			     __FUNCTION__, box.x2, box.y2, sx, sy, dx, dy));
 
 			if (box.x2 != new->drawable.width || box.y2 != new->drawable.height) {
-				(void)sna->render.fill_one(sna, new, new_priv->gpu_bo, 0,
-							   0, 0,
-							   new->drawable.width,
-							   new->drawable.height,
-							   GXclear);
+				bool ok = false;
+				if (!wedged(sna))
+					ok = sna->render.fill_one(sna, new, new_priv->gpu_bo, 0,
+								  0, 0,
+								  new->drawable.width,
+								  new->drawable.height,
+								  GXclear);
+				if (!ok) {
+					void *ptr = kgem_bo_map__gtt(&sna->kgem, new_priv->gpu_bo);
+					if (ptr)
+						memset(ptr, 0, new_priv->gpu_bo->pitch*new->drawable.height);
+				}
 			}
 			(void)sna->render.copy_boxes(sna, GXcopy,
 						     &old->drawable, old_priv->gpu_bo, sx, sy,
@@ -4302,7 +4615,7 @@ sna_mode_resize(ScrnInfoPtr scrn, int width, int height)
 {
 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(scrn);
 	struct sna *sna = to_sna(scrn);
-	ScreenPtr screen = scrn->pScreen;
+	ScreenPtr screen = xf86ScrnToScreen(scrn);
 	PixmapPtr new_front;
 	int i;
 
@@ -4337,6 +4650,7 @@ sna_mode_resize(ScrnInfoPtr scrn, int width, int height)
 	for (i = 0; i < sna->mode.num_real_crtc; i++)
 		sna_crtc_disable_shadow(sna, to_sna_crtc(config->crtc[i]));
 	assert(sna->mode.shadow_active == 0);
+	assert(!sna->mode.shadow_enabled);
 	assert(sna->mode.shadow_damage == NULL);
 	assert(sna->mode.shadow == NULL);
 
@@ -4371,7 +4685,7 @@ sna_mode_resize(ScrnInfoPtr scrn, int width, int height)
 			continue;
 
 		if (!__sna_crtc_set_mode(crtc))
-			sna_crtc_disable(crtc);
+			sna_crtc_disable(crtc, false);
 	}
 
 	sna_mode_wakeup(sna);
@@ -4381,19 +4695,6 @@ sna_mode_resize(ScrnInfoPtr scrn, int width, int height)
 }
 
 /* cursor handling */
-struct sna_cursor {
-	struct sna_cursor *next;
-	uint32_t *image;
-	Rotation rotation;
-	int ref;
-	int size;
-	int last_width;
-	int last_height;
-	unsigned handle;
-	unsigned serial;
-	unsigned alloc;
-};
-
 static void
 rotate_coord(Rotation rotation, int size,
 	     int x_dst, int y_dst,
@@ -4519,6 +4820,17 @@ static uint32_t *get_cursor_argb(CursorPtr c)
 #endif
 }
 
+static int __cursor_size(int width, int height) /* returns the first of 64, 128, 256, ... that is >= MAX(width, height) */
+{
+	int i, size;
+	/* Start at 64 and keep doubling until size is no smaller than the larger dimension. */
+	i = MAX(width, height);
+	for (size = 64; size < i; size <<= 1)
+		;
+
+	return size;
+}
+
 static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
 {
 	struct sna_cursor *cursor;
@@ -4526,6 +4838,9 @@ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
 	const uint32_t *argb;
 	uint32_t *image;
 	int width, height, pitch, size, x, y;
+	PictTransform cursor_to_fb;
+	struct pict_f_transform f_cursor_to_fb, f_fb_to_cursor;
+	bool transformed;
 	Rotation rotation;
 
 	assert(sna->cursor.ref);
@@ -4537,8 +4852,8 @@ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
 	       cursor ? cursor->serial : 0,
 	       sna->cursor.serial));
 	if (cursor && cursor->serial == sna->cursor.serial) {
-		assert(cursor->size == sna->cursor.size);
-		assert(cursor->rotation == crtc->transform_in_use ? crtc->rotation : RR_Rotate_0);
+		assert(cursor->size == sna->cursor.size || cursor->transformed);
+		assert(cursor->rotation == ((!to_sna_crtc(crtc)->cursor_transform && crtc->transform_in_use) ? crtc->rotation : RR_Rotate_0)); /* ?: parenthesised because == binds tighter than ?: */
 		assert(cursor->ref);
 		return cursor;
 	}
@@ -4550,22 +4865,44 @@ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
 	       sna->cursor.serial,
 	       get_cursor_argb(sna->cursor.ref) != NULL));
 
-	rotation = crtc->transform_in_use ? crtc->rotation : RR_Rotate_0;
+	transformed = to_sna_crtc(crtc)->cursor_transform;
+	rotation = (!transformed && crtc->transform_in_use) ? crtc->rotation : RR_Rotate_0;
 
-	if (sna->cursor.use_gtt) { /* Don't allow phys cursor sharing */
+	/* Don't allow phys cursor sharing */
+	if (sna->cursor.use_gtt && !transformed) {
 		for (cursor = sna->cursor.cursors; cursor; cursor = cursor->next) {
-			if (cursor->serial == sna->cursor.serial && cursor->rotation == rotation) {
+			if (cursor->serial == sna->cursor.serial &&
+			    cursor->rotation == rotation &&
+			    !cursor->transformed) {
 				__DBG(("%s: reusing handle=%d, serial=%d, rotation=%d, size=%d\n",
 				       __FUNCTION__, cursor->handle, cursor->serial, cursor->rotation, cursor->size));
 				assert(cursor->size == sna->cursor.size);
 				return cursor;
 			}
 		}
-
-		cursor = to_sna_crtc(crtc)->cursor;
 	}
 
-	size = sna->cursor.size;
+	if (transformed) {
+		struct pixman_box16 box;
+
+		box.x1 = box.y1 = 0;
+		box.x2 = sna->cursor.ref->bits->width;
+		box.y2 = sna->cursor.ref->bits->height;
+
+		pixman_f_transform_bounds(&crtc->f_crtc_to_framebuffer, &box);
+		size = __cursor_size(box.x2 - box.x1, box.y2 - box.y1);
+
+		RRTransformCompute(0, 0,
+				   sna->cursor.ref->bits->width,
+				   sna->cursor.ref->bits->height,
+				   crtc->rotation, &crtc->transform,
+				   &cursor_to_fb,
+				   &f_cursor_to_fb,
+				   &f_fb_to_cursor);
+	} else
+		size = sna->cursor.size;
+
+	cursor = to_sna_crtc(crtc)->cursor;
 	if (cursor && cursor->alloc < 4*size*size)
 		cursor = NULL;
 
@@ -4577,7 +4914,7 @@ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
 		}
 	}
 
-	width = sna->cursor.ref->bits->width;
+	width  = sna->cursor.ref->bits->width;
 	height = sna->cursor.ref->bits->height;
 	source = sna->cursor.ref->bits->source;
 	mask = sna->cursor.ref->bits->mask;
@@ -4585,7 +4922,7 @@ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
 	pitch = BitmapBytePad(width);
 
 	image = cursor->image;
-	if (image == NULL) {
+	if (image == NULL || transformed) {
 		image = sna->cursor.scratch;
 		cursor->last_width = cursor->last_height = size;
 	}
@@ -4616,6 +4953,19 @@ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
 				mask += pitch;
 				source += pitch;
 			}
+			if (transformed) {
+				affine_blt(image, cursor->image, 32,
+					   0, 0, width, height, size * 4,
+					   0, 0, width, height, size * 4,
+					   &f_cursor_to_fb);
+				image = cursor->image;
+			}
+		} else if (transformed) {
+			affine_blt(argb, cursor->image, 32,
+				   0, 0, width, height, width * 4,
+				   0, 0, width, height, size * 4,
+				   &f_cursor_to_fb);
+			image = cursor->image;
 		} else
 			memcpy_blt(argb, image, 32,
 				   width * 4, size * 4,
@@ -4662,6 +5012,7 @@ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
 
 	cursor->size = size;
 	cursor->rotation = rotation;
+	cursor->transformed = transformed;
 	cursor->serial = sna->cursor.serial;
 	cursor->last_width = width;
 	cursor->last_height = height;
@@ -4810,6 +5161,23 @@ sna_crtc_disable_cursor(struct sna *sna, struct sna_crtc *crtc)
 }
 
 static void
+sna_disable_cursors(ScrnInfoPtr scrn)
+{
+	xf86CrtcConfigPtr xf86_config = XF86_CRTC_CONFIG_PTR(scrn);
+	struct sna *sna = to_sna(scrn);
+	int sigio, c;
+
+	DBG(("%s\n", __FUNCTION__));
+	/* Call sna_crtc_disable_cursor() on every real CRTC, inside a sigio_block()/sigio_unblock() pair. */
+	sigio = sigio_block();
+	for (c = 0; c < sna->mode.num_real_crtc; c++) {
+		assert(to_sna_crtc(xf86_config->crtc[c]));
+		sna_crtc_disable_cursor(sna, to_sna_crtc(xf86_config->crtc[c]));
+	}
+	sigio_unblock(sigio);
+}
+
+static void
 sna_hide_cursors(ScrnInfoPtr scrn)
 {
 	xf86CrtcConfigPtr xf86_config = XF86_CRTC_CONFIG_PTR(scrn);
@@ -4978,17 +5346,6 @@ sna_load_cursor_image(ScrnInfoPtr scrn, unsigned char *src)
 {
 }
 
-static int __cursor_size(CursorPtr cursor)
-{
-	int i, size;
-
-	i = MAX(cursor->bits->width, cursor->bits->height);
-	for (size = 64; size < i; size <<= 1)
-		;
-
-	return size;
-}
-
 static bool
 sna_cursor_preallocate(struct sna *sna)
 {
@@ -5006,6 +5363,40 @@ sna_cursor_preallocate(struct sna *sna)
 	return true;
 }
 
+static bool
+transformable_cursor(struct sna *sna, CursorPtr cursor)
+{
+	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
+	int i;
+	/* Returns true only if every real CRTC passes all of the checks below for this cursor. */
+	for (i = 0; i < sna->mode.num_real_crtc; i++) {
+		xf86CrtcPtr crtc = config->crtc[i];
+		const struct pixman_f_transform *t;
+		struct pixman_box16 box;
+		int size;
+		/* The CRTC must have its hwcursor flag set. */
+		if (!to_sna_crtc(crtc)->hwcursor)
+			return false;
+		/* Both cursor.use_gtt and an allocated cursor.scratch buffer are required. */
+		t = &crtc->f_crtc_to_framebuffer;
+		if (!sna->cursor.use_gtt || !sna->cursor.scratch)
+			return false;
+
+		box.x1 = box.y1 = 0;
+		box.x2 = cursor->bits->width;
+		box.y2 = cursor->bits->height;
+		/* Replace the cursor box by its bounds under the CRTC-to-framebuffer transform. */
+		if (!pixman_f_transform_bounds(t, &box))
+			return false;
+		/* The rounded-up size of the transformed box must not exceed cursor.max_size. */
+		size = __cursor_size(box.x2 - box.x1, box.y2 - box.y1);
+		if (size > sna->cursor.max_size)
+			return false;
+	}
+
+	return true;
+}
+
 static Bool
 sna_use_hw_cursor(ScreenPtr screen, CursorPtr cursor)
 {
@@ -5023,10 +5414,14 @@ sna_use_hw_cursor(ScreenPtr screen, CursorPtr cursor)
 		sna->cursor.ref = NULL;
 	}
 
-	sna->cursor.size = __cursor_size(cursor);
+	sna->cursor.size =
+		__cursor_size(cursor->bits->width, cursor->bits->height);
 	if (sna->cursor.size > sna->cursor.max_size)
 		return FALSE;
 
+	if (sna->mode.rr_active && !transformable_cursor(sna, cursor))
+		return FALSE;
+
 	if (!sna_cursor_preallocate(sna))
 		return FALSE;
 
@@ -5087,11 +5482,9 @@ sna_cursor_pre_init(struct sna *sna)
 	DBG(("%s: cursor updates use_gtt?=%d\n",
 	     __FUNCTION__, sna->cursor.use_gtt));
 
-	if (!sna->cursor.use_gtt) {
-		sna->cursor.scratch = malloc(sna->cursor.max_size * sna->cursor.max_size * 4);
-		if (!sna->cursor.scratch)
-			sna->cursor.max_size = 0;
-	}
+	sna->cursor.scratch = malloc(sna->cursor.max_size * sna->cursor.max_size * 4);
+	if (!sna->cursor.scratch && !sna->cursor.use_gtt)
+		sna->cursor.max_size = 0;
 
 	sna->cursor.num_stash = -sna->mode.num_real_crtc;
 
@@ -5247,6 +5640,37 @@ sna_crtc_flip(struct sna *sna, struct sna_crtc *crtc, struct kgem_bo *bo, int x,
 	return true;
 }
 
+static void sna_mode_restore(struct sna *sna)
+{
+	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
+	int error = 0;
+	int i;
+	/* Re-apply the mode on every real CRTC that has a bo; must not be called while outputs are hidden. */
+	assert(!sna->mode.hidden);
+
+	for (i = 0; i < sna->mode.num_real_crtc; i++) {
+		xf86CrtcPtr crtc = config->crtc[i];
+
+		assert(to_sna_crtc(crtc) != NULL);
+		if (to_sna_crtc(crtc)->bo == NULL)
+			continue;
+		/* A CRTC whose mode cannot be set again is disabled and counted as an error. */
+		assert(crtc->enabled);
+		if (!__sna_crtc_set_mode(crtc)) {
+			sna_crtc_disable(crtc, false);
+			error++;
+		}
+	}
+	sna_mode_wakeup(sna);
+	update_flush_interval(sna);
+	sna_cursors_reload(sna);
+	sna->mode.dirty = false;
+	/* Log a single error message if any CRTC failed above. */
+	if (error)
+		xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
+			   "Failed to restore display configuration\n");
+}
+
 int
 sna_page_flip(struct sna *sna,
 	      struct kgem_bo *bo,
@@ -5263,9 +5687,9 @@ sna_page_flip(struct sna *sna,
 	assert(bo->refcnt);
 
 	assert((sna->flags & SNA_IS_HOSTED) == 0);
-	assert((sna->flags & SNA_TEAR_FREE) == 0);
 	assert(sna->mode.flip_active == 0);
 	assert(sna->mode.front_active);
+	assert(!sna->mode.hidden);
 	assert(sna->scrn->vtSema);
 
 	if ((sna->flags & (data ? SNA_HAS_FLIP : SNA_HAS_ASYNC_FLIP)) == 0)
@@ -5288,6 +5712,9 @@ sna_page_flip(struct sna *sna,
 		assert(crtc->bo->refcnt >= crtc->bo->active_scanout);
 		assert(crtc->flip_bo == NULL);
 
+		if (data == NULL && crtc->bo == bo)
+			goto next_crtc;
+
 		arg.crtc_id = crtc->id;
 		arg.fb_id = get_fb(sna, bo, width, height);
 		if (arg.fb_id == 0) {
@@ -5304,6 +5731,10 @@ sna_page_flip(struct sna *sna,
 			     crtc_offset, crtc->offset));
 fixup_flip:
 			if (crtc->bo != bo && sna_crtc_flip(sna, crtc, bo, crtc->base->x, crtc->base->y)) {
+update_scanout:
+				DBG(("%s: removing handle=%d [active_scanout=%d] from scanout, installing handle=%d [active_scanout=%d]\n",
+				     __FUNCTION__, crtc->bo->handle, crtc->bo->active_scanout,
+				     bo->handle, bo->active_scanout));
 				assert(crtc->bo->active_scanout);
 				assert(crtc->bo->refcnt >= crtc->bo->active_scanout);
 				crtc->bo->active_scanout--;
@@ -5322,12 +5753,8 @@ fixup_flip:
 
 				/* queue a flip in order to send the event */
 			} else {
-				if (count && !xf86SetDesiredModes(sna->scrn)) {
-					xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
-						   "failed to restore display configuration\n");
-					for (; i < sna->mode.num_real_crtc; i++)
-						sna_crtc_disable(config->crtc[i]);
-				}
+				if (count)
+					sna_mode_restore(sna);
 				return 0;
 			}
 		}
@@ -5375,11 +5802,17 @@ retry_flip:
 				goto retry_flip;
 			}
 
-			xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
-				   "page flipping failed, on CRTC:%d (pipe=%d), disabling %s page flips\n",
-				   crtc->id, crtc->pipe, data ? "synchronous": "asynchronous");
-			sna->flags &= ~(data ? SNA_HAS_FLIP : SNA_HAS_ASYNC_FLIP);
-			goto fixup_flip;
+			if (sna->flags & (data ? SNA_HAS_FLIP : SNA_HAS_ASYNC_FLIP)) {
+				xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
+					   "page flipping failed, on CRTC:%d (pipe=%d), disabling %s page flips\n",
+					   crtc->id, crtc->pipe, data ? "synchronous": "asynchronous");
+				sna->flags &= ~(data ? SNA_HAS_FLIP : SNA_HAS_ASYNC_FLIP);
+				goto fixup_flip;
+			}
+
+			if (count)
+				sna_mode_restore(sna);
+			return 0;
 		}
 
 		if (data) {
@@ -5391,8 +5824,11 @@ retry_flip:
 			crtc->flip_serial = crtc->mode_serial;
 			crtc->flip_pending = true;
 			sna->mode.flip_active++;
-		}
 
+			DBG(("%s: recording flip on CRTC:%d handle=%d, active_scanout=%d, serial=%d\n",
+			     __FUNCTION__, crtc->id, crtc->flip_bo->handle, crtc->flip_bo->active_scanout, crtc->flip_serial));
+		} else
+			goto update_scanout;
 next_crtc:
 		count++;
 	}
@@ -5604,6 +6040,7 @@ static bool sna_probe_initial_configuration(struct sna *sna)
 
 		crtc_id = (uintptr_t)output->crtc;
 		output->crtc = NULL;
+		output->status = XF86OutputStatusUnknown;
 		if (sna->flags & SNA_IS_SLAVED)
 			continue;
 
@@ -5644,6 +6081,7 @@ static bool sna_probe_initial_configuration(struct sna *sna)
 					   to_sna_crtc(crtc)->pipe);
 
 				output->crtc = crtc;
+				output->status = XF86OutputStatusConnected;
 				crtc->enabled = TRUE;
 
 				if (output->mm_width == 0 || output->mm_height == 0) {
@@ -5707,8 +6145,8 @@ static bool sna_probe_initial_configuration(struct sna *sna)
 			if (sna_output->num_modes == 0)
 				continue;
 
-			width = sna_output->modes[0].hdisplay;
-			height= sna_output->modes[0].vdisplay;
+			width  = sna_output->modes[0].hdisplay;
+			height = sna_output->modes[0].vdisplay;
 
 			DBG(("%s: panel '%s' is %dx%d\n",
 			     __FUNCTION__, output->name, width, height));
@@ -5788,7 +6226,7 @@ probe_capabilities(struct sna *sna)
 	sna->flags &= ~(SNA_HAS_FLIP | SNA_HAS_ASYNC_FLIP);
 	if (has_flip(sna))
 		sna->flags |= SNA_HAS_FLIP;
-	if (has_flip__async(sna))
+	if (has_flip__async(sna) && (sna->flags & SNA_TEAR_FREE) == 0)
 		sna->flags |= SNA_HAS_ASYNC_FLIP;
 	DBG(("%s: page flips? %s, async? %s\n", __FUNCTION__,
 	     sna->flags & SNA_HAS_FLIP ? "enabled" : "disabled",
@@ -5840,6 +6278,7 @@ bool sna_mode_pre_init(ScrnInfoPtr scrn, struct sna *sna)
 	}
 
 	probe_capabilities(sna);
+	sna->mode.hidden = 1;
 
 	if (!xf86GetOptValInteger(sna->Options, OPTION_VIRTUAL, &num_fake))
 		num_fake = 1;
@@ -5855,6 +6294,9 @@ bool sna_mode_pre_init(ScrnInfoPtr scrn, struct sna *sna)
 	if (res) {
 		xf86CrtcConfigPtr xf86_config;
 
+		DBG(("%s: found %d CRTC, %d encoders, %d connectors\n",
+		     __FUNCTION__, res->count_crtcs, res->count_encoders, res->count_connectors));
+
 		assert(res->count_crtcs);
 		assert(res->count_connectors);
 
@@ -5955,7 +6397,7 @@ sna_mode_set_primary(struct sna *sna)
 
 		DBG(("%s: setting PrimaryOutput %s\n", __FUNCTION__, output->name));
 		rr->primaryOutput = output->randr_output;
-		RROutputChanged(rr->primaryOutput, 0);
+		RROutputChanged(rr->primaryOutput, FALSE);
 		rr->layoutChanged = TRUE;
 		break;
 	}
@@ -5974,12 +6416,9 @@ sna_mode_disable(struct sna *sna)
 	if (!sna->scrn->vtSema)
 		return false;
 
-	/* XXX we will cause previously hidden cursors to be reshown, but
-	 * this should be a rare fixup case for severe fragmentation.
-	 */
-	sna_hide_cursors(sna->scrn);
+	sna_disable_cursors(sna->scrn);
 	for (i = 0; i < sna->mode.num_real_crtc; i++)
-		sna_crtc_disable(config->crtc[i]);
+		sna_crtc_disable(config->crtc[i], false);
 	assert(sna->mode.front_active == 0);
 
 	sna_mode_wakeup(sna);
@@ -6001,6 +6440,11 @@ sna_mode_enable(struct sna *sna)
 	if (!sna->scrn->vtSema)
 		return;
 
+	if (sna->mode.hidden) {
+		DBG(("%s: hidden outputs\n", __FUNCTION__));
+		return;
+	}
+
 	for (i = 0; i < sna->mode.num_real_crtc; i++) {
 		xf86CrtcPtr crtc = config->crtc[i];
 
@@ -6016,13 +6460,30 @@ sna_mode_enable(struct sna *sna)
 	}
 
 	update_flush_interval(sna);
-	sna_show_cursors(sna->scrn);
+	sna_cursors_reload(sna);
 	sna->mode.dirty = false;
 }
 
+static void sna_randr_close(struct sna *sna) /* NULLs randr_output/randr_crtc on every output and CRTC in the config */
+{
+	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
+	int n;
+
+	/* The RR structs are freed early during CloseScreen as they
+	 * are tracked as Resources. However, we may be tempted to
+	 * access them during shutdown so decouple them now.
+	 */
+	  for (n = 0; n < config->num_output; n++)
+		  config->output[n]->randr_output = NULL;
+
+	  for (n = 0; n < config->num_crtc; n++)
+		  config->crtc[n]->randr_crtc = NULL;
+}
+
 void
 sna_mode_close(struct sna *sna)
 {
+	sna_randr_close(sna);
 	sna_mode_wakeup(sna);
 
 	if (sna->flags & SNA_IS_HOSTED)
@@ -6077,15 +6538,22 @@ xf86CrtcPtr
 sna_covering_crtc(struct sna *sna, const BoxRec *box, xf86CrtcPtr desired)
 {
 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
-	xf86CrtcPtr best_crtc;
-	int best_coverage, c;
+	xf86CrtcPtr best_crtc = NULL;
+	int best_coverage = -1, c;
 
 	if (sna->flags & SNA_IS_HOSTED)
 		return NULL;
 
 	/* If we do not own the VT, we do not own the CRTC either */
-	if (!sna->scrn->vtSema)
+	if (!sna->scrn->vtSema) {
+		DBG(("%s: none, VT switched\n", __FUNCTION__));
+		return NULL;
+	}
+
+	if (sna->mode.hidden) {
+		DBG(("%s: none, hidden outputs\n", __FUNCTION__));
 		return NULL;
+	}
 
 	DBG(("%s for box=(%d, %d), (%d, %d)\n",
 	     __FUNCTION__, box->x1, box->y1, box->x2, box->y2));
@@ -6107,10 +6575,10 @@ sna_covering_crtc(struct sna *sna, const BoxRec *box, xf86CrtcPtr desired)
 			     cover_box.x2, cover_box.y2));
 			return desired;
 		}
+		best_crtc = desired;
+		best_coverage = 0;
 	}
 
-	best_crtc = NULL;
-	best_coverage = 0;
 	for (c = 0; c < sna->mode.num_real_crtc; c++) {
 		xf86CrtcPtr crtc = config->crtc[c];
 		BoxRec cover_box;
@@ -6156,6 +6624,21 @@ sna_covering_crtc(struct sna *sna, const BoxRec *box, xf86CrtcPtr desired)
 	return best_crtc;
 }
 
+xf86CrtcPtr sna_primary_crtc(struct sna *sna) /* CRTC of the RandR primary output, else crtc[0], else NULL */
+{
+	rrScrPrivPtr rr = rrGetScrPriv(xf86ScrnToScreen(sna->scrn));
+	if (rr && rr->primaryOutput) {
+		xf86OutputPtr output = rr->primaryOutput->devPrivate;
+		if (output->crtc && to_sna_crtc(output->crtc)) /* used only if to_sna_crtc() is non-NULL for it */
+			return output->crtc;
+	}
+	/* Otherwise fall back to the first CRTC in the config, provided there is at least one real CRTC. */
+	if (sna->mode.num_real_crtc)
+		return XF86_CRTC_CONFIG_PTR(sna->scrn)->crtc[0];
+
+	return NULL;
+}
+
 #define MI_LOAD_REGISTER_IMM			(0x22<<23)
 
 static bool sna_emit_wait_for_scanline_hsw(struct sna *sna,
@@ -6465,7 +6948,9 @@ void sna_mode_check(struct sna *sna)
 	if (sna->flags & SNA_IS_HOSTED)
 		return;
 
-	DBG(("%s\n", __FUNCTION__));
+	DBG(("%s: hidden?=%d\n", __FUNCTION__, sna->mode.hidden));
+	if (sna->mode.hidden)
+		return;
 
 	/* Validate CRTC attachments and force consistency upon the kernel */
 	for (i = 0; i < sna->mode.num_real_crtc; i++) {
@@ -6496,7 +6981,7 @@ void sna_mode_check(struct sna *sna)
 			xf86DrvMsg(crtc->scrn->scrnIndex, X_ERROR,
 				   "%s: invalid state found on pipe %d, disabling CRTC:%d\n",
 				   __FUNCTION__, sna_crtc->pipe, sna_crtc->id);
-			sna_crtc_disable(crtc);
+			sna_crtc_disable(crtc, true);
 		}
 	}
 
@@ -6561,17 +7046,16 @@ void sna_mode_reset(struct sna *sna)
 
 	DBG(("%s\n", __FUNCTION__));
 
-	sna_hide_cursors(sna->scrn);
+	sna_disable_cursors(sna->scrn);
 	for (i = 0; i < sna->mode.num_real_crtc; i++)
 		if (!sna_crtc_hide_planes(sna, to_sna_crtc(config->crtc[i])))
-			sna_crtc_disable(config->crtc[i]);
+			sna_crtc_disable(config->crtc[i], true);
 	assert(sna->mode.front_active == 0);
 
 	for (i = 0; i < sna->mode.num_real_crtc; i++) {
 		struct sna_crtc *sna_crtc = to_sna_crtc(config->crtc[i]);
 
 		assert(sna_crtc != NULL);
-		sna_crtc->dpms_mode = -1;
 
 		/* Force the rotation property to be reset on next use */
 		rotation_reset(&sna_crtc->primary);
@@ -6641,9 +7125,10 @@ sna_crtc_redisplay__fallback(xf86CrtcPtr crtc, RegionPtr region, struct kgem_bo
 {
 	int16_t sx, sy;
 	struct sna *sna = to_sna(crtc->scrn);
-	ScreenPtr screen = sna->scrn->pScreen;
+	ScreenPtr screen = xf86ScrnToScreen(crtc->scrn);
 	DrawablePtr draw = crtc_source(crtc, &sx, &sy);
 	PictFormatPtr format;
+	PictTransform T;
 	PicturePtr src, dst;
 	PixmapPtr pixmap;
 	int depth, error;
@@ -6664,6 +7149,14 @@ sna_crtc_redisplay__fallback(xf86CrtcPtr crtc, RegionPtr region, struct kgem_bo
 	     __FUNCTION__, format->format, depth, draw->bitsPerPixel,
 	     bo->pitch, crtc->mode.HDisplay, crtc->mode.VDisplay));
 
+	if (sx | sy)
+		RegionTranslate(region, sx, sy);
+	error = !sna_drawable_move_region_to_cpu(draw, region, MOVE_READ);
+	if (sx | sy)
+		RegionTranslate(region, -sx, -sy);
+	if (error)
+		return;
+
 	ptr = kgem_bo_map__gtt(&sna->kgem, bo);
 	if (ptr == NULL)
 		return;
@@ -6683,9 +7176,37 @@ sna_crtc_redisplay__fallback(xf86CrtcPtr crtc, RegionPtr region, struct kgem_bo
 	if (!src)
 		goto free_pixmap;
 
-	error = SetPictureTransform(src, &crtc->crtc_to_framebuffer);
-	if (error)
-		goto free_src;
+	pixman_transform_init_translate(&T, sx << 16, sy << 16);
+	pixman_transform_multiply(&T, &T, &crtc->crtc_to_framebuffer);
+	if (!sna_transform_is_integer_translation(&T, &sx, &sy)) {
+#define f2d(x) (((double)(x))/65536.)
+		DBG(("%s: transform=[[%f %f %f], [%f %f %f], [%f %f %f]] (raw [[%x %x %x], [%x %x %x], [%x %x %x]])\n",
+		     __FUNCTION__,
+		     f2d(T.matrix[0][0]),
+		     f2d(T.matrix[0][1]),
+		     f2d(T.matrix[0][2]),
+		     f2d(T.matrix[1][0]),
+		     f2d(T.matrix[1][1]),
+		     f2d(T.matrix[1][2]),
+		     f2d(T.matrix[2][0]),
+		     f2d(T.matrix[2][1]),
+		     f2d(T.matrix[2][2]),
+		     T.matrix[0][0],
+		     T.matrix[0][1],
+		     T.matrix[0][2],
+		     T.matrix[1][0],
+		     T.matrix[1][1],
+		     T.matrix[1][2],
+		     T.matrix[2][0],
+		     T.matrix[2][1],
+		     T.matrix[2][2]));
+#undef f2d
+
+		error = SetPictureTransform(src, &T);
+		if (error)
+			goto free_src;
+		sx = sy = 0;
+	}
 
 	if (crtc->filter && crtc->transform_in_use)
 		SetPicturePictFilter(src, crtc->filter,
@@ -6733,10 +7254,11 @@ sna_crtc_redisplay__composite(xf86CrtcPtr crtc, RegionPtr region, struct kgem_bo
 {
 	int16_t sx, sy;
 	struct sna *sna = to_sna(crtc->scrn);
-	ScreenPtr screen = crtc->scrn->pScreen;
+	ScreenPtr screen = xf86ScrnToScreen(crtc->scrn);
 	DrawablePtr draw = crtc_source(crtc, &sx, &sy);
 	struct sna_composite_op tmp;
 	PictFormatPtr format;
+	PictTransform T;
 	PicturePtr src, dst;
 	PixmapPtr pixmap;
 	const BoxRec *b;
@@ -6777,9 +7299,14 @@ sna_crtc_redisplay__composite(xf86CrtcPtr crtc, RegionPtr region, struct kgem_bo
 	if (!src)
 		goto free_pixmap;
 
-	error = SetPictureTransform(src, &crtc->crtc_to_framebuffer);
-	if (error)
-		goto free_src;
+	pixman_transform_init_translate(&T, sx << 16, sy << 16);
+	pixman_transform_multiply(&T, &T, &crtc->crtc_to_framebuffer);
+	if (!sna_transform_is_integer_translation(&T, &sx, &sy)) {
+		error = SetPictureTransform(src, &T);
+		if (error)
+			goto free_src;
+		sx = sy = 0;
+	}
 
 	if (crtc->filter && crtc->transform_in_use)
 		SetPicturePictFilter(src, crtc->filter,
@@ -6916,10 +7443,13 @@ void sna_shadow_set_crtc(struct sna *sna,
 	assert(!sna_crtc->transform);
 
 	if (sna_crtc->client_bo != bo) {
-		if (sna_crtc->client_bo)
+		if (sna_crtc->client_bo) {
+			assert(sna_crtc->client_bo->refcnt > sna_crtc->client_bo->active_scanout);
 			kgem_bo_destroy(&sna->kgem, sna_crtc->client_bo);
+		}
 
 		sna_crtc->client_bo = kgem_bo_reference(bo);
+		assert(sna_crtc->client_bo->refcnt > sna_crtc->client_bo->active_scanout);
 		sna_crtc_damage(crtc);
 	}
 
@@ -6974,6 +7504,7 @@ void sna_shadow_unset_crtc(struct sna *sna,
 	if (sna_crtc->client_bo == NULL)
 		return;
 
+	assert(sna_crtc->client_bo->refcnt > sna_crtc->client_bo->active_scanout);
 	kgem_bo_destroy(&sna->kgem, sna_crtc->client_bo);
 	sna_crtc->client_bo = NULL;
 	list_del(&sna_crtc->shadow_link);
@@ -6982,15 +7513,50 @@ void sna_shadow_unset_crtc(struct sna *sna,
 	sna_crtc_damage(crtc);
 }
 
+static bool move_crtc_to_gpu(struct sna *sna)
+{
+	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
+	int i;
+	/* Move sna->front to the GPU only if some real CRTC has a bo but neither a slave_pixmap nor a client_bo. */
+	for (i = 0; i < sna->mode.num_real_crtc; i++) {
+		struct sna_crtc *crtc = to_sna_crtc(config->crtc[i]);
+
+		assert(crtc);
+
+		if (crtc->bo == NULL)
+			continue;
+
+		if (crtc->slave_pixmap)
+			continue;
+
+		if (crtc->client_bo)
+			continue;
+		/* First CRTC that passed the three skips above: the result of the move is the return value. */
+		DBG(("%s: CRTC %d [pipe=%d] requires frontbuffer\n",
+		     __FUNCTION__, crtc->id, crtc->pipe));
+		return sna_pixmap_move_to_gpu(sna->front,
+					      MOVE_READ | MOVE_ASYNC_HINT | __MOVE_SCANOUT);
+	}
+	/* No CRTC needed the front buffer: nothing was moved, report success. */
+	return true;
+}
+
 void sna_mode_redisplay(struct sna *sna)
 {
 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
 	RegionPtr region;
 	int i;
 
-	if (!sna->mode.shadow_damage)
+	if (sna->mode.hidden) {
+		DBG(("%s: hidden outputs, skipping\n", __FUNCTION__));
+		return;
+	}
+
+	if (!sna->mode.shadow_enabled)
 		return;
 
+	assert(sna->mode.shadow_damage);
+
 	DBG(("%s: posting shadow damage? %d (flips pending? %d, mode reconfiguration pending? %d)\n",
 	     __FUNCTION__,
 	     !RegionNil(DamageRegion(sna->mode.shadow_damage)),
@@ -7012,21 +7578,16 @@ void sna_mode_redisplay(struct sna *sna)
 	     region->extents.x2, region->extents.y2));
 
 	if (sna->mode.flip_active) {
-		DamagePtr damage;
-
-		damage = sna->mode.shadow_damage;
-		sna->mode.shadow_damage = NULL;
-
+		sna->mode.shadow_enabled = false;
 		while (sna->mode.flip_active && sna_mode_wakeup(sna))
 			;
+		sna->mode.shadow_enabled = true;
 
-		sna->mode.shadow_damage = damage;
+		if (sna->mode.flip_active)
+			return;
 	}
 
-	if (sna->mode.flip_active)
-		return;
-
-	if (wedged(sna) || !sna_pixmap_move_to_gpu(sna->front, MOVE_READ | MOVE_ASYNC_HINT | __MOVE_SCANOUT)) {
+	if (wedged(sna) || !move_crtc_to_gpu(sna)) {
 		DBG(("%s: forcing scanout update using the CPU\n", __FUNCTION__));
 		if (!sna_pixmap_move_to_cpu(sna->front, MOVE_READ))
 			return;
@@ -7060,7 +7621,7 @@ void sna_mode_redisplay(struct sna *sna)
 					RegionNull(&new_damage);
 					RegionCopy(&new_damage, &damage);
 
-					bo = sna_crtc->client_bo;
+					bo = sna_crtc->cache_bo;
 					if (bo == NULL) {
 						damage.extents = crtc->bounds;
 						damage.data = NULL;
@@ -7096,6 +7657,9 @@ void sna_mode_redisplay(struct sna *sna)
 
 					if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_PAGE_FLIP, &arg)) {
 						if (sna_crtc_flip(sna, sna_crtc, bo, 0, 0)) {
+							DBG(("%s: removing handle=%d [active_scanout=%d] from scanout, installing handle=%d [active_scanout=%d]\n",
+							     __FUNCTION__, sna_crtc->bo->handle, sna_crtc->bo->active_scanout,
+							     bo->handle, bo->active_scanout));
 							assert(sna_crtc->bo->active_scanout);
 							assert(sna_crtc->bo->refcnt >= sna_crtc->bo->active_scanout);
 							sna_crtc->bo->active_scanout--;
@@ -7103,7 +7667,7 @@ void sna_mode_redisplay(struct sna *sna)
 
 							sna_crtc->bo = bo;
 							sna_crtc->bo->active_scanout++;
-							sna_crtc->client_bo = NULL;
+							sna_crtc->cache_bo = NULL;
 						} else {
 							DBG(("%s: flip [fb=%d] on crtc %d [%d, pipe=%d] failed - %d\n",
 							     __FUNCTION__, arg.fb_id, i, sna_crtc->id, sna_crtc->pipe, errno));
@@ -7116,7 +7680,7 @@ void sna_mode_redisplay(struct sna *sna)
 							sna_crtc_redisplay__fallback(crtc, &damage, sna_crtc->bo);
 
 							kgem_bo_destroy(&sna->kgem, bo);
-							sna_crtc->client_bo = NULL;
+							sna_crtc->cache_bo = NULL;
 						}
 					} else {
 						sna->mode.flip_active++;
@@ -7128,7 +7692,10 @@ void sna_mode_redisplay(struct sna *sna)
 						sna_crtc->flip_bo->active_scanout++;
 						sna_crtc->flip_serial = sna_crtc->mode_serial;
 
-						sna_crtc->client_bo = kgem_bo_reference(sna_crtc->bo);
+						sna_crtc->cache_bo = kgem_bo_reference(sna_crtc->bo);
+
+						DBG(("%s: recording flip on CRTC:%d handle=%d, active_scanout=%d, serial=%d\n",
+						     __FUNCTION__, sna_crtc->id, sna_crtc->flip_bo->handle, sna_crtc->flip_bo->active_scanout, sna_crtc->flip_serial));
 					}
 				}
 			}
@@ -7201,7 +7768,7 @@ void sna_mode_redisplay(struct sna *sna)
 				damage.extents = crtc->bounds;
 				damage.data = NULL;
 
-				bo = sna_crtc->client_bo;
+				bo = sna_crtc->cache_bo;
 				if (bo == NULL)
 					bo = kgem_create_2d(&sna->kgem,
 							    crtc->mode.HDisplay,
@@ -7228,6 +7795,9 @@ void sna_mode_redisplay(struct sna *sna)
 
 				if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_PAGE_FLIP, &arg)) {
 					if (sna_crtc_flip(sna, sna_crtc, bo, 0, 0)) {
+						DBG(("%s: removing handle=%d [active_scanout=%d] from scanout, installing handle=%d [active_scanout=%d]\n",
+						     __FUNCTION__, sna_crtc->bo->handle, sna_crtc->bo->active_scanout - 1,
+						     bo->handle, bo->active_scanout));
 						assert(sna_crtc->bo->active_scanout);
 						assert(sna_crtc->bo->refcnt >= sna_crtc->bo->active_scanout);
 						sna_crtc->bo->active_scanout--;
@@ -7235,7 +7805,7 @@ void sna_mode_redisplay(struct sna *sna)
 
 						sna_crtc->bo = kgem_bo_reference(bo);
 						sna_crtc->bo->active_scanout++;
-						sna_crtc->client_bo = kgem_bo_reference(bo);
+						sna_crtc->cache_bo = kgem_bo_reference(bo);
 					} else {
 						BoxRec box;
 						DrawableRec tmp;
@@ -7261,11 +7831,11 @@ disable1:
 							xf86DrvMsg(crtc->scrn->scrnIndex, X_ERROR,
 								   "%s: page flipping failed, disabling CRTC:%d (pipe=%d)\n",
 								   __FUNCTION__, sna_crtc->id, sna_crtc->pipe);
-							sna_crtc_disable(crtc);
+							sna_crtc_disable(crtc, false);
 						}
 
 						kgem_bo_destroy(&sna->kgem, bo);
-						sna_crtc->client_bo = NULL;
+						sna_crtc->cache_bo = NULL;
 					}
 					continue;
 				}
@@ -7279,7 +7849,9 @@ disable1:
 				sna_crtc->flip_serial = sna_crtc->mode_serial;
 				sna_crtc->flip_pending = true;
 
-				sna_crtc->client_bo = kgem_bo_reference(sna_crtc->bo);
+				sna_crtc->cache_bo = kgem_bo_reference(sna_crtc->bo);
+				DBG(("%s: recording flip on CRTC:%d handle=%d, active_scanout=%d, serial=%d\n",
+				     __FUNCTION__, sna_crtc->id, sna_crtc->flip_bo->handle, sna_crtc->flip_bo->active_scanout, sna_crtc->flip_serial));
 			} else {
 				sna_crtc_redisplay(crtc, &damage, sna_crtc->bo);
 				kgem_scanout_flush(&sna->kgem, sna_crtc->bo);
@@ -7320,7 +7892,6 @@ disable1:
 				continue;
 
 			assert(config->crtc[i]->enabled);
-			assert(crtc->dpms_mode <= DPMSModeOn);
 			assert(crtc->flip_bo == NULL);
 
 			arg.crtc_id = crtc->id;
@@ -7365,8 +7936,10 @@ fixup_shadow:
 				y = crtc->base->y;
 			}
 
-			if (crtc->bo == flip_bo)
+			if (crtc->bo == flip_bo) {
+				assert(crtc->bo->refcnt >= crtc->bo->active_scanout);
 				continue;
+			}
 
 			if (flip_bo->pitch != crtc->bo->pitch || (y << 16 | x)  != crtc->offset) {
 				DBG(("%s: changing pitch (new %d =?= old %d) or offset (new %x =?= old %x)\n",
@@ -7375,6 +7948,9 @@ fixup_shadow:
 				     y << 16 | x, crtc->offset));
 fixup_flip:
 				if (sna_crtc_flip(sna, crtc, flip_bo, x, y)) {
+					DBG(("%s: removing handle=%d [active_scanout=%d] from scanout, installing handle=%d [active_scanout=%d]\n",
+					     __FUNCTION__, crtc->bo->handle, crtc->bo->active_scanout-1,
+					     flip_bo->handle, flip_bo->active_scanout));
 					assert(flip_bo != crtc->bo);
 					assert(crtc->bo->active_scanout);
 					assert(crtc->bo->refcnt >= crtc->bo->active_scanout);
@@ -7401,7 +7977,7 @@ fixup_flip:
 					xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
 						   "%s: page flipping failed, disabling CRTC:%d (pipe=%d)\n",
 						   __FUNCTION__, crtc->id, crtc->pipe);
-					sna_crtc_disable(crtc->base);
+					sna_crtc_disable(crtc->base, false);
 				}
 				continue;
 			}
@@ -7421,6 +7997,9 @@ fixup_flip:
 			crtc->flip_serial = crtc->mode_serial;
 			crtc->flip_pending = true;
 
+			DBG(("%s: recording flip on CRTC:%d handle=%d, active_scanout=%d, serial=%d\n",
+			     __FUNCTION__, crtc->id, crtc->flip_bo->handle, crtc->flip_bo->active_scanout, crtc->flip_serial));
+
 			{
 				struct drm_i915_gem_busy busy = { flip_bo->handle };
 				if (drmIoctl(sna->kgem.fd, DRM_IOCTL_I915_GEM_BUSY, &busy) == 0) {
@@ -7495,13 +8074,18 @@ again:
 			{
 				struct drm_event_vblank *vbl = (struct drm_event_vblank *)e;
 				struct sna_crtc *crtc = (void *)(uintptr_t)vbl->user_data;
+				uint64_t msc;
 
 				/* Beware Zaphod! */
 				sna = to_sna(crtc->base->scrn);
 
-				crtc->swap.tv_sec = vbl->tv_sec;
-				crtc->swap.tv_usec = vbl->tv_usec;
-				crtc->swap.msc = msc64(crtc, vbl->sequence);
+				if (msc64(crtc, vbl->sequence, &msc)) {
+					DBG(("%s: recording last swap on pipe=%d, frame %d [%08llx], time %d.%06d\n",
+					     __FUNCTION__, crtc->pipe, vbl->sequence, (long long)msc, vbl->tv_sec, vbl->tv_usec));
+					crtc->swap.tv_sec = vbl->tv_sec;
+					crtc->swap.tv_usec = vbl->tv_usec;
+					crtc->swap.msc = msc;
+				}
 				crtc->flip_pending = false;
 
 				assert(crtc->flip_bo);
@@ -7509,8 +8093,9 @@ again:
 				assert(crtc->flip_bo->refcnt >= crtc->flip_bo->active_scanout);
 
 				if (crtc->flip_serial == crtc->mode_serial) {
-					DBG(("%s: removing handle=%d from scanout, installing handle=%d\n",
-					     __FUNCTION__, crtc->bo->handle, crtc->flip_bo->handle));
+					DBG(("%s: removing handle=%d [active_scanout=%d] from scanout, installing handle=%d [active_scanout=%d]\n",
+					     __FUNCTION__, crtc->bo->handle, crtc->bo->active_scanout - 1,
+					     crtc->flip_bo->handle, crtc->flip_bo->active_scanout));
 					assert(crtc->bo->active_scanout);
 					assert(crtc->bo->refcnt >= crtc->bo->active_scanout);
 					crtc->bo->active_scanout--;
diff --git a/src/sna/sna_display_fake.c b/src/sna/sna_display_fake.c
index 4d74c38..a07fe0f 100644
--- a/src/sna/sna_display_fake.c
+++ b/src/sna/sna_display_fake.c
@@ -192,7 +192,7 @@ static const xf86OutputFuncsRec sna_output_funcs = {
 static Bool
 sna_mode_resize(ScrnInfoPtr scrn, int width, int height)
 {
-	ScreenPtr screen = scrn->pScreen;
+	ScreenPtr screen = xf86ScrnToScreen(scrn);
 	PixmapPtr new_front;
 
 	DBG(("%s (%d, %d) -> (%d, %d)\n", __FUNCTION__,
@@ -262,6 +262,7 @@ static bool add_fake_output(struct sna *sna, bool late)
 	output->mm_height = 0;
 	output->interlaceAllowed = FALSE;
 	output->subpixel_order = SubPixelNone;
+	output->status = XF86OutputStatusDisconnected;
 
 	output->possible_crtcs = ~((1 << sna->mode.num_real_crtc) - 1);
 	output->possible_clones = ~((1 << sna->mode.num_real_output) - 1);
diff --git a/src/sna/sna_dri2.c b/src/sna/sna_dri2.c
index e5c4d53..83e652d 100644
--- a/src/sna/sna_dri2.c
+++ b/src/sna/sna_dri2.c
@@ -82,6 +82,18 @@ get_private(void *buffer)
 	return (struct sna_dri2_private *)((DRI2Buffer2Ptr)buffer+1);
 }
 
+pure static inline DRI2BufferPtr sna_pixmap_get_buffer(PixmapPtr pixmap)
+{
+	assert(pixmap->refcnt);
+	return ((void **)__get_private(pixmap, sna_pixmap_key))[2];
+}
+
+static inline void sna_pixmap_set_buffer(PixmapPtr pixmap, void *ptr)
+{
+	assert(pixmap->refcnt);
+	((void **)__get_private(pixmap, sna_pixmap_key))[2] = ptr;
+}
+
 #if DRI2INFOREC_VERSION >= 4
 enum event_type {
 	WAITMSC = 0,
@@ -98,6 +110,7 @@ struct dri_bo {
 	struct list link;
 	struct kgem_bo *bo;
 	uint32_t name;
+	int flags;
 };
 
 struct sna_dri2_event {
@@ -108,6 +121,7 @@ struct sna_dri2_event {
 	xf86CrtcPtr crtc;
 	int pipe;
 	bool queued;
+	bool sync;
 
 	/* for swaps & flips only */
 	DRI2SwapEventPtr event_complete;
@@ -121,10 +135,38 @@ struct sna_dri2_event {
 	struct list cache;
 	struct list link;
 
-	int mode;
+	int flip_continue;
+	int keepalive;
 };
 
+#if DRI2INFOREC_VERSION < 10
+#undef USE_ASYNC_SWAP
+#endif
+
+#if USE_ASYNC_SWAP
+#define KEEPALIVE 4 /* wait ~50ms before discarding swap caches */
+#define APPLY_DAMAGE 0
+#else
+#define USE_ASYNC_SWAP 0
+#define KEEPALIVE 1
+#define APPLY_DAMAGE 1
+#endif
+
 static void sna_dri2_flip_event(struct sna_dri2_event *flip);
+inline static DRI2BufferPtr dri2_window_get_front(WindowPtr win);
+
+static int front_pitch(DrawablePtr draw)
+{
+	DRI2BufferPtr buffer;
+
+	buffer = NULL;
+	if (draw->type != DRAWABLE_PIXMAP)
+		buffer = dri2_window_get_front((WindowPtr)draw);
+	if (buffer == NULL)
+		buffer = sna_pixmap_get_buffer(get_drawable_pixmap(draw));
+
+	return buffer ? buffer->pitch : 0;
+}
 
 static void
 sna_dri2_get_back(struct sna *sna,
@@ -134,17 +176,24 @@ sna_dri2_get_back(struct sna *sna,
 {
 	struct kgem_bo *bo;
 	uint32_t name;
+	int flags;
 	bool reuse;
 
-	DBG(("%s: draw size=%dx%d, buffer size=%dx%d\n",
+	DBG(("%s: draw size=%dx%d, back buffer handle=%d size=%dx%d, is-scanout? %d, pitch=%d, front pitch=%d, has-cache?=%d\n",
 	     __FUNCTION__, draw->width, draw->height,
-	     get_private(back)->size & 0xffff, get_private(back)->size >> 16));
+	     get_private(back)->bo->handle,
+	     get_private(back)->size & 0xffff, get_private(back)->size >> 16,
+	     get_private(back)->bo->scanout,
+	     back->pitch, front_pitch(draw), info!=NULL));
 	reuse = (draw->height << 16 | draw->width) == get_private(back)->size;
+	if (reuse && get_private(back)->bo->scanout)
+		reuse = front_pitch(draw) == back->pitch;
+	DBG(("%s: reuse backbuffer? %d\n", __FUNCTION__, reuse));
 	if (reuse) {
 		bo = get_private(back)->bo;
 		assert(bo->refcnt);
 		DBG(("%s: back buffer handle=%d, scanout?=%d, refcnt=%d\n",
-					__FUNCTION__, bo->handle, bo->active_scanout, get_private(back)->refcnt));
+		     __FUNCTION__, bo->handle, bo->active_scanout, get_private(back)->refcnt));
 		if (bo->active_scanout == 0) {
 			DBG(("%s: reuse unattached back\n", __FUNCTION__));
 			get_private(back)->stale = false;
@@ -156,12 +205,16 @@ sna_dri2_get_back(struct sna *sna,
 	if (info) {
 		struct dri_bo *c;
 		list_for_each_entry(c, &info->cache, link) {
-			if (c->bo && c->bo->scanout == 0) {
+			DBG(("%s: cache: handle=%d, active=%d\n",
+			     __FUNCTION__, c->bo ? c->bo->handle : 0, c->bo ? c->bo->active_scanout : -1));
+			if (c->bo && c->bo->active_scanout == 0) {
 				bo = c->bo;
 				name = c->name;
-				DBG(("%s: reuse cache handle=%d\n", __FUNCTION__, bo->handle));
+				flags = c->flags;
+				DBG(("%s: reuse cache handle=%d, name=%d, flags=%d\n", __FUNCTION__, bo->handle, name, flags));
 				list_move_tail(&c->link, &info->cache);
 				c->bo = NULL;
+				break;
 			}
 		}
 	}
@@ -179,6 +232,23 @@ sna_dri2_get_back(struct sna *sna,
 			kgem_bo_destroy(&sna->kgem, bo);
 			return;
 		}
+
+		flags = 0;
+		if (USE_ASYNC_SWAP && back->flags) {
+			BoxRec box;
+
+			box.x1 = 0;
+			box.y1 = 0;
+			box.x2 = draw->width;
+			box.y2 = draw->height;
+
+			DBG(("%s: filling new buffer with old back\n", __FUNCTION__));
+			if (sna->render.copy_boxes(sna, GXcopy,
+						   draw, get_private(back)->bo, 0, 0,
+						   draw, bo, 0, 0,
+						   &box, 1, 0))
+				flags = back->flags;
+		}
 	}
 	assert(bo->active_scanout == 0);
 
@@ -198,11 +268,13 @@ sna_dri2_get_back(struct sna *sna,
 		if (c != NULL) {
 			c->bo = ref(get_private(back)->bo);
 			c->name = back->name;
+			c->flags = back->flags;
 			list_add(&c->link, &info->cache);
-			DBG(("%s: cacheing handle=%d (name=%d)\n", __FUNCTION__, c->bo->handle, c->name));
+			DBG(("%s: cacheing handle=%d (name=%d, flags=%d, active_scanout=%d)\n", __FUNCTION__, c->bo->handle, c->name, c->flags, c->bo->active_scanout));
 		}
 	}
 
+	assert(bo->active_scanout == 0);
 	assert(bo != get_private(back)->bo);
 	kgem_bo_destroy(&sna->kgem, get_private(back)->bo);
 
@@ -210,6 +282,7 @@ sna_dri2_get_back(struct sna *sna,
 	get_private(back)->size = draw->height << 16 | draw->width;
 	back->pitch = bo->pitch;
 	back->name = name;
+	back->flags = flags;
 
 	get_private(back)->stale = false;
 }
@@ -247,6 +320,7 @@ inline static void *dri2_window_get_front(WindowPtr win) { return NULL; }
 
 #define xorg_can_triple_buffer() 0
 #define swap_limit(d, l) false
+#define mark_stale(b)
 
 #else
 
@@ -273,6 +347,8 @@ mark_stale(DRI2BufferPtr back)
 	 * stale frame. (This is mostly useful for tracking down
 	 * driver bugs!)
 	 */
+	DBG(("%s(handle=%d) => %d\n", __FUNCTION__,
+	     get_private(back)->bo->handle, xorg_can_triple_buffer()));
 	get_private(back)->stale = xorg_can_triple_buffer();
 }
 
@@ -291,15 +367,16 @@ sna_dri2_reuse_buffer(DrawablePtr draw, DRI2BufferPtr buffer)
 	     buffer->attachment, get_private(buffer)->bo->handle, buffer->name));
 	assert(get_private(buffer)->refcnt);
 	assert(get_private(buffer)->bo->refcnt > get_private(buffer)->bo->active_scanout);
+	assert(kgem_bo_flink(&to_sna_from_drawable(draw)->kgem, get_private(buffer)->bo) == buffer->name);
 
 	if (buffer->attachment == DRI2BufferBackLeft &&
 	    draw->type != DRAWABLE_PIXMAP) {
-		DBG(("%s: replacing back buffer\n", __FUNCTION__));
+		DBG(("%s: replacing back buffer on window %ld\n", __FUNCTION__, draw->id));
 		sna_dri2_get_back(to_sna_from_drawable(draw), draw, buffer, dri2_chain(draw));
-
-		assert(kgem_bo_flink(&to_sna_from_drawable(draw)->kgem, get_private(buffer)->bo) == buffer->name);
 		assert(get_private(buffer)->bo->refcnt);
 		assert(get_private(buffer)->bo->active_scanout == 0);
+		assert(kgem_bo_flink(&to_sna_from_drawable(draw)->kgem, get_private(buffer)->bo) == buffer->name);
+		DBG(("reusing back buffer, age = %d\n", buffer->flags));
 	}
 }
 
@@ -314,11 +391,6 @@ static bool swap_limit(DrawablePtr draw, int limit)
 }
 #endif
 
-#if DRI2INFOREC_VERSION < 10
-#undef USE_ASYNC_SWAP
-#define USE_ASYNC_SWAP 0
-#endif
-
 #define COLOR_PREFER_TILING_Y 0
 
 /* Prefer to enable TILING_Y if this buffer will never be a
@@ -382,24 +454,12 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
 	tiling = color_tiling(sna, &pixmap->drawable);
 	if (tiling < 0)
 		tiling = -tiling;
-	if (priv->gpu_bo->tiling != tiling)
+	if (priv->gpu_bo->tiling != tiling && !priv->gpu_bo->scanout)
 		sna_pixmap_change_tiling(pixmap, tiling);
 
 	return priv->gpu_bo;
 }
 
-pure static inline void *sna_pixmap_get_buffer(PixmapPtr pixmap)
-{
-	assert(pixmap->refcnt);
-	return ((void **)__get_private(pixmap, sna_pixmap_key))[2];
-}
-
-static inline void sna_pixmap_set_buffer(PixmapPtr pixmap, void *ptr)
-{
-	assert(pixmap->refcnt);
-	((void **)__get_private(pixmap, sna_pixmap_key))[2] = ptr;
-}
-
 void
 sna_dri2_pixmap_update_bo(struct sna *sna, PixmapPtr pixmap, struct kgem_bo *bo)
 {
@@ -449,9 +509,9 @@ sna_dri2_create_buffer(DrawablePtr draw,
 	struct sna_dri2_private *private;
 	PixmapPtr pixmap;
 	struct kgem_bo *bo;
+	unsigned bpp = format ?: draw->bitsPerPixel;
 	unsigned flags = 0;
 	uint32_t size;
-	int bpp;
 
 	DBG(("%s pixmap=%ld, (attachment=%d, format=%d, drawable=%dx%d), window?=%d\n",
 	     __FUNCTION__,
@@ -472,7 +532,7 @@ sna_dri2_create_buffer(DrawablePtr draw,
 		if (buffer) {
 			private = get_private(buffer);
 
-			DBG(("%s: reusing front buffer attachment, win=%lu %dx%d, pixmap=%ld [%ld] %dx%d, handle=%d, name=%d\n",
+			DBG(("%s: reusing front buffer attachment, win=%lu %dx%d, pixmap=%ld [%ld] %dx%d, handle=%d, name=%d, active_scanout=%d\n",
 			     __FUNCTION__,
 			     draw->type != DRAWABLE_PIXMAP ? (long)draw->id : (long)0,
 			     draw->width, draw->height,
@@ -480,12 +540,14 @@ sna_dri2_create_buffer(DrawablePtr draw,
 			     private->pixmap->drawable.serialNumber,
 			     pixmap->drawable.width,
 			     pixmap->drawable.height,
-			     private->bo->handle, buffer->name));
+			     private->bo->handle, buffer->name,
+			     private->bo->active_scanout));
 
 			assert(private->pixmap == pixmap);
 			assert(sna_pixmap(pixmap)->flush);
 			assert(sna_pixmap(pixmap)->pinned & PIN_DRI2);
 			assert(kgem_bo_flink(&sna->kgem, private->bo) == buffer->name);
+			assert(private->bo->pitch == buffer->pitch);
 
 			private->refcnt++;
 			return buffer;
@@ -498,7 +560,6 @@ sna_dri2_create_buffer(DrawablePtr draw,
 		assert(sna_pixmap(pixmap) != NULL);
 
 		bo = ref(bo);
-		bpp = pixmap->drawable.bitsPerPixel;
 		if (pixmap == sna->front && !(sna->flags & SNA_LINEAR_FB))
 			flags |= CREATE_SCANOUT;
 		DBG(("%s: attaching to front buffer %dx%d [%p:%d], scanout? %d\n",
@@ -506,6 +567,7 @@ sna_dri2_create_buffer(DrawablePtr draw,
 		     pixmap->drawable.width, pixmap->drawable.height,
 		     pixmap, pixmap->refcnt, flags & CREATE_SCANOUT));
 		size = (uint32_t)pixmap->drawable.height << 16 | pixmap->drawable.width;
+		bpp = pixmap->drawable.bitsPerPixel;
 		break;
 
 	case DRI2BufferBackLeft:
@@ -514,6 +576,7 @@ sna_dri2_create_buffer(DrawablePtr draw,
 				flags |= CREATE_SCANOUT;
 			if (draw->width  == sna->front->drawable.width &&
 			    draw->height == sna->front->drawable.height &&
+			    draw->bitsPerPixel == bpp &&
 			    (sna->flags & (SNA_LINEAR_FB | SNA_NO_WAIT | SNA_NO_FLIP)) == 0)
 				flags |= CREATE_SCANOUT;
 		}
@@ -521,7 +584,6 @@ sna_dri2_create_buffer(DrawablePtr draw,
 	case DRI2BufferFrontRight:
 	case DRI2BufferFakeFrontLeft:
 	case DRI2BufferFakeFrontRight:
-		bpp = draw->bitsPerPixel;
 		DBG(("%s: creating back buffer %dx%d, suitable for scanout? %d\n",
 		     __FUNCTION__,
 		     draw->width, draw->height,
@@ -530,7 +592,7 @@ sna_dri2_create_buffer(DrawablePtr draw,
 		bo = kgem_create_2d(&sna->kgem,
 				    draw->width,
 				    draw->height,
-				    draw->bitsPerPixel,
+				    bpp,
 				    color_tiling(sna, draw),
 				    flags);
 		break;
@@ -558,7 +620,6 @@ sna_dri2_create_buffer(DrawablePtr draw,
 		 * not understand W tiling and the GTT is incapable of
 		 * W fencing.
 		 */
-		bpp = format ? format : draw->bitsPerPixel;
 		bpp *= 2;
 		bo = kgem_create_2d(&sna->kgem,
 				    ALIGN(draw->width, 64),
@@ -570,7 +631,6 @@ sna_dri2_create_buffer(DrawablePtr draw,
 	case DRI2BufferDepthStencil:
 	case DRI2BufferHiz:
 	case DRI2BufferAccum:
-		bpp = format ? format : draw->bitsPerPixel,
 		bo = kgem_create_2d(&sna->kgem,
 				    draw->width, draw->height, bpp,
 				    other_tiling(sna, draw),
@@ -746,7 +806,6 @@ static void set_bo(PixmapPtr pixmap, struct kgem_bo *bo)
 {
 	struct sna *sna = to_sna_from_pixmap(pixmap);
 	struct sna_pixmap *priv = sna_pixmap(pixmap);
-	RegionRec region;
 
 	DBG(("%s: pixmap=%ld, handle=%d\n",
 	     __FUNCTION__, pixmap->drawable.serialNumber, bo->handle));
@@ -758,15 +817,18 @@ static void set_bo(PixmapPtr pixmap, struct kgem_bo *bo)
 	assert((priv->pinned & (PIN_PRIME | PIN_DRI3)) == 0);
 	assert(priv->flush);
 
-	/* Post damage on the new front buffer so that listeners, such
-	 * as DisplayLink know take a copy and shove it over the USB,
-	 * also for software cursors and the like.
-	 */
-	region.extents.x1 = region.extents.y1 = 0;
-	region.extents.x2 = pixmap->drawable.width;
-	region.extents.y2 = pixmap->drawable.height;
-	region.data = NULL;
-	DamageRegionAppend(&pixmap->drawable, &region);
+	if (APPLY_DAMAGE) {
+		RegionRec region;
+		/* Post damage on the new front buffer so that listeners, such
+		 * as DisplayLink, know to take a copy and shove it over the USB,
+		 * also for software cursors and the like.
+		 */
+		region.extents.x1 = region.extents.y1 = 0;
+		region.extents.x2 = pixmap->drawable.width;
+		region.extents.y2 = pixmap->drawable.height;
+		region.data = NULL;
+		DamageRegionAppend(&pixmap->drawable, &region);
+	}
 
 	damage(pixmap, priv, NULL);
 
@@ -792,7 +854,8 @@ static void set_bo(PixmapPtr pixmap, struct kgem_bo *bo)
 		bo->domain = DOMAIN_NONE;
 	assert(bo->flush);
 
-	DamageRegionProcessPending(&pixmap->drawable);
+	if (APPLY_DAMAGE)
+		DamageRegionProcessPending(&pixmap->drawable);
 }
 
 static void sna_dri2_select_mode(struct sna *sna, struct kgem_bo *dst, struct kgem_bo *src, bool sync)
@@ -823,6 +886,12 @@ static void sna_dri2_select_mode(struct sna *sna, struct kgem_bo *dst, struct kg
 		return;
 	}
 
+	if (sna->render_state.gt < 2 && sna->kgem.has_semaphores) {
+		DBG(("%s: small GT [%d], not forcing selection\n",
+		     __FUNCTION__, sna->render_state.gt));
+		return;
+	}
+
 	VG_CLEAR(busy);
 	busy.handle = src->handle;
 	if (drmIoctl(sna->kgem.fd, DRM_IOCTL_I915_GEM_BUSY, &busy))
@@ -871,10 +940,12 @@ static bool is_front(int attachment)
 	return attachment == DRI2BufferFrontLeft;
 }
 
+#define DRI2_SYNC 0x1
+#define DRI2_DAMAGE 0x2
 static struct kgem_bo *
 __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
 		      DRI2BufferPtr src, DRI2BufferPtr dst,
-		      bool sync)
+		      unsigned flags)
 {
 	PixmapPtr pixmap = get_drawable_pixmap(draw);
 	DrawableRec scratch, *src_draw = &pixmap->drawable, *dst_draw = &pixmap->drawable;
@@ -886,7 +957,7 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
 	struct kgem_bo *dst_bo;
 	const BoxRec *boxes;
 	int16_t dx, dy, sx, sy;
-	unsigned flags;
+	unsigned hint;
 	int n;
 
 	/* To hide a stale DRI2Buffer, one may choose to substitute
@@ -962,7 +1033,7 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
 			}
 		}
 	} else
-		sync = false;
+		flags &= ~DRI2_SYNC;
 
 	scratch.x = scratch.y = 0;
 	scratch.width = scratch.height = 0;
@@ -1013,12 +1084,12 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
 		/* Preserve the CRTC shadow overrides */
 		sna_shadow_steal_crtcs(sna, &shadow);
 
-		flags = MOVE_WRITE | __MOVE_FORCE;
+		hint = MOVE_WRITE | __MOVE_FORCE;
 		if (clip.data)
-			flags |= MOVE_READ;
+			hint |= MOVE_READ;
 
 		assert(region == NULL || region == &clip);
-		priv = sna_pixmap_move_area_to_gpu(pixmap, &clip.extents, flags);
+		priv = sna_pixmap_move_area_to_gpu(pixmap, &clip.extents, hint);
 		if (priv) {
 			damage(pixmap, priv, region);
 			dst_bo = priv->gpu_bo;
@@ -1050,20 +1121,20 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
 		assert(region == NULL || region == &clip);
 		pixman_region_intersect(&clip, &clip, &target);
 
-		sync = false;
+		flags &= ~DRI2_SYNC;
 	}
 
 	if (!wedged(sna)) {
 		xf86CrtcPtr crtc;
 
 		crtc = NULL;
-		if (sync && sna_pixmap_is_scanout(sna, pixmap))
+		if (flags & DRI2_SYNC && sna_pixmap_is_scanout(sna, pixmap))
 			crtc = sna_covering_crtc(sna, &clip.extents, NULL);
 		sna_dri2_select_mode(sna, dst_bo, src_bo, crtc != NULL);
 
-		sync = (crtc != NULL&&
-			sna_wait_for_scanline(sna, pixmap, crtc,
-					      &clip.extents));
+		if (crtc == NULL ||
+		    !sna_wait_for_scanline(sna, pixmap, crtc, &clip.extents))
+			flags &= ~DRI2_SYNC;
 	}
 
 	if (region) {
@@ -1075,8 +1146,8 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
 		boxes = &clip.extents;
 		n = 1;
 	}
-	DamageRegionAppend(&pixmap->drawable, region);
-
+	if (APPLY_DAMAGE || flags & DRI2_DAMAGE)
+		DamageRegionAppend(&pixmap->drawable, region);
 
 	DBG(("%s: copying [(%d, %d), (%d, %d)]x%d src=(%d, %d), dst=(%d, %d)\n",
 	     __FUNCTION__,
@@ -1084,20 +1155,20 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
 	     boxes[0].x2, boxes[0].y2,
 	     n, sx, sy, dx, dy));
 
-	flags = COPY_LAST;
-	if (sync)
-		flags |= COPY_SYNC;
+	hint = COPY_LAST;
+	if (flags & DRI2_SYNC)
+		hint |= COPY_SYNC;
 	if (!sna->render.copy_boxes(sna, GXcopy,
 				    src_draw, src_bo, sx, sy,
 				    dst_draw, dst_bo, dx, dy,
-				    boxes, n, flags))
+				    boxes, n, hint))
 		memcpy_copy_boxes(sna, GXcopy,
 				  src_draw, src_bo, sx, sy,
 				  dst_draw, dst_bo, dx, dy,
-				  boxes, n, flags);
+				  boxes, n, hint);
 
-	DBG(("%s: flushing? %d\n", __FUNCTION__, sync));
-	if (sync) { /* STAT! */
+	DBG(("%s: flushing? %d\n", __FUNCTION__, !!(flags & DRI2_SYNC)));
+	if (flags & DRI2_SYNC) { /* STAT! */
 		struct kgem_request *rq = sna->kgem.next_request;
 		kgem_submit(&sna->kgem);
 		if (rq->bo) {
@@ -1106,7 +1177,8 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
 		}
 	}
 
-	DamageRegionProcessPending(&pixmap->drawable);
+	if (APPLY_DAMAGE || flags & DRI2_DAMAGE)
+		DamageRegionProcessPending(&pixmap->drawable);
 
 	if (clip.data)
 		pixman_region_fini(&clip);
@@ -1114,6 +1186,15 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
 	return bo;
 }
 
+inline static void
+__sna_dri2_copy_event(struct sna_dri2_event *info, unsigned flags)
+{
+	info->bo = __sna_dri2_copy_region(info->sna, info->draw, NULL,
+					  info->back, info->front,
+					  flags);
+	info->front->flags = info->back->flags;
+}
+
 static void
 sna_dri2_copy_region(DrawablePtr draw,
 		     RegionPtr region,
@@ -1151,7 +1232,7 @@ sna_dri2_copy_region(DrawablePtr draw,
 	     region->extents.x2, region->extents.y2,
 	     region_num_rects(region)));
 
-	__sna_dri2_copy_region(sna, draw, region, src, dst, false);
+	__sna_dri2_copy_region(sna, draw, region, src, dst, DRI2_DAMAGE);
 }
 
 inline static uint32_t pipe_select(int pipe)
@@ -1281,6 +1362,9 @@ sna_dri2_event_free(struct sna_dri2_event *info)
 	DrawablePtr draw = info->draw;
 
 	DBG(("%s(draw?=%d)\n", __FUNCTION__, draw != NULL));
+	if (info->sna->dri2.flip_pending == info)
+		info->sna->dri2.flip_pending = info->chain;
+	assert(info->sna->dri2.flip_pending != info);
 	if (draw && draw->type == DRAWABLE_WINDOW)
 		sna_dri2_remove_event((WindowPtr)draw, info);
 
@@ -1338,6 +1422,7 @@ sna_dri2_client_gone(CallbackListPtr *list, void *closure, void *data)
 						      event);
 			event->client = NULL;
 			event->draw = NULL;
+			event->flip_continue = 0;
 			list_del(&event->link);
 		} else
 			sna_dri2_event_free(event);
@@ -1452,8 +1537,10 @@ void sna_dri2_destroy_window(WindowPtr win)
 
 		chain = priv->chain;
 		while ((info = chain)) {
+			assert(info->draw == &win->drawable);
 			info->draw = NULL;
 			info->client = NULL;
+			info->flip_continue = 0;
 			list_del(&info->link);
 
 			chain = info->chain;
@@ -1479,7 +1566,7 @@ sna_dri2_flip(struct sna_dri2_event *info)
 {
 	struct kgem_bo *bo = get_private(info->back)->bo;
 	struct kgem_bo *tmp_bo;
-	uint32_t tmp_name;
+	uint32_t tmp_name, tmp_flags;
 	int tmp_pitch;
 
 	DBG(("%s(type=%d)\n", __FUNCTION__, info->type));
@@ -1488,6 +1575,12 @@ sna_dri2_flip(struct sna_dri2_event *info)
 	assert(get_drawable_pixmap(info->draw)->drawable.height * bo->pitch <= kgem_bo_size(bo));
 	assert(bo->refcnt);
 
+	if (info->sna->mode.flip_active) {
+		DBG(("%s: %d flips still active, aborting\n",
+		     __FUNCTION__, info->sna->mode.flip_active));
+		return false;
+	}
+
 	if (!sna_page_flip(info->sna, bo, sna_dri2_flip_handler,
 			   info->type == FLIP_ASYNC ? NULL : info))
 		return false;
@@ -1505,13 +1598,16 @@ sna_dri2_flip(struct sna_dri2_event *info)
 	tmp_bo = get_private(info->front)->bo;
 	tmp_name = info->front->name;
 	tmp_pitch = info->front->pitch;
+	tmp_flags = info->front->flags;
 
 	set_bo(info->sna->front, bo);
 
+	info->front->flags = info->back->flags;
 	info->front->name = info->back->name;
 	info->front->pitch = info->back->pitch;
 	get_private(info->front)->bo = bo;
 
+	info->back->flags = tmp_flags;
 	info->back->name = tmp_name;
 	info->back->pitch = tmp_pitch;
 	get_private(info->back)->bo = tmp_bo;
@@ -1521,6 +1617,7 @@ sna_dri2_flip(struct sna_dri2_event *info)
 	assert(get_private(info->back)->bo->refcnt);
 	assert(get_private(info->front)->bo != get_private(info->back)->bo);
 
+	info->keepalive = KEEPALIVE;
 	info->queued = true;
 	return true;
 }
@@ -1549,15 +1646,16 @@ can_flip(struct sna * sna,
 	}
 
 	assert(sna->scrn->vtSema);
+	assert(!sna->mode.hidden);
 
 	if ((sna->flags & (SNA_HAS_FLIP | SNA_HAS_ASYNC_FLIP)) == 0) {
 		DBG(("%s: no, pageflips disabled\n", __FUNCTION__));
 		return false;
 	}
 
-	if (front->format != back->format) {
+	if (front->cpp != back->cpp) {
 		DBG(("%s: no, format mismatch, front = %d, back = %d\n",
-		     __FUNCTION__, front->format, back->format));
+		     __FUNCTION__, front->cpp, back->cpp));
 		return false;
 	}
 
@@ -1680,9 +1778,9 @@ can_xchg(struct sna *sna,
 	if (draw->type == DRAWABLE_PIXMAP)
 		return false;
 
-	if (front->format != back->format) {
+	if (front->cpp != back->cpp) {
 		DBG(("%s: no, format mismatch, front = %d, back = %d\n",
-		     __FUNCTION__, front->format, back->format));
+		     __FUNCTION__, front->cpp, back->cpp));
 		return false;
 	}
 
@@ -1785,9 +1883,9 @@ can_xchg_crtc(struct sna *sna,
 	if (draw->type == DRAWABLE_PIXMAP)
 		return false;
 
-	if (front->format != back->format) {
+	if (front->cpp != back->cpp) {
 		DBG(("%s: no, format mismatch, front = %d, back = %d\n",
-		     __FUNCTION__, front->format, back->format));
+		     __FUNCTION__, front->cpp, back->cpp));
 		return false;
 	}
 
@@ -1876,10 +1974,10 @@ sna_dri2_xchg(DrawablePtr draw, DRI2BufferPtr front, DRI2BufferPtr back)
 	     pixmap->drawable.width,
 	     pixmap->drawable.height));
 
-	DBG(("%s: back_bo pitch=%d, size=%d, ref=%d, active_scanout?=%d\n",
-	     __FUNCTION__, back_bo->pitch, kgem_bo_size(back_bo), back_bo->refcnt, back_bo->active_scanout));
-	DBG(("%s: front_bo pitch=%d, size=%d, ref=%d, active_scanout?=%d\n",
-	     __FUNCTION__, front_bo->pitch, kgem_bo_size(front_bo), front_bo->refcnt, front_bo->active_scanout));
+	DBG(("%s: back_bo handle=%d, pitch=%d, size=%d, ref=%d, active_scanout?=%d\n",
+	     __FUNCTION__, back_bo->handle, back_bo->pitch, kgem_bo_size(back_bo), back_bo->refcnt, back_bo->active_scanout));
+	DBG(("%s: front_bo handle=%d, pitch=%d, size=%d, ref=%d, active_scanout?=%d\n",
+	     __FUNCTION__, front_bo->handle, front_bo->pitch, kgem_bo_size(front_bo), front_bo->refcnt, front_bo->active_scanout));
 	assert(front_bo->refcnt);
 	assert(back_bo->refcnt);
 
@@ -1902,6 +2000,10 @@ sna_dri2_xchg(DrawablePtr draw, DRI2BufferPtr front, DRI2BufferPtr back)
 	front->pitch = back->pitch;
 	back->pitch = tmp;
 
+	tmp = front->flags;
+	front->flags = back->flags;
+	back->flags = tmp;
+
 	assert(front_bo->refcnt);
 	assert(back_bo->refcnt);
 
@@ -1923,9 +2025,11 @@ static void sna_dri2_xchg_crtc(struct sna *sna, DrawablePtr draw, xf86CrtcPtr cr
 	     get_window_pixmap(win)->drawable.width,
 	     get_window_pixmap(win)->drawable.height));
 
-	DamageRegionAppend(&win->drawable, &win->clipList);
+	if (APPLY_DAMAGE)
+		DamageRegionAppend(&win->drawable, &win->clipList);
 	sna_shadow_set_crtc(sna, crtc, get_private(back)->bo);
-	DamageRegionProcessPending(&win->drawable);
+	if (APPLY_DAMAGE)
+		DamageRegionProcessPending(&win->drawable);
 
 	assert(dri2_window(win)->front == NULL);
 
@@ -1933,8 +2037,8 @@ static void sna_dri2_xchg_crtc(struct sna *sna, DrawablePtr draw, xf86CrtcPtr cr
 	if (tmp == NULL) {
 		back->attachment = -1;
 		if (get_private(back)->proxy == NULL) {
-			get_private(back)->pixmap = get_window_pixmap(win);
-			get_private(back)->proxy = sna_dri2_reference_buffer(sna_pixmap_get_buffer(get_private(back)->pixmap));
+			get_private(back)->pixmap = get_private(front)->pixmap;
+			get_private(back)->proxy = sna_dri2_reference_buffer(get_private(front)->proxy ?: front);
 		}
 		dri2_window(win)->front = sna_dri2_reference_buffer(back);
 		return;
@@ -1946,26 +2050,35 @@ static void sna_dri2_xchg_crtc(struct sna *sna, DrawablePtr draw, xf86CrtcPtr cr
 	get_private(tmp)->refcnt = 1;
 	get_private(tmp)->bo = get_private(back)->bo;
 	get_private(tmp)->size = get_private(back)->size;
-	get_private(tmp)->pixmap = get_window_pixmap(win);
-	get_private(tmp)->proxy = sna_dri2_reference_buffer(sna_pixmap_get_buffer(get_private(tmp)->pixmap));
+	get_private(tmp)->pixmap = get_private(front)->pixmap;
+	get_private(tmp)->proxy = sna_dri2_reference_buffer(get_private(front)->proxy ?: front);
+
 	dri2_window(win)->front = tmp;
 
-	DBG(("%s: allocating new backbuffer\n", __FUNCTION__));
-	back->name = 0;
-	bo = kgem_create_2d(&sna->kgem,
-			    draw->width, draw->height, draw->bitsPerPixel,
-			    get_private(back)->bo->tiling,
-			    CREATE_SCANOUT);
-	if (bo != NULL) {
-		get_private(back)->bo = bo;
-		back->pitch = bo->pitch;
-		back->name = kgem_bo_flink(&sna->kgem, bo);
-	}
-	if (back->name == 0) {
-		if (bo != NULL)
-			kgem_bo_destroy(&sna->kgem, bo);
-		get_private(back)->bo = NULL;
+	if (get_private(front)->proxy) {
+		DBG(("%s: reusing current proxy frontbuffer\n", __FUNCTION__));
+		front->attachment = DRI2BufferBackLeft;
+		ref(get_private(tmp)->bo);
 		back->attachment = -1;
+	} else {
+		DBG(("%s: allocating new backbuffer\n", __FUNCTION__));
+		back->name = 0;
+		back->flags = 0;
+		bo = kgem_create_2d(&sna->kgem,
+				    draw->width, draw->height, draw->bitsPerPixel,
+				    get_private(back)->bo->tiling,
+				    CREATE_SCANOUT);
+		if (bo != NULL) {
+			get_private(back)->bo = bo;
+			back->pitch = bo->pitch;
+			back->name = kgem_bo_flink(&sna->kgem, bo);
+		}
+		if (back->name == 0) {
+			if (bo != NULL)
+				kgem_bo_destroy(&sna->kgem, bo);
+			get_private(back)->bo = NULL;
+			back->attachment = -1;
+		}
 	}
 }
 
@@ -2022,9 +2135,9 @@ static void chain_swap(struct sna_dri2_event *chain)
 	if (chain->queued) /* too early! */
 		return;
 
-	assert(chain == dri2_chain(chain->draw));
 	DBG(("%s: chaining draw=%ld, type=%d\n",
 	     __FUNCTION__, (long)chain->draw->id, chain->type));
+	assert(chain == dri2_chain(chain->draw));
 	chain->queued = true;
 
 	switch (chain->type) {
@@ -2053,9 +2166,7 @@ static void chain_swap(struct sna_dri2_event *chain)
 			sna_dri2_xchg_crtc(chain->sna, chain->draw, chain->crtc, chain->front, chain->back);
 		} else {
 			assert(chain->queued);
-			chain->bo = __sna_dri2_copy_region(chain->sna, chain->draw, NULL,
-							   chain->back, chain->front,
-							   true);
+			__sna_dri2_copy_event(chain, 0);
 		}
 	case SWAP:
 		break;
@@ -2069,15 +2180,13 @@ static void chain_swap(struct sna_dri2_event *chain)
 		DRM_VBLANK_EVENT;
 	vbl.request.sequence = 1;
 	vbl.request.signal = (uintptr_t)chain;
-	if (sna_wait_vblank(chain->sna, &vbl, chain->pipe)) {
+	if ((chain->type == SWAP_THROTTLE &&
+	     !swap_limit(chain->draw, 2 + !chain->sync) &&
+	     !chain->sync) ||
+	    sna_wait_vblank(chain->sna, &vbl, chain->pipe)) {
 		DBG(("%s: vblank wait failed, unblocking client\n", __FUNCTION__));
 		frame_swap_complete(chain, DRI2_BLIT_COMPLETE);
 		sna_dri2_event_free(chain);
-	} else {
-		if (chain->type == SWAP_THROTTLE && !swap_limit(chain->draw, 2)) {
-			DBG(("%s: fake triple buffering, unblocking client\n", __FUNCTION__));
-			frame_swap_complete(chain, DRI2_BLIT_COMPLETE);
-		}
 	}
 }
 
@@ -2163,8 +2272,7 @@ void sna_dri2_vblank_handler(struct drm_event_vblank *event)
 			info->type = SWAP_WAIT;
 		}  else {
 			assert(info->queued);
-			info->bo = __sna_dri2_copy_region(sna, draw, NULL,
-							  info->back, info->front, true);
+			__sna_dri2_copy_event(info, DRI2_SYNC);
 			info->type = SWAP_WAIT;
 		}
 
@@ -2230,101 +2338,108 @@ done:
 	DBG(("%s complete\n", __FUNCTION__));
 }
 
-static bool
+static void
 sna_dri2_immediate_blit(struct sna *sna,
 			struct sna_dri2_event *info,
-			bool sync, bool event)
+			bool sync)
 {
 	DrawablePtr draw = info->draw;
-	bool ret = false;
+	struct sna_dri2_event *chain = dri2_chain(draw);
 
 	if (sna->flags & SNA_NO_WAIT)
 		sync = false;
 
-	DBG(("%s: emitting immediate blit, throttling client, synced? %d, chained? %d, send-event? %d\n",
-	     __FUNCTION__, sync, dri2_chain(draw) != info,
-	     event));
+	DBG(("%s: emitting immediate blit, throttling client, synced? %d, chained? %d\n",
+	     __FUNCTION__, sync, chain != info));
 
 	info->type = SWAP_THROTTLE;
-	if (!sync || dri2_chain(draw) == info) {
-		DBG(("%s: no pending blit, starting chain\n",
-		     __FUNCTION__));
+	info->sync = sync;
+	if (chain == info) {
+		union drm_wait_vblank vbl;
+
+		DBG(("%s: no pending blit, starting chain\n", __FUNCTION__));
 
 		info->queued = true;
-		info->bo = __sna_dri2_copy_region(sna, draw, NULL,
-						  info->back,
-						  info->front,
-						  sync);
-		if (event) {
-			if (sync) {
-				union drm_wait_vblank vbl;
-
-				VG_CLEAR(vbl);
-				vbl.request.type =
-					DRM_VBLANK_RELATIVE |
-					DRM_VBLANK_EVENT;
-				vbl.request.sequence = 1;
-				vbl.request.signal = (uintptr_t)info;
-				ret = !sna_wait_vblank(sna, &vbl, info->pipe);
-				if (ret)
-					event = !swap_limit(draw, 2);
-			}
-			if (event) {
-				DBG(("%s: fake triple buffering, unblocking client\n", __FUNCTION__));
-				frame_swap_complete(info, DRI2_BLIT_COMPLETE);
-			}
+		__sna_dri2_copy_event(info, sync);
+
+		VG_CLEAR(vbl);
+		vbl.request.type =
+			DRM_VBLANK_RELATIVE |
+			DRM_VBLANK_EVENT;
+		vbl.request.sequence = 1;
+		vbl.request.signal = (uintptr_t)info;
+		if ((!swap_limit(draw, 2 + !sync) && !sync) ||
+		    sna_wait_vblank(sna, &vbl, info->pipe)) {
+			DBG(("%s: fake triple buffering, unblocking client\n", __FUNCTION__));
+			frame_swap_complete(info, DRI2_BLIT_COMPLETE);
+			sna_dri2_event_free(info);
 		}
-	} else {
-		DBG(("%s: pending blit, chained\n", __FUNCTION__));
-		ret = true;
+		return;
 	}
 
-	DBG(("%s: continue? %d\n", __FUNCTION__, ret));
-	return ret;
+	if (chain->chain != info && chain->chain->type == SWAP_THROTTLE) {
+		struct sna_dri2_event *tmp = chain->chain;
+
+		assert(!tmp->queued);
+
+		assert(info->chain == NULL);
+		info->chain = tmp->chain;
+		if (info->chain == info)
+			info->chain = NULL;
+		chain->chain = info;
+		tmp->chain = NULL;
+
+		DBG(("%s: swap elision, unblocking client\n", __FUNCTION__));
+		frame_swap_complete(tmp, DRI2_BLIT_COMPLETE);
+
+		tmp->draw = NULL;
+		sna_dri2_event_free(tmp);
+	}
+
+	DBG(("%s: pending blit, chained\n", __FUNCTION__));
 }
 
 static bool
 sna_dri2_flip_continue(struct sna_dri2_event *info)
 {
-	DBG(("%s(mode=%d)\n", __FUNCTION__, info->mode));
+	struct kgem_bo *bo = get_private(info->front)->bo;
 
-	if (info->mode > 0){
-		struct kgem_bo *bo = get_private(info->front)->bo;
+	DBG(("%s(mode=%d)\n", __FUNCTION__, info->flip_continue));
+	assert(info->flip_continue > 0);
 
-		info->type = info->mode;
+	info->type = info->flip_continue;
 
-		if (bo != sna_pixmap(info->sna->front)->gpu_bo)
-			return false;
+	if (info->sna->mode.front_active == 0)
+		return false;
 
-		if (!sna_page_flip(info->sna, bo, sna_dri2_flip_handler, info))
-			return false;
+	if (bo != sna_pixmap(info->sna->front)->gpu_bo)
+		return false;
 
-		assert(info->sna->dri2.flip_pending == NULL ||
-		       info->sna->dri2.flip_pending == info);
-		info->sna->dri2.flip_pending = info;
-		assert(info->queued);
-	} else {
-		info->type = -info->mode;
+	if (!sna_page_flip(info->sna, bo, sna_dri2_flip_handler, info))
+		return false;
 
-		if (!info->draw)
-			return false;
+	assert(info->sna->dri2.flip_pending == NULL ||
+	       info->sna->dri2.flip_pending == info);
+	info->sna->dri2.flip_pending = info;
+	assert(info->queued);
 
-		if (!can_flip(info->sna, info->draw, info->front, info->back, info->crtc))
-			return false;
+	info->flip_continue = 0;
+	return true;
+}
 
-		assert(sna_pixmap_get_buffer(get_drawable_pixmap(info->draw)) == info->front);
-		if (!sna_dri2_flip(info))
-			return false;
+static bool
+sna_dri2_flip_keepalive(struct sna_dri2_event *info)
+{
+	DBG(("%s(keepalive?=%d)\n", __FUNCTION__, info->keepalive-1));
+	assert(info->keepalive > 0);
+	if (!--info->keepalive)
+		return false;
 
-		if (!xorg_can_triple_buffer()) {
-			sna_dri2_get_back(info->sna, info->draw, info->back, info);
-			DBG(("%s: fake triple buffering, unblocking client\n", __FUNCTION__));
-			frame_swap_complete(info, DRI2_FLIP_COMPLETE);
-		}
-	}
+	if (info->draw == NULL)
+		return false;
 
-	info->mode = 0;
-	return true;
+	info->flip_continue = FLIP_COMPLETE;
+	return sna_dri2_flip_continue(info);
 }
 
 static void chain_flip(struct sna *sna)
@@ -2332,8 +2447,8 @@ static void chain_flip(struct sna *sna)
 	struct sna_dri2_event *chain = sna->dri2.flip_pending;
 
 	assert(chain->type == FLIP);
-	DBG(("%s: chaining type=%d, cancelled?=%d\n",
-	     __FUNCTION__, chain->type, chain->draw == NULL));
+	DBG(("%s: chaining type=%d, cancelled?=%d window=%ld\n",
+	     __FUNCTION__, chain->type, chain->draw == NULL, chain->draw ? chain->draw->id : 0));
 
 	sna->dri2.flip_pending = NULL;
 	if (chain->draw == NULL) {
@@ -2350,9 +2465,7 @@ static void chain_flip(struct sna *sna)
 		DBG(("%s: performing chained flip\n", __FUNCTION__));
 	} else {
 		DBG(("%s: emitting chained vsync'ed blit\n", __FUNCTION__));
-		chain->bo = __sna_dri2_copy_region(sna, chain->draw, NULL,
-						  chain->back, chain->front,
-						  true);
+		__sna_dri2_copy_event(chain, DRI2_SYNC);
 
 		if (xorg_can_triple_buffer()) {
 			union drm_wait_vblank vbl;
@@ -2403,23 +2516,30 @@ static void sna_dri2_flip_event(struct sna_dri2_event *flip)
 		frame_swap_complete(flip, DRI2_FLIP_COMPLETE);
 	case FLIP_COMPLETE:
 		if (sna->dri2.flip_pending) {
+			DBG(("%s: pending flip\n", __FUNCTION__));
 			sna_dri2_event_free(flip);
 			chain_flip(sna);
-		} else if (!flip->mode) {
+		} else if (!flip->flip_continue) {
 			DBG(("%s: flip chain complete\n", __FUNCTION__));
+			if (!sna_dri2_flip_keepalive(flip)) {
+				if (flip->chain) {
+					sna_dri2_remove_event((WindowPtr)flip->draw,
+							      flip);
+					chain_swap(flip->chain);
+					flip->draw = NULL;
+				}
 
-			if (flip->chain) {
-				sna_dri2_remove_event((WindowPtr)flip->draw,
-						      flip);
-				chain_swap(flip->chain);
-				flip->draw = NULL;
+				sna_dri2_event_free(flip);
 			}
-
-			sna_dri2_event_free(flip);
 		} else if (!sna_dri2_flip_continue(flip)) {
 			DBG(("%s: no longer able to flip\n", __FUNCTION__));
-			if (flip->draw == NULL || !sna_dri2_immediate_blit(sna, flip, false, flip->mode < 0))
-				sna_dri2_event_free(flip);
+			if (flip->draw != NULL)
+				__sna_dri2_copy_event(flip, 0);
+			if (flip->flip_continue == FLIP_COMPLETE) {
+				DBG(("%s: fake triple buffering, unblocking client\n", __FUNCTION__));
+				frame_swap_complete(flip, DRI2_BLIT_COMPLETE);
+			}
+			sna_dri2_event_free(flip);
 		}
 		break;
 
@@ -2494,12 +2614,18 @@ static int use_triple_buffer(struct sna *sna, ClientPtr client, bool async)
 }
 
 static bool immediate_swap(struct sna *sna,
-			   uint64_t target_msc,
-			   uint64_t divisor,
 			   DrawablePtr draw,
 			   xf86CrtcPtr crtc,
+			   uint64_t *target_msc,
+			   uint64_t divisor,
+			   uint64_t remainder,
 			   uint64_t *current_msc)
 {
+	/*
+	 * If divisor is zero, or current_msc is smaller than target_msc
+	 * we just need to make sure target_msc passes before initiating
+	 * the swap.
+	 */
 	if (divisor == 0) {
 		*current_msc = -1;
 
@@ -2508,62 +2634,73 @@ static bool immediate_swap(struct sna *sna,
 			return true;
 		}
 
-		if (target_msc)
+		if (*target_msc)
 			*current_msc = get_current_msc(sna, draw, crtc);
 
 		DBG(("%s: current_msc=%ld, target_msc=%ld -- %s\n",
-		     __FUNCTION__, (long)*current_msc, (long)target_msc,
-		     (*current_msc >= target_msc - 1) ? "yes" : "no"));
-		return *current_msc >= target_msc - 1;
+		     __FUNCTION__, (long)*current_msc, (long)*target_msc,
+		     (*current_msc >= *target_msc - 1) ? "yes" : "no"));
+		return *current_msc >= *target_msc - 1;
 	}
 
 	DBG(("%s: explicit waits requests, divisor=%ld\n",
 	     __FUNCTION__, (long)divisor));
 	*current_msc = get_current_msc(sna, draw, crtc);
-	return false;
+	if (*current_msc >= *target_msc) {
+		DBG(("%s: missed target, queueing event for next: current=%lld, target=%lld, divisor=%lld, remainder=%lld\n",
+		     __FUNCTION__,
+		     (long long)*current_msc,
+		     (long long)*target_msc,
+		     (long long)divisor,
+		     (long long)remainder));
+
+		*target_msc = *current_msc + remainder - *current_msc % divisor;
+		if (*target_msc <= *current_msc)
+			*target_msc += divisor;
+	}
+
+	DBG(("%s: target_msc=%lld, current_msc=%lld, immediate?=%d\n",
+	     __FUNCTION__, (long long)*target_msc, (long long)*current_msc,
+	     *current_msc >= *target_msc - 1));
+	return *current_msc >= *target_msc - 1;
 }
 
 static bool
 sna_dri2_schedule_flip(ClientPtr client, DrawablePtr draw, xf86CrtcPtr crtc,
 		       DRI2BufferPtr front, DRI2BufferPtr back,
-		       CARD64 *target_msc, CARD64 divisor, CARD64 remainder,
+		       bool immediate, CARD64 *target_msc, CARD64 current_msc,
 		       DRI2SwapEventPtr func, void *data)
 {
 	struct sna *sna = to_sna_from_drawable(draw);
 	struct sna_dri2_event *info;
-	uint64_t current_msc;
-
-	if (immediate_swap(sna, *target_msc, divisor, draw, crtc, &current_msc)) {
-		int type;
 
+	if (immediate) {
 		info = sna->dri2.flip_pending;
 		DBG(("%s: performing immediate swap on pipe %d, pending? %d, mode: %d, continuation? %d\n",
 		     __FUNCTION__, sna_crtc_to_pipe(crtc),
-		     info != NULL, info ? info->mode : 0,
+		     info != NULL, info ? info->flip_continue : 0,
 		     info && info->draw == draw));
 
 		if (info && info->draw == draw) {
 			assert(info->type != FLIP);
 			assert(info->front == front);
+			assert(info->queued);
 			if (info->back != back) {
 				_sna_dri2_destroy_buffer(sna, info->back);
 				info->back = sna_dri2_reference_buffer(back);
 			}
-			if (info->mode || current_msc >= *target_msc) {
-				DBG(("%s: executing xchg of pending flip\n",
-				     __FUNCTION__));
-				sna_dri2_xchg(draw, front, back);
-				info->mode = type = FLIP_COMPLETE;
-				goto new_back;
-			} else {
+			DBG(("%s: executing xchg of pending flip: flip_continue=%d, keepalive=%d\n", __FUNCTION__, info->flip_continue, info->keepalive));
+			sna_dri2_xchg(draw, front, back);
+			info->keepalive++;
+			if (xorg_can_triple_buffer() &&
+			    !info->flip_continue &&
+			    current_msc < *target_msc) {
 				DBG(("%s: chaining flip\n", __FUNCTION__));
-				type = FLIP_THROTTLE;
-				if (xorg_can_triple_buffer())
-					info->mode = -type;
-				else
-					info->mode = -FLIP_COMPLETE;
+				info->type = FLIP_THROTTLE;
+				info->flip_continue = FLIP_COMPLETE;
 				goto out;
-			}
+			} else
+				goto new_back;
 		}
 
 		info = sna_dri2_add_event(sna, draw, client);
@@ -2584,21 +2721,26 @@ sna_dri2_schedule_flip(ClientPtr client, DrawablePtr draw, xf86CrtcPtr crtc,
 			 */
 			DBG(("%s: queueing flip after pending completion\n",
 			     __FUNCTION__));
-			info->type = type = FLIP;
+			info->type = FLIP;
 			sna->dri2.flip_pending = info;
-			assert(info->queued);
+			assert(!info->queued);
 			current_msc++;
+		} else if (sna->mode.flip_active) {
+			DBG(("%s: %d outstanding flips from old client, queueing\n",
+			     __FUNCTION__, sna->mode.flip_active));
+			goto queue;
 		} else {
-			info->type = type = use_triple_buffer(sna, client, *target_msc == 0);
+			info->type = use_triple_buffer(sna, client, *target_msc == 0);
 			if (!sna_dri2_flip(info)) {
 				DBG(("%s: flip failed, falling back\n", __FUNCTION__));
 				sna_dri2_event_free(info);
 				return false;
 			}
+			assert(get_private(info->front)->bo->active_scanout);
 		}
 
-		swap_limit(draw, 1 + (type == FLIP_THROTTLE));
-		if (type >= FLIP_COMPLETE) {
+		swap_limit(draw, 1 + (info->type == FLIP_THROTTLE));
+		if (info->type >= FLIP_COMPLETE) {
 new_back:
 			if (!xorg_can_triple_buffer())
 				sna_dri2_get_back(sna, draw, back, info);
@@ -2613,6 +2755,12 @@ out:
 		return true;
 	}
 
+queue:
+	if (KEEPALIVE > 1 && sna->dri2.flip_pending) {
+		info = sna->dri2.flip_pending;
+		info->keepalive = 1;
+	}
+
 	info = sna_dri2_add_event(sna, draw, client);
 	if (info == NULL)
 		return false;
@@ -2625,29 +2773,7 @@ out:
 	info->front = sna_dri2_reference_buffer(front);
 	info->back = sna_dri2_reference_buffer(back);
 
-	/*
-	 * If divisor is zero, or current_msc is smaller than target_msc
-	 * we just need to make sure target_msc passes before initiating
-	 * the swap.
-	 */
-	if (divisor && current_msc >= *target_msc) {
-		DBG(("%s: missed target, queueing event for next: current=%lld, target=%lld, divisor=%lld, remainder=%lld\n",
-		     __FUNCTION__,
-		     (long long)current_msc,
-		     (long long)*target_msc,
-		     (long long)divisor,
-		     (long long)remainder));
-
-		*target_msc = current_msc + remainder - current_msc % divisor;
-		if (*target_msc <= current_msc)
-			*target_msc += divisor;
-	}
-
-	if (*target_msc <= current_msc + 1) {
-		if (!sna_dri2_flip(info)) {
-			sna_dri2_event_free(info);
-			return false;
-		}
+	if (*target_msc <= current_msc + 1 && sna_dri2_flip(info)) {
 		*target_msc = current_msc + 1;
 	} else {
 		union drm_wait_vblank vbl;
@@ -2677,18 +2803,18 @@ out:
 static bool
 sna_dri2_schedule_xchg(ClientPtr client, DrawablePtr draw, xf86CrtcPtr crtc,
 		       DRI2BufferPtr front, DRI2BufferPtr back,
-		       CARD64 *target_msc, CARD64 divisor, CARD64 remainder,
+		       bool immediate, CARD64 *target_msc, CARD64 current_msc,
 		       DRI2SwapEventPtr func, void *data)
 {
 	struct sna *sna = to_sna_from_drawable(draw);
-	uint64_t current_msc;
 	bool sync, event;
 
-	if (!immediate_swap(sna, *target_msc, divisor, draw, crtc, &current_msc))
+	if (!immediate)
 		return false;
 
-	sync = current_msc < *target_msc;
+	sync = current_msc < *target_msc && xorg_can_triple_buffer();
 	event = dri2_chain(draw) == NULL;
+	DBG(("%s: synchronous?=%d, send-event?=%d\n", __FUNCTION__, sync, event));
 	if (!sync || event) {
 		DBG(("%s: performing immediate xchg on pipe %d\n",
 		     __FUNCTION__, sna_crtc_to_pipe(crtc)));
@@ -2720,6 +2846,7 @@ sna_dri2_schedule_xchg(ClientPtr client, DrawablePtr draw, xf86CrtcPtr crtc,
 
 			info->queued = true;
 			if (sna_wait_vblank(sna, &vbl, info->pipe)) {
+				DBG(("%s: vblank queue failed, unblocking client\n", __FUNCTION__));
 				sna_dri2_event_free(info);
 				goto complete;
 			}
@@ -2738,18 +2865,18 @@ complete:
 static bool
 sna_dri2_schedule_xchg_crtc(ClientPtr client, DrawablePtr draw, xf86CrtcPtr crtc,
 			    DRI2BufferPtr front, DRI2BufferPtr back,
-			    CARD64 *target_msc, CARD64 divisor, CARD64 remainder,
+			    bool immediate, CARD64 *target_msc, CARD64 current_msc,
 			    DRI2SwapEventPtr func, void *data)
 {
 	struct sna *sna = to_sna_from_drawable(draw);
-	uint64_t current_msc;
 	bool sync, event;
 
-	if (!immediate_swap(sna, *target_msc, divisor, draw, crtc, &current_msc))
+	if (!immediate)
 		return false;
 
-	sync = current_msc < *target_msc;
+	sync = current_msc < *target_msc && xorg_can_triple_buffer();
 	event = dri2_chain(draw) == NULL;
+	DBG(("%s: synchronous?=%d, send-event?=%d\n", __FUNCTION__, sync, event));
 	if (!sync || event) {
 		DBG(("%s: performing immediate xchg only on pipe %d\n",
 		     __FUNCTION__, sna_crtc_to_pipe(crtc)));
@@ -2781,6 +2908,7 @@ sna_dri2_schedule_xchg_crtc(ClientPtr client, DrawablePtr draw, xf86CrtcPtr crtc
 
 			info->queued = true;
 			if (sna_wait_vblank(sna, &vbl, info->pipe)) {
+				DBG(("%s: vblank queue failed, unblocking client\n", __FUNCTION__));
 				sna_dri2_event_free(info);
 				goto complete;
 			}
@@ -2835,6 +2963,7 @@ sna_dri2_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 	struct sna_dri2_event *info = NULL;
 	int type = DRI2_EXCHANGE_COMPLETE;
 	CARD64 current_msc;
+	bool immediate;
 
 	DBG(("%s: draw=%lu %dx%d, pixmap=%ld %dx%d, back=%u (refs=%d/%d, flush=%d) , front=%u (refs=%d/%d, flush=%d)\n",
 	     __FUNCTION__,
@@ -2914,21 +3043,25 @@ sna_dri2_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 		sna_mode_wakeup(sna);
 	}
 
+	immediate = immediate_swap(sna, draw, crtc,
+				   target_msc, divisor, remainder,
+				   &current_msc);
+
 	if (can_xchg(sna, draw, front, back) &&
 	    sna_dri2_schedule_xchg(client, draw, crtc, front, back,
-				   target_msc, divisor, remainder,
+				   immediate, target_msc, current_msc,
 				   func, data))
 		return TRUE;
 
 	if (can_xchg_crtc(sna, draw, front, back, crtc) &&
 	    sna_dri2_schedule_xchg_crtc(client, draw, crtc, front, back,
-					target_msc, divisor, remainder,
+					immediate, target_msc, current_msc,
 					func, data))
 		return TRUE;
 
 	if (can_flip(sna, draw, front, back, crtc) &&
 	    sna_dri2_schedule_flip(client, draw, crtc, front, back,
-				  target_msc, divisor, remainder,
+				  immediate, target_msc, current_msc,
 				  func, data))
 		return TRUE;
 
@@ -2945,10 +3078,9 @@ sna_dri2_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 	info->front = sna_dri2_reference_buffer(front);
 	info->back = sna_dri2_reference_buffer(back);
 
-	if (immediate_swap(sna, *target_msc, divisor, draw, crtc, &current_msc)) {
+	if (immediate) {
 		bool sync = current_msc < *target_msc;
-		if (!sna_dri2_immediate_blit(sna, info, sync, true))
-			sna_dri2_event_free(info);
+		sna_dri2_immediate_blit(sna, info, sync);
 		*target_msc = current_msc + sync;
 		return TRUE;
 	}
@@ -2958,32 +3090,13 @@ sna_dri2_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 		DRM_VBLANK_EVENT;
 	vbl.request.signal = (uintptr_t)info;
 
-	/*
-	 * If divisor is zero, or current_msc is smaller than target_msc
-	 * we just need to make sure target_msc passes before initiating
-	 * the swap.
-	 */
 	info->type = SWAP;
 	info->queued = true;
-	if (divisor && current_msc >= *target_msc) {
-		DBG(("%s: missed target, queueing event for next: current=%lld, target=%lld, divisor=%lld, remainder=%lld\n",
-		     __FUNCTION__,
-		     (long long)current_msc,
-		     (long long)*target_msc,
-		     (long long)divisor,
-		     (long long)remainder));
-
-		*target_msc = current_msc + remainder - current_msc % divisor;
-		if (*target_msc <= current_msc)
-			*target_msc += divisor;
-	}
 	vbl.request.sequence = draw_target_seq(draw, *target_msc - 1);
 	if (*target_msc <= current_msc + 1) {
 		DBG(("%s: performing blit before queueing\n", __FUNCTION__));
 		assert(info->queued);
-		info->bo = __sna_dri2_copy_region(sna, draw, NULL,
-						  back, front,
-						  true);
+		__sna_dri2_copy_event(info, DRI2_SYNC);
 		info->type = SWAP_WAIT;
 
 		vbl.request.type =
@@ -3008,13 +3121,14 @@ blit:
 	if (can_xchg(sna, draw, front, back)) {
 		sna_dri2_xchg(draw, front, back);
 	} else {
-		__sna_dri2_copy_region(sna, draw, NULL, back, front, false);
+		__sna_dri2_copy_region(sna, draw, NULL, back, front, 0);
+		front->flags = back->flags;
 		type = DRI2_BLIT_COMPLETE;
 	}
 skip:
 	DBG(("%s: unable to show frame, unblocking client\n", __FUNCTION__));
 	if (crtc == NULL)
-		crtc = sna_mode_first_crtc(sna);
+		crtc = sna_primary_crtc(sna);
 	fake_swap_complete(sna, client, draw, crtc, type, func, data);
 	*target_msc = 0; /* offscreen, so zero out target vblank count */
 	return TRUE;
@@ -3044,13 +3158,13 @@ sna_dri2_get_msc(DrawablePtr draw, CARD64 *ust, CARD64 *msc)
 			sna_crtc_record_vblank(crtc, &vbl);
 	} else
 		/* Drawable not displayed, make up a *monotonic* value */
-		crtc = sna_mode_first_crtc(sna);
+		crtc = sna_primary_crtc(sna);
 
 	swap = sna_crtc_last_swap(crtc);
 	*msc = draw_current_msc(draw, crtc, swap->msc);
 	*ust = ust64(swap->tv_sec, swap->tv_usec);
-	DBG(("%s: msc=%llu, ust=%llu\n", __FUNCTION__,
-	     (long long)*msc, (long long)*ust));
+	DBG(("%s: msc=%llu [raw=%llu], ust=%llu\n", __FUNCTION__,
+	     (long long)*msc, swap->msc, (long long)*ust));
 	return TRUE;
 }
 
@@ -3142,7 +3256,7 @@ out_free_info:
 	sna_dri2_event_free(info);
 out_complete:
 	if (crtc == NULL)
-		crtc = sna_mode_first_crtc(sna);
+		crtc = sna_primary_crtc(sna);
 	swap = sna_crtc_last_swap(crtc);
 	DRI2WaitMSCComplete(client, draw,
 			    draw_current_msc(draw, crtc, swap->msc),
@@ -3231,9 +3345,18 @@ static bool is_level(const char **str)
 	return false;
 }
 
+static const char *options_get_dri(struct sna *sna)
+{
+#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,7,99,901,0)
+	return xf86GetOptValString(sna->Options, OPTION_DRI);
+#else
+	return NULL;
+#endif
+}
+
 static const char *dri_driver_name(struct sna *sna)
 {
-	const char *s = xf86GetOptValString(sna->Options, OPTION_DRI);
+	const char *s = options_get_dri(sna);
 
 	if (is_level(&s)) {
 		if (sna->kgem.gen < 030)
@@ -3259,7 +3382,7 @@ bool sna_dri2_open(struct sna *sna, ScreenPtr screen)
 
 	if (wedged(sna)) {
 		xf86DrvMsg(sna->scrn->scrnIndex, X_WARNING,
-			   "loading DRI2 whilst the GPU is wedged.\n");
+			   "loading DRI2 whilst acceleration is disabled.\n");
 	}
 
 	if (xf86LoaderCheckSymbol("DRI2Version"))
@@ -3274,7 +3397,7 @@ bool sna_dri2_open(struct sna *sna, ScreenPtr screen)
 	memset(&info, '\0', sizeof(info));
 	info.fd = sna->kgem.fd;
 	info.driverName = dri_driver_name(sna);
-	info.deviceName = intel_get_client_name(sna->dev);
+	info.deviceName = intel_get_master_name(sna->dev);
 
 	DBG(("%s: loading dri driver '%s' [gen=%d] for device '%s'\n",
 	     __FUNCTION__, info.driverName, sna->kgem.gen, info.deviceName));
@@ -3299,11 +3422,12 @@ bool sna_dri2_open(struct sna *sna, ScreenPtr screen)
 	info.numDrivers = 2;
 	info.driverNames = driverNames;
 	driverNames[0] = info.driverName;
-	driverNames[1] = info.driverName;
+	driverNames[1] = "va_gl";
 #endif
 
 #if DRI2INFOREC_VERSION >= 6
 	if (xorg_can_triple_buffer()) {
+		DBG(("%s: enabling Xorg triple buffering\n", __FUNCTION__));
 		info.version = 6;
 		info.SwapLimitValidate = sna_dri2_swap_limit_validate;
 		info.ReuseBufferNotify = sna_dri2_reuse_buffer;
@@ -3311,8 +3435,10 @@ bool sna_dri2_open(struct sna *sna, ScreenPtr screen)
 #endif
 
 #if USE_ASYNC_SWAP
+	DBG(("%s: enabled async swap and buffer age\n", __FUNCTION__));
 	info.version = 10;
 	info.scheduleSwap0 = 1;
+	info.bufferAge = 1;
 #endif
 
 	return DRI2ScreenInit(screen, &info);
diff --git a/src/sna/sna_driver.c b/src/sna/sna_driver.c
index 8a3599c..5092bfb 100644
--- a/src/sna/sna_driver.c
+++ b/src/sna/sna_driver.c
@@ -57,6 +57,13 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include <mi.h>
 #include <micmap.h>
 
+#if defined(HAVE_X11_EXTENSIONS_DPMSCONST_H)
+#include <X11/extensions/dpmsconst.h>
+#else
+#define DPMSModeOn 0
+#define DPMSModeOff 3
+#endif
+
 #include <sys/ioctl.h>
 #include <sys/fcntl.h>
 #include <sys/poll.h>
@@ -69,6 +76,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #if HAVE_DOT_GIT
 #include "git_version.h"
+#else
+#define git_version "not compiled from git"
 #endif
 
 #ifdef TEARFREE
@@ -185,8 +194,8 @@ sna_set_fallback_mode(ScrnInfoPtr scrn)
 
 	xf86DisableUnusedFunctions(scrn);
 #ifdef RANDR_12_INTERFACE
-	if (get_root_window(scrn->pScreen))
-		xf86RandR12TellChanged(scrn->pScreen);
+	if (get_root_window(xf86ScrnToScreen(scrn)))
+		xf86RandR12TellChanged(xf86ScrnToScreen(scrn));
 #endif
 }
 
@@ -222,7 +231,7 @@ static Bool sna_create_screen_resources(ScreenPtr screen)
 	     screen->width, screen->height, screen->rootDepth));
 
 	assert(sna->scrn == xf86ScreenToScrn(screen));
-	assert(sna->scrn->pScreen == screen);
+	assert(to_screen_from_sna(sna) == screen);
 
 	/* free the data used during miInitScreen */
 	free(screen->devPrivate);
@@ -279,27 +288,83 @@ static Bool sna_create_screen_resources(ScreenPtr screen)
 	return TRUE;
 }
 
-static Bool sna_save_screen(ScreenPtr screen, int mode)
+static void sna_dpms_set(ScrnInfoPtr scrn, int mode, int flags)
 {
-	ScrnInfoPtr scrn = xf86ScreenToScrn(screen);
+	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(scrn);
+	struct sna *sna = to_sna(scrn);
+	bool changed = false;
+	int i;
 
-	DBG(("%s(mode=%d)\n", __FUNCTION__, mode));
+	DBG(("%s(mode=%d, flags=%d), vtSema=%d => off?=%d\n",
+	     __FUNCTION__, mode, flags, scrn->vtSema, mode!=DPMSModeOn));
 	if (!scrn->vtSema)
-		return FALSE;
+		return;
 
-	xf86SaveScreen(screen, mode);
-	sna_crtc_config_notify(screen);
-	return TRUE;
+	/* Opencoded version of xf86DPMSSet().
+	 *
+	 * The principal difference is to skip calling crtc->dpms() when
+	 * turning off the display. This (on recent enough kernels at
+	 * least) should be equivalent in power consumption, but require
+	 * less work (hence quicker and less likely to fail) when switching
+	 * back on.
+	 */
+	if (mode != DPMSModeOn) {
+		if (sna->mode.hidden == 0) {
+			DBG(("%s: hiding %d outputs\n",
+			     __FUNCTION__, config->num_output));
+			for (i = 0; i < config->num_output; i++) {
+				xf86OutputPtr output = config->output[i];
+				if (output->crtc != NULL)
+					output->funcs->dpms(output, mode);
+			}
+			sna->mode.hidden = sna->mode.front_active + 1;
+			sna->mode.front_active = 0;
+			changed = true;
+		}
+	} else {
+		/* Re-enable CRTC that have been forced off via other means */
+		if (sna->mode.hidden != 0) {
+			DBG(("%s: unhiding %d crtc, %d outputs\n",
+			     __FUNCTION__, config->num_crtc, config->num_output));
+			sna->mode.front_active = sna->mode.hidden - 1;
+			sna->mode.hidden = 0;
+			for (i = 0; i < config->num_crtc; i++) {
+				xf86CrtcPtr crtc = config->crtc[i];
+				if (crtc->enabled)
+					crtc->funcs->dpms(crtc, mode);
+			}
+
+			for (i = 0; i < config->num_output; i++) {
+				xf86OutputPtr output = config->output[i];
+				if (output->crtc != NULL)
+					output->funcs->dpms(output, mode);
+			}
+			changed = true;
+		}
+	}
+
+	DBG(("%s: hiding outputs? %d, front active? %d, changed? %d\n",
+	     __FUNCTION__, sna->mode.hidden, sna->mode.front_active, changed));
+
+	if (changed)
+		sna_crtc_config_notify(xf86ScrnToScreen(scrn));
 }
 
-static void sna_dpms_set(ScrnInfoPtr scrn, int mode, int flags)
+static Bool sna_save_screen(ScreenPtr screen, int mode)
 {
-	DBG(("%s(mode=%d, flags=%d)\n", __FUNCTION__, mode));
-	if (!scrn->vtSema)
-		return;
+	ScrnInfoPtr scrn = xf86ScreenToScrn(screen);
+
+	DBG(("%s(mode=%d [unblank=%d])\n",
+	     __FUNCTION__, mode, xf86IsUnblank(mode)));
 
-	xf86DPMSSet(scrn, mode, flags);
-	sna_crtc_config_notify(xf86ScrnToScreen(scrn));
+	/* We have to unroll xf86SaveScreen() here as it is called
+	 * by DPMSSet(), nullifying our special handling of crtc->dpms()
+	 * in sna_dpms_set().
+	 */
+	sna_dpms_set(scrn,
+		     xf86IsUnblank(mode) ? DPMSModeOn : DPMSModeOff,
+		     0);
+	return TRUE;
 }
 
 static void sna_selftest(void)
@@ -330,107 +395,6 @@ static void sna_setup_capabilities(ScrnInfoPtr scrn, int fd)
 #endif
 }
 
-static int
-namecmp(const char *s1, const char *s2)
-{
-	char c1, c2;
-
-	if (!s1 || *s1 == 0) {
-		if (!s2 || *s2 == 0)
-			return 0;
-		else
-			return 1;
-	}
-
-	while (*s1 == '_' || *s1 == ' ' || *s1 == '\t')
-		s1++;
-
-	while (*s2 == '_' || *s2 == ' ' || *s2 == '\t')
-		s2++;
-
-	c1 = isupper(*s1) ? tolower(*s1) : *s1;
-	c2 = isupper(*s2) ? tolower(*s2) : *s2;
-	while (c1 == c2) {
-		if (c1 == '\0')
-			return 0;
-
-		s1++;
-		while (*s1 == '_' || *s1 == ' ' || *s1 == '\t')
-			s1++;
-
-		s2++;
-		while (*s2 == '_' || *s2 == ' ' || *s2 == '\t')
-			s2++;
-
-		c1 = isupper(*s1) ? tolower(*s1) : *s1;
-		c2 = isupper(*s2) ? tolower(*s2) : *s2;
-	}
-
-	return c1 - c2;
-}
-
-static Bool sna_option_cast_to_bool(struct sna *sna, int id, Bool val)
-{
-	const char *str = xf86GetOptValString(sna->Options, id);
-
-	if (str == NULL)
-		return val;
-
-	if (*str == '\0')
-		return TRUE;
-
-	if (namecmp(str, "1") == 0)
-		return TRUE;
-	if (namecmp(str, "on") == 0)
-		return TRUE;
-	if (namecmp(str, "true") == 0)
-		return TRUE;
-	if (namecmp(str, "yes") == 0)
-		return TRUE;
-
-	if (namecmp(str, "0") == 0)
-		return FALSE;
-	if (namecmp(str, "off") == 0)
-		return FALSE;
-	if (namecmp(str, "false") == 0)
-		return FALSE;
-	if (namecmp(str, "no") == 0)
-		return FALSE;
-
-	return val;
-}
-
-static unsigned sna_option_cast_to_unsigned(struct sna *sna, int id, unsigned val)
-{
-	const char *str = xf86GetOptValString(sna->Options, id);
-	unsigned v;
-
-	if (str == NULL || *str == '\0')
-		return val;
-
-	if (namecmp(str, "on") == 0)
-		return val;
-	if (namecmp(str, "true") == 0)
-		return val;
-	if (namecmp(str, "yes") == 0)
-		return val;
-
-	if (namecmp(str, "0") == 0)
-		return 0;
-	if (namecmp(str, "off") == 0)
-		return 0;
-	if (namecmp(str, "false") == 0)
-		return 0;
-	if (namecmp(str, "no") == 0)
-		return 0;
-
-	v = atoi(str);
-	if (v)
-		return v;
-
-	return val;
-}
-
 static Bool fb_supports_depth(int fd, int depth)
 {
 	struct drm_i915_gem_create create;
@@ -472,7 +436,7 @@ static void setup_dri(struct sna *sna)
 	sna->dri2.available = false;
 	sna->dri3.available = false;
 
-	level = sna_option_cast_to_unsigned(sna, OPTION_DRI, ~0);
+	level = intel_option_cast_to_unsigned(sna->Options, OPTION_DRI, DEFAULT_DRI_LEVEL);
 #if HAVE_DRI3
 	if (level >= 3)
 		sna->dri3.available = !!xf86LoadSubModule(sna->scrn, "dri3");
@@ -612,8 +576,10 @@ static Bool sna_pre_init(ScrnInfoPtr scrn, int probe)
 	}
 
 	intel_detect_chipset(scrn, sna->dev);
-	xf86DrvMsg(scrn->scrnIndex, X_PROBED, "CPU: %s\n",
-		   sna_cpu_features_to_string(sna->cpu_features, buf));
+	xf86DrvMsg(scrn->scrnIndex, X_PROBED,
+		   "CPU: %s; using a maximum of %d threads\n",
+		   sna_cpu_features_to_string(sna->cpu_features, buf),
+		   sna_use_threads(64*1024, 64*1024, 1));
 
 	if (!xf86SetDepthBpp(scrn, 24, 0, 0,
 			     Support32bppFb |
@@ -651,12 +617,6 @@ static Bool sna_pre_init(ScrnInfoPtr scrn, int probe)
 	kgem_init(&sna->kgem, fd,
 		  xf86GetPciInfoForEntity(pEnt->index),
 		  sna->info->gen);
-	if (xf86ReturnOptValBool(sna->Options, OPTION_ACCEL_DISABLE, FALSE) ||
-	    !sna_option_cast_to_bool(sna, OPTION_ACCEL_METHOD, TRUE)) {
-		xf86DrvMsg(sna->scrn->scrnIndex, X_CONFIG,
-			   "Disabling hardware acceleration.\n");
-		sna->kgem.wedged = true;
-	}
 
 	if (xf86ReturnOptValBool(sna->Options, OPTION_TILING_FB, FALSE))
 		sna->flags |= SNA_LINEAR_FB;
@@ -748,7 +708,7 @@ sna_block_handler(BLOCKHANDLER_ARGS_DECL)
 	sna->BlockHandler(BLOCKHANDLER_ARGS);
 
 	if (*tv == NULL || ((*tv)->tv_usec | (*tv)->tv_sec) || has_shadow(sna))
-		sna_accel_block_handler(sna, tv);
+		sna_accel_block(sna, tv);
 }
 
 static void
@@ -770,8 +730,6 @@ sna_wakeup_handler(WAKEUPHANDLER_ARGS_DECL)
 
 	sna->WakeupHandler(WAKEUPHANDLER_ARGS);
 
-	sna_accel_wakeup_handler(sna);
-
 	if (FD_ISSET(sna->kgem.fd, (fd_set*)read_mask)) {
 		sna_mode_wakeup(sna);
 		/* Clear the flag so that subsequent ZaphodHeads don't block  */
@@ -780,6 +738,8 @@ sna_wakeup_handler(WAKEUPHANDLER_ARGS_DECL)
 }
 
 #if HAVE_UDEV
+#include <sys/stat.h>
+
 static void
 sna_handle_uevents(int fd, void *closure)
 {
@@ -810,7 +770,6 @@ sna_handle_uevents(int fd, void *closure)
 		if (scrn->vtSema) {
 			sna_mode_discover(sna);
 			sna_mode_check(sna);
-			RRGetInfo(xf86ScrnToScreen(scrn), TRUE);
 		} else
 			sna->flags |= SNA_REPROBE;
 	}
@@ -861,7 +820,8 @@ sna_uevent_init(struct sna *sna)
 
 	sna->uevent_monitor = mon;
 out:
-	xf86DrvMsg(sna->scrn->scrnIndex, from, "display hotplug detection %s\n",
+	xf86DrvMsg(sna->scrn->scrnIndex, from,
+		   "Display hotplug detection %s\n",
 		   sna->uevent_monitor ? "enabled" : "disabled");
 	return;
 
@@ -1098,7 +1058,7 @@ sna_screen_init(SCREEN_INIT_ARGS_DECL)
 	DBG(("%s\n", __FUNCTION__));
 
 	assert(sna->scrn == scrn);
-	assert(scrn->pScreen == NULL); /* set afterwards */
+	assert(to_screen_from_sna(sna) == NULL); /* set afterwards */
 
 	assert(sna->freed_pixmap == NULL);
 
@@ -1245,12 +1205,11 @@ static Bool sna_enter_vt(VT_FUNC_ARGS_DECL)
 		return FALSE;
 
 	if (sna->flags & SNA_REPROBE) {
-		DBG(("%s: reporting deferred hotplug event\n",
-		     __FUNCTION__));
+		DBG(("%s: reporting deferred hotplug event\n", __FUNCTION__));
 		sna_mode_discover(sna);
-		RRGetInfo(xf86ScrnToScreen(scrn), TRUE);
 		sna->flags &= ~SNA_REPROBE;
 	}
+	sna_mode_check(sna);
 
 	if (!sna_set_desired_mode(sna)) {
 		intel_put_master(sna->dev);
@@ -1379,6 +1338,9 @@ static void describe_sna(ScrnInfoPtr scrn)
 	xf86DrvMsg(scrn->scrnIndex, X_INFO,
 		   "SNA compiled: %s\n", BUILDER_DESCRIPTION);
 #endif
+#if HAS_DEBUG_FULL
+	ErrorF("SNA compiled with full debug logging; expect to run slowly\n");
+#endif
 #if !NDEBUG
 	xf86DrvMsg(scrn->scrnIndex, X_INFO,
 		   "SNA compiled with assertions enabled\n");
@@ -1400,6 +1362,7 @@ static void describe_sna(ScrnInfoPtr scrn)
 		   "SNA compiled for use with valgrind\n");
 	VALGRIND_PRINTF("SNA compiled for use with valgrind\n");
 #endif
+	DBG(("xf86-video-intel version: %s\n", git_version));
 	DBG(("pixman version: %s\n", pixman_version_string()));
 }
 
diff --git a/src/sna/sna_glyphs.c b/src/sna/sna_glyphs.c
index a5dfb06..6d57e8f 100644
--- a/src/sna/sna_glyphs.c
+++ b/src/sna/sna_glyphs.c
@@ -185,7 +185,7 @@ void sna_glyphs_close(struct sna *sna)
  */
 bool sna_glyphs_create(struct sna *sna)
 {
-	ScreenPtr screen = sna->scrn->pScreen;
+	ScreenPtr screen = to_screen_from_sna(sna);
 	pixman_color_t white = { 0xffff, 0xffff, 0xffff, 0xffff };
 	unsigned int formats[] = {
 		PIXMAN_a8,
diff --git a/src/sna/sna_present.c b/src/sna/sna_present.c
index 6dd6fe8..836c89d 100644
--- a/src/sna/sna_present.c
+++ b/src/sna/sna_present.c
@@ -38,10 +38,16 @@
 static present_screen_info_rec present_info;
 
 struct sna_present_event {
-	uint64_t event_id;
 	xf86CrtcPtr crtc;
+	struct sna *sna;
+	struct list link;
+	uint64_t *event_id;
+	uint64_t target_msc;
+	int n_event_id;
 };
 
+static void sna_present_unflip(ScreenPtr screen, uint64_t event_id);
+
 static inline struct sna_present_event *
 to_present_event(uintptr_t  data)
 {
@@ -52,7 +58,7 @@ to_present_event(uintptr_t  data)
 
 static int pipe_from_crtc(RRCrtcPtr crtc)
 {
-	return crtc ? sna_crtc_to_pipe(crtc->devPrivate) : -1;
+	return crtc ? sna_crtc_to_pipe__safe(crtc->devPrivate) : -1;
 }
 
 static uint32_t pipe_select(int pipe)
@@ -74,6 +80,116 @@ static inline int sna_wait_vblank(struct sna *sna, union drm_wait_vblank *vbl, i
 	return drmIoctl(sna->kgem.fd, DRM_IOCTL_WAIT_VBLANK, vbl);
 }
 
+static uint64_t gettime_ust64(void)
+{
+	struct timespec tv;
+
+	if (clock_gettime(CLOCK_MONOTONIC, &tv))
+		return GetTimeInMicros();
+
+	return ust64(tv.tv_sec, tv.tv_nsec / 1000);
+}
+
+static void vblank_complete(struct sna_present_event *info,
+			    uint64_t ust, uint64_t msc)
+{
+	int n;
+
+	DBG(("%s: %d events complete\n", __FUNCTION__, info->n_event_id));
+	for (n = 0; n < info->n_event_id; n++) { /* notify every event id queued on this record */
+		DBG(("%s: pipe=%d tv=%d.%06d msc=%lld (target=%lld), event=%lld complete%s\n", __FUNCTION__,
+		     sna_crtc_to_pipe(info->crtc),
+		     (int)(ust / 1000000), (int)(ust % 1000000),
+		     (long long)msc, (long long)info->target_msc,
+		     (long long)info->event_id[n],
+		     info->target_msc && msc == (uint32_t)info->target_msc ? "" : ": MISS"));
+		present_event_notify(info->event_id[n], ust, msc);
+	}
+	if (info->n_event_id > 1) /* a single id is stored inline right after the struct, so only a grown array is freed */
+		free(info->event_id);
+	list_del(&info->link); /* unlink before releasing the record itself */
+	free(info);
+}
+
+/* Time to sleep before vblank @target is due, derived from the CRTC's desired
+ * mode and its last recorded swap; callers feed the result to the OS timer. */
+static uint32_t msc_to_delay(xf86CrtcPtr crtc, uint64_t target)
+{
+	const DisplayModeRec *mode = &crtc->desiredMode;
+	const struct ust_msc *swap = sna_crtc_last_swap(crtc);
+	int64_t delay, subframe;
+
+	if (mode->Clock == 0) /* would divide by zero below; report "no wait" */
+		return 0;
+	delay = (target - swap->msc) * mode->VTotal * mode->HTotal / mode->Clock;
+	subframe = (int64_t)(gettime_ust64() - swap_ust(swap)) / 1000;
+	delay = subframe < delay ? delay - subframe : 0;
+
+	DBG(("%s: sleep %d frames, %llu ms\n", __FUNCTION__,
+	     (int)(target - swap->msc), (long long)delay));
+	assert(delay >= 0);
+	return delay;
+}
+
+static CARD32 sna_fake_vblank_handler(OsTimerPtr timer, CARD32 now, void *data)
+{
+	struct sna_present_event *info = data;
+	union drm_wait_vblank vbl;
+	uint64_t msc, ust;
+
+	DBG(("%s(event=%lldx%d, now=%d)\n", __FUNCTION__, (long long)info->event_id[0], info->n_event_id, now));
+
+	VG_CLEAR(vbl);
+	vbl.request.type = DRM_VBLANK_RELATIVE;
+	vbl.request.sequence = 0; /* relative offset of zero: presumably just samples the current counter -- confirm against the DRM docs */
+	if (sna_wait_vblank(info->sna, &vbl, sna_crtc_to_pipe(info->crtc)) == 0) {
+		ust = ust64(vbl.reply.tval_sec, vbl.reply.tval_usec);
+		msc = sna_crtc_record_vblank(info->crtc, &vbl);
+		DBG(("%s: event=%lld, target msc=%lld, now %lld\n",
+		     __FUNCTION__, (long long)info->event_id[0], (long long)info->target_msc, (long long)msc));
+		if (msc < info->target_msc) { /* woke up early: work out how much longer to wait */
+			uint32_t delay = msc_to_delay(info->crtc, info->target_msc);
+			if (delay)
+				return delay; /* non-zero return keeps info and timer alive; presumably re-arms the timer -- confirm against the X server timer API */
+		}
+	} else { /* the vblank query failed: fall back to the last swap recorded for this CRTC */
+		const struct ust_msc *swap = sna_crtc_last_swap(info->crtc);
+		ust = swap_ust(swap);
+		msc = swap->msc;
+		DBG(("%s: event=%lld, CRTC OFF, target msc=%lld, was %lld\n",
+		     __FUNCTION__, (long long)info->event_id[0], (long long)info->target_msc, (long long)msc));
+	}
+
+	vblank_complete(info, ust, msc); /* notifies all queued ids and frees info */
+	free(timer);
+	return 0;
+}
+
+/* Complete @info without a hardware vblank event: notify immediately when no
+ * delay remains, else arm an OS timer. Returns false if TimerSet() gave NULL. */
+static bool sna_fake_vblank(struct sna_present_event *info)
+{
+	uint64_t msc = sna_crtc_last_swap(info->crtc)->msc;
+	uint32_t delay = 0;
+
+	assert(info->n_event_id == 1);
+
+	if (msc < info->target_msc)
+		delay = msc_to_delay(info->crtc, info->target_msc);
+
+	DBG(("%s(event=%lld, target_msc=%lld, msc=%lld, delay=%ums)\n",
+	     __FUNCTION__, (long long)info->event_id[0], (long long)info->target_msc, (long long)msc, delay));
+	if (delay == 0) {
+		const struct ust_msc *swap = sna_crtc_last_swap(info->crtc);
+		present_event_notify(info->event_id[0], swap_ust(swap), swap->msc);
+		list_del(&info->link);
+		free(info);
+		return true;
+	}
+
+	return TimerSet(NULL, 0, delay, sna_fake_vblank_handler, info) != NULL;
+}
+
 static RRCrtcPtr
 sna_present_get_crtc(WindowPtr window)
 {
@@ -112,7 +228,7 @@ sna_present_get_ust_msc(RRCrtcPtr crtc, CARD64 *ust, CARD64 *msc)
 		*msc = sna_crtc_record_vblank(crtc->devPrivate, &vbl);
 	} else {
 		const struct ust_msc *swap = sna_crtc_last_swap(crtc->devPrivate);
-		*ust = ust64(swap->tv_sec, swap->tv_usec);
+		*ust = swap_ust(swap);
 		*msc = swap->msc;
 	}
 
@@ -128,42 +244,85 @@ sna_present_vblank_handler(struct drm_event_vblank *event)
 {
 	struct sna_present_event *info = to_present_event(event->user_data);
 
-	DBG(("%s: pipe=%d tv=%d.%06d msc=%d, event %lld complete\n", __FUNCTION__,
-	     sna_crtc_to_pipe(info->crtc),
-	     event->tv_sec, event->tv_usec, event->sequence,
-	     (long long)info->event_id));
-	present_event_notify(info->event_id,
-			     ust64(event->tv_sec, event->tv_usec),
-			     sna_crtc_record_event(info->crtc, event));
-	free(info);
+	vblank_complete(info,
+			ust64(event->tv_sec, event->tv_usec),
+			sna_crtc_record_event(info->crtc, event));
 }
 
 static int
 sna_present_queue_vblank(RRCrtcPtr crtc, uint64_t event_id, uint64_t msc)
 {
 	struct sna *sna = to_sna_from_screen(crtc->pScreen);
-	struct sna_present_event *event;
+	struct sna_present_event *info, *tmp;
+	const struct ust_msc *swap;
 	union drm_wait_vblank vbl;
 
 	DBG(("%s(pipe=%d, event=%lld, msc=%lld)\n",
 	     __FUNCTION__, pipe_from_crtc(crtc),
 	     (long long)event_id, (long long)msc));
 
-	event = malloc(sizeof(struct sna_present_event));
-	if (event == NULL)
+	swap = sna_crtc_last_swap(crtc->devPrivate);
+	assert((int64_t)(msc - swap->msc) >= 0);
+	if ((int64_t)(msc - swap->msc) <= 0) {
+		DBG(("%s: pipe=%d tv=%d.%06d msc=%lld (target=%lld), event=%lld complete\n", __FUNCTION__,
+		     pipe_from_crtc(crtc),
+		     swap->tv_sec, swap->tv_usec,
+		     (long long)swap->msc, (long long)msc,
+		     (long long)event_id));
+		present_event_notify(event_id, swap_ust(swap), swap->msc);
+		return Success;
+	}
+
+	list_for_each_entry(tmp, &sna->present.vblank_queue, link) {
+		if (tmp->target_msc == msc && tmp->crtc == crtc->devPrivate) {
+			uint64_t *events = tmp->event_id;
+
+			if (is_power_of_two(tmp->n_event_id)) {
+				events = malloc(2*sizeof(uint64_t)*tmp->n_event_id);
+				if (events == NULL)
+					goto fail;
+
+				memcpy(events,
+				       tmp->event_id,
+				       tmp->n_event_id*sizeof(uint64_t));
+				if (tmp->n_event_id != 1)
+					free(tmp->event_id);
+				tmp->event_id = events;
+			}
+
+			DBG(("%s: appending event=%lld to vblank %lld x %d\n",
+			     __FUNCTION__, (long long)event_id, (long long)msc, tmp->n_event_id+1));
+			events[tmp->n_event_id++] = event_id;
+			return Success;
+		}
+		if ((int64_t)(tmp->target_msc - msc) > 0)
+			break;
+	}
+
+fail:
+	info = malloc(sizeof(struct sna_present_event) + sizeof(uint64_t));
+	if (info == NULL)
 		return BadAlloc;
 
-	event->event_id = event_id;
-	event->crtc = crtc->devPrivate;
+	info->crtc = crtc->devPrivate;
+	info->sna = sna;
+	info->target_msc = msc;
+	info->event_id = (uint64_t *)(info + 1);
+	info->event_id[0] = event_id;
+	info->n_event_id = 1;
+	list_add_tail(&info->link, &tmp->link);
 
 	VG_CLEAR(vbl);
 	vbl.request.type = DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT;
 	vbl.request.sequence = msc;
-	vbl.request.signal = (uintptr_t)MARK_PRESENT(event);
-	if (sna_wait_vblank(sna, &vbl, sna_crtc_to_pipe(event->crtc))) {
+	vbl.request.signal = (uintptr_t)MARK_PRESENT(info);
+	if (sna_wait_vblank(sna, &vbl, sna_crtc_to_pipe(info->crtc))) {
 		DBG(("%s: vblank enqueue failed\n", __FUNCTION__));
-		free(event);
-		return BadMatch;
+		if (!sna_fake_vblank(info)) {
+			list_del(&info->link);
+			free(info);
+			return BadAlloc;
+		}
 	}
 
 	return Success;
@@ -201,8 +360,13 @@ check_flip__crtc(struct sna *sna,
 
 	assert(sna->scrn->vtSema);
 
-	if (sna->mode.shadow_active) {
-		DBG(("%s: shadow buffer active\n", __FUNCTION__));
+	if (!sna->mode.front_active) {
+		DBG(("%s: DPMS off, no flips\n", __FUNCTION__));
+		return FALSE;
+	}
+
+	if (sna->mode.rr_active) {
+		DBG(("%s: RandR transformation active\n", __FUNCTION__));
 		return false;
 	}
 
@@ -224,6 +388,11 @@ sna_present_check_flip(RRCrtcPtr crtc,
 	     pixmap->drawable.serialNumber,
 	     sync_flip));
 
+	if (!sna->scrn->vtSema) {
+		DBG(("%s: VT switched away, no flips\n", __FUNCTION__));
+		return FALSE;
+	}
+
 	if (sna->flags & SNA_NO_FLIP) {
 		DBG(("%s: flips not suported\n", __FUNCTION__));
 		return FALSE;
@@ -231,7 +400,7 @@ sna_present_check_flip(RRCrtcPtr crtc,
 
 	if (sync_flip) {
 		if ((sna->flags & SNA_HAS_FLIP) == 0) {
-			DBG(("%s: async flips not suported\n", __FUNCTION__));
+			DBG(("%s: sync flips not suported\n", __FUNCTION__));
 			return FALSE;
 		}
 	} else {
@@ -260,21 +429,12 @@ sna_present_check_flip(RRCrtcPtr crtc,
 	return TRUE;
 }
 
-static uint64_t gettime_ust64(void)
-{
-	struct timespec tv;
-
-	if (clock_gettime(CLOCK_MONOTONIC, &tv))
-		return 0;
-
-	return ust64(tv.tv_sec, tv.tv_nsec / 1000);
-}
-
 static Bool
-page_flip__async(RRCrtcPtr crtc,
-		 uint64_t event_id,
-		 uint64_t target_msc,
-		 struct kgem_bo *bo)
+flip__async(struct sna *sna,
+	    RRCrtcPtr crtc,
+	    uint64_t event_id,
+	    uint64_t target_msc,
+	    struct kgem_bo *bo)
 {
 	DBG(("%s(pipe=%d, event=%lld, handle=%d)\n",
 	     __FUNCTION__,
@@ -282,17 +442,17 @@ page_flip__async(RRCrtcPtr crtc,
 	     (long long)event_id,
 	     bo->handle));
 
-	if (!sna_page_flip(to_sna_from_screen(crtc->pScreen), bo, NULL, NULL)) {
+	if (!sna_page_flip(sna, bo, NULL, NULL)) {
 		DBG(("%s: async pageflip failed\n", __FUNCTION__));
 		present_info.capabilities &= ~PresentCapabilityAsync;
 		return FALSE;
 	}
 
-	DBG(("%s: pipe=%d tv=%d.%06d msc=%d, event %lld complete\n", __FUNCTION__,
+	DBG(("%s: pipe=%d tv=%d.%06d msc=%lld (target=%lld), event=%lld complete\n", __FUNCTION__,
 	     pipe_from_crtc(crtc),
 	     gettime_ust64() / 1000000, gettime_ust64() % 1000000,
-	     sna_crtc_last_swap(crtc->devPrivate)->msc,
-	     (long long)event_id));
+	     crtc ? (long long)sna_crtc_last_swap(crtc->devPrivate)->msc : 0LL,
+	     (long long)target_msc, (long long)event_id));
 	present_event_notify(event_id, gettime_ust64(), target_msc);
 	return TRUE;
 }
@@ -303,7 +463,8 @@ present_flip_handler(struct drm_event_vblank *event, void *data)
 	struct sna_present_event *info = data;
 	struct ust_msc swap;
 
-	DBG(("%s(sequence=%d)\n", __FUNCTION__, event->sequence));
+	DBG(("%s(sequence=%d): event=%lld\n", __FUNCTION__, event->sequence, (long long)info->event_id[0]));
+	assert(info->n_event_id == 1);
 
 	if (info->crtc == NULL) {
 		swap.tv_sec = event->tv_sec;
@@ -312,22 +473,31 @@ present_flip_handler(struct drm_event_vblank *event, void *data)
 	} else
 		swap = *sna_crtc_last_swap(info->crtc);
 
-	DBG(("%s: pipe=%d, tv=%d.%06d msc %lld, event %lld complete\n", __FUNCTION__,
+	DBG(("%s: pipe=%d, tv=%d.%06d msc=%lld (target %lld), event=%lld complete%s\n", __FUNCTION__,
 	     info->crtc ? sna_crtc_to_pipe(info->crtc) : -1,
 	     swap.tv_sec, swap.tv_usec, (long long)swap.msc,
-	     (long long)info->event_id));
-	present_event_notify(info->event_id, ust64(swap.tv_sec, swap.tv_usec), swap.msc);
+	     (long long)info->target_msc,
+	     (long long)info->event_id[0],
+	     info->target_msc && info->target_msc == swap.msc ? "" : ": MISS"));
+	present_event_notify(info->event_id[0], swap_ust(&swap), swap.msc);
+
+	if (info->sna->present.unflip) {
+		DBG(("%s: executing queued unflip (event=%lld)\n", __FUNCTION__, (long long)info->sna->present.unflip));
+		sna_present_unflip(xf86ScrnToScreen(info->sna->scrn),
+				   info->sna->present.unflip);
+		info->sna->present.unflip = 0;
+	}
 	free(info);
 }
 
 static Bool
-page_flip(ScreenPtr screen,
-	  RRCrtcPtr crtc,
-	  uint64_t event_id,
-	  struct kgem_bo *bo)
+flip(struct sna *sna,
+     RRCrtcPtr crtc,
+     uint64_t event_id,
+     uint64_t target_msc,
+     struct kgem_bo *bo)
 {
-	struct sna *sna = to_sna_from_screen(screen);
-	struct sna_present_event *event;
+	struct sna_present_event *info;
 
 	DBG(("%s(pipe=%d, event=%lld, handle=%d)\n",
 	     __FUNCTION__,
@@ -335,15 +505,20 @@ page_flip(ScreenPtr screen,
 	     (long long)event_id,
 	     bo->handle));
 
-	event = malloc(sizeof(struct sna_present_event));
-	if (event == NULL)
+	info = malloc(sizeof(struct sna_present_event)+sizeof(uint64_t));
+	if (info == NULL)
 		return FALSE;
 
-	event->event_id = event_id;
-	event->crtc = crtc ? crtc->devPrivate : NULL;
-	if (!sna_page_flip(sna, bo, present_flip_handler, event)) {
+	info->crtc = crtc ? crtc->devPrivate : NULL;
+	info->sna = sna;
+	info->event_id = (uint64_t *)(info + 1);
+	info->event_id[0] = event_id;
+	info->n_event_id = 1;
+	info->target_msc = target_msc;
+
+	if (!sna_page_flip(sna, bo, present_flip_handler, info)) {
 		DBG(("%s: pageflip failed\n", __FUNCTION__));
-		free(event);
+		free(info);
 		return FALSE;
 	}
 
@@ -358,12 +533,48 @@ get_flip_bo(PixmapPtr pixmap)
 
 	DBG(("%s(pixmap=%ld)\n", __FUNCTION__, pixmap->drawable.serialNumber));
 
-	priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ | __MOVE_FORCE);
+	priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ | __MOVE_SCANOUT | __MOVE_FORCE);
 	if (priv == NULL) {
 		DBG(("%s: cannot force pixmap to the GPU\n", __FUNCTION__));
 		return NULL;
 	}
 
+	if (priv->gpu_bo->scanout)
+		return priv->gpu_bo;
+
+	if (sna->kgem.has_llc && !wedged(sna)) {
+		struct kgem_bo *bo;
+		uint32_t tiling;
+
+		tiling = I915_TILING_NONE;
+		if ((sna->flags & SNA_LINEAR_FB) == 0)
+			tiling = I915_TILING_X;
+
+		bo = kgem_create_2d(&sna->kgem,
+				    pixmap->drawable.width,
+				    pixmap->drawable.height,
+				    pixmap->drawable.bitsPerPixel,
+				    tiling, CREATE_SCANOUT | CREATE_CACHED);
+		if (bo) {
+			BoxRec box;
+
+			box.x1 = box.y1 = 0;
+			box.x2 = pixmap->drawable.width;
+			box.y2 = pixmap->drawable.height;
+
+			if (sna->render.copy_boxes(sna, GXcopy,
+						   &pixmap->drawable, priv->gpu_bo, 0, 0,
+						   &pixmap->drawable, bo, 0, 0,
+						   &box, 1, 0)) {
+				sna_pixmap_unmap(pixmap, priv);
+				kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
+
+				priv->gpu_bo = bo;
+			} else
+				kgem_bo_destroy(&sna->kgem, bo);
+		}
+	}
+
 	if (sna->flags & SNA_LINEAR_FB &&
 	    priv->gpu_bo->tiling &&
 	    !sna_pixmap_change_tiling(pixmap, I915_TILING_NONE)) {
@@ -377,7 +588,6 @@ get_flip_bo(PixmapPtr pixmap)
 		return NULL;
 	}
 
-	priv->pinned |= PIN_SCANOUT;
 	return priv->gpu_bo;
 }
 
@@ -388,6 +598,7 @@ sna_present_flip(RRCrtcPtr crtc,
 		 PixmapPtr pixmap,
 		 Bool sync_flip)
 {
+	struct sna *sna = to_sna_from_pixmap(pixmap);
 	struct kgem_bo *bo;
 
 	DBG(("%s(pipe=%d, event=%lld, msc=%lld, pixmap=%ld, sync?=%d)\n",
@@ -397,11 +608,22 @@ sna_present_flip(RRCrtcPtr crtc,
 	     (long long)target_msc,
 	     pixmap->drawable.serialNumber, sync_flip));
 
-	if (!check_flip__crtc(to_sna_from_pixmap(pixmap), crtc)) {
+	if (!check_flip__crtc(sna, crtc)) {
 		DBG(("%s: flip invalid for CRTC\n", __FUNCTION__));
 		return FALSE;
 	}
 
+	assert(sna->present.unflip == 0);
+
+	if (sna->flags & SNA_TEAR_FREE)
+		sna->mode.shadow_enabled = false;
+	assert(!sna->mode.shadow_enabled);
+
+	if (sna->mode.flip_active) {
+		DBG(("%s: flips still pending\n", __FUNCTION__));
+		return FALSE;
+	}
+
 	bo = get_flip_bo(pixmap);
 	if (bo == NULL) {
 		DBG(("%s: flip invalid bo\n", __FUNCTION__));
@@ -409,9 +631,9 @@ sna_present_flip(RRCrtcPtr crtc,
 	}
 
 	if (sync_flip)
-		return page_flip(crtc->pScreen, crtc, event_id, bo);
+		return flip(sna, crtc, event_id, target_msc, bo);
 	else
-		return page_flip__async(crtc, event_id, target_msc, bo);
+		return flip__async(sna, crtc, event_id, target_msc, bo);
 }
 
 static void
@@ -421,29 +643,49 @@ sna_present_unflip(ScreenPtr screen, uint64_t event_id)
 	struct kgem_bo *bo;
 
 	DBG(("%s(event=%lld)\n", __FUNCTION__, (long long)event_id));
-	if (sna->mode.front_active == 0 || sna->mode.shadow_active) {
+	if (sna->mode.front_active == 0 || sna->mode.rr_active) {
 		const struct ust_msc *swap;
 
 		DBG(("%s: no CRTC active, perform no-op flip\n", __FUNCTION__));
 
 notify:
-		swap = sna_crtc_last_swap(sna_mode_first_crtc(sna));
-		DBG(("%s: pipe=%d, tv=%d.%06d msc %lld, event %lld complete\n", __FUNCTION__,
+		swap = sna_crtc_last_swap(sna_primary_crtc(sna));
+		DBG(("%s: pipe=%d, tv=%d.%06d msc=%lld, event=%lld complete\n", __FUNCTION__,
 		     -1,
 		     swap->tv_sec, swap->tv_usec, (long long)swap->msc,
 		     (long long)event_id));
-		present_event_notify(event_id,
-				     ust64(swap->tv_sec, swap->tv_usec),
-				     swap->msc);
+		present_event_notify(event_id, swap_ust(swap), swap->msc);
+		return;
+	}
+
+	if (sna->mode.flip_active) {
+		DBG(("%s: %d outstanding flips, queueing unflip\n", __FUNCTION__, sna->mode.flip_active));
+		assert(sna->present.unflip == 0);
+		sna->present.unflip = event_id;
 		return;
 	}
 
+	if (sna->flags & SNA_TEAR_FREE)
+		sna->mode.shadow_enabled = sna->mode.shadow_damage != NULL;
+
 	bo = get_flip_bo(screen->GetScreenPixmap(screen));
-	if (bo == NULL || !page_flip(screen, NULL, event_id, bo)) {
+	if (bo == NULL) {
+reset_mode:
 		DBG(("%s: failed, trying to restore original mode\n", __FUNCTION__));
 		xf86SetDesiredModes(sna->scrn);
 		goto notify;
 	}
+
+	assert(sna_pixmap(screen->GetScreenPixmap(screen))->pinned & PIN_SCANOUT);
+
+	if (sna->flags & SNA_HAS_ASYNC_FLIP) {
+		DBG(("%s: trying async flip restore\n", __FUNCTION__));
+		if (flip__async(sna, NULL, event_id, 0, bo))
+			return;
+	}
+
+	if (!flip(sna, NULL, event_id, 0, bo))
+		goto reset_mode;
 }
 
 static present_screen_info_rec present_info = {
@@ -463,10 +705,13 @@ static present_screen_info_rec present_info = {
 
 bool sna_present_open(struct sna *sna, ScreenPtr screen)
 {
+	DBG(("%s(num_crtc=%d)\n", __FUNCTION__, sna->mode.num_real_crtc));
+
 	if (sna->mode.num_real_crtc == 0)
 		return false;
 
 	sna_present_update(sna);
+	list_init(&sna->present.vblank_queue);
 
 	return present_screen_init(screen, &present_info);
 }
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 3fbb9ec..8a9c7f4 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -2336,6 +2336,9 @@ memcpy_copy_boxes(struct sna *sna, uint8_t op,
 	if (op != GXcopy)
 		return false;
 
+	if (src_draw->depth != dst_draw->depth)
+		return false;
+
 	clipped = (n > 1 ||
 		   box->x1 + dx > 0 ||
 		   box->y1 + dy > 0 ||
@@ -2380,4 +2383,5 @@ void
 sna_render_mark_wedged(struct sna *sna)
 {
 	sna->render.copy_boxes = memcpy_copy_boxes;
+	sna->render.prefer_gpu = 0;
 }
diff --git a/src/sna/sna_render_inline.h b/src/sna/sna_render_inline.h
index 10fbbfe..e162e37 100644
--- a/src/sna/sna_render_inline.h
+++ b/src/sna/sna_render_inline.h
@@ -304,6 +304,12 @@ color_convert(uint32_t pixel,
 	return pixel;
 }
 
+inline static uint32_t
+solid_color(uint32_t format, uint32_t pixel)
+{
+	return color_convert(pixel, format, PICT_a8r8g8b8);
+}
+
 inline static bool dst_use_gpu(PixmapPtr pixmap)
 {
 	struct sna_pixmap *priv = sna_pixmap(pixmap);
diff --git a/src/sna/sna_trapezoids_imprecise.c b/src/sna/sna_trapezoids_imprecise.c
index 37def2f..df22add 100644
--- a/src/sna/sna_trapezoids_imprecise.c
+++ b/src/sna/sna_trapezoids_imprecise.c
@@ -962,6 +962,16 @@ tor_add_trapezoid(struct tor *tor,
 		  const xTrapezoid *t,
 		  int dx, int dy)
 {
+	if (!xTrapezoidValid(t)) {
+		__DBG(("%s: skipping invalid trapezoid: top=%d, bottom=%d, left=(%d, %d), (%d, %d), right=(%d, %d), (%d, %d)\n",
+		       __FUNCTION__,
+		       t->top, t->bottom,
+		       t->left.p1.x, t->left.p1.y,
+		       t->left.p2.x, t->left.p2.y,
+		       t->right.p1.x, t->right.p1.y,
+		       t->right.p2.x, t->right.p2.y));
+		return;
+	}
 	polygon_add_edge(tor->polygon, t, &t->left, 1, dx, dy);
 	polygon_add_edge(tor->polygon, t, &t->right, -1, dx, dy);
 }
@@ -1687,31 +1697,28 @@ struct span_thread {
 #define SPAN_THREAD_MAX_BOXES (8192/sizeof(struct sna_opacity_box))
 struct span_thread_boxes {
 	const struct sna_composite_spans_op *op;
+	const BoxRec *clip_start, *clip_end;
 	int num_boxes;
 	struct sna_opacity_box boxes[SPAN_THREAD_MAX_BOXES];
 };
 
-static void span_thread_add_boxes(struct sna *sna, void *data,
-				  const BoxRec *box, int count, float alpha)
+static void span_thread_add_box(struct sna *sna, void *data,
+				const BoxRec *box, float alpha)
 {
 	struct span_thread_boxes *b = data;
 
 	__DBG(("%s: adding %d boxes with alpha=%f\n",
 	       __FUNCTION__, count, alpha));
 
-	assert(count > 0 && count <= SPAN_THREAD_MAX_BOXES);
-	if (unlikely(b->num_boxes + count > SPAN_THREAD_MAX_BOXES)) {
-		DBG(("%s: flushing %d boxes, adding %d\n", __FUNCTION__, b->num_boxes, count));
-		assert(b->num_boxes <= SPAN_THREAD_MAX_BOXES);
+	if (unlikely(b->num_boxes == SPAN_THREAD_MAX_BOXES)) {
+		DBG(("%s: flushing %d boxes\n", __FUNCTION__, b->num_boxes));
 		b->op->thread_boxes(sna, b->op, b->boxes, b->num_boxes);
 		b->num_boxes = 0;
 	}
 
-	do {
-		b->boxes[b->num_boxes].box = *box++;
-		b->boxes[b->num_boxes].alpha = alpha;
-		b->num_boxes++;
-	} while (--count);
+	b->boxes[b->num_boxes].box = *box++;
+	b->boxes[b->num_boxes].alpha = alpha;
+	b->num_boxes++;
 	assert(b->num_boxes <= SPAN_THREAD_MAX_BOXES);
 }
 
@@ -1722,8 +1729,22 @@ span_thread_box(struct sna *sna,
 		const BoxRec *box,
 		int coverage)
 {
+	struct span_thread_boxes *b = (struct span_thread_boxes *)op;
+
 	__DBG(("%s: %d -> %d @ %d\n", __FUNCTION__, box->x1, box->x2, coverage));
-	span_thread_add_boxes(sna, op, box, 1, AREA_TO_ALPHA(coverage));
+	if (b->num_boxes) {
+		struct sna_opacity_box *bb = &b->boxes[b->num_boxes-1];
+		if (bb->box.x1 == box->x1 &&
+		    bb->box.x2 == box->x2 &&
+		    bb->box.y2 == box->y1 &&
+		    bb->alpha == AREA_TO_ALPHA(coverage)) {
+			bb->box.y2 = box->y2;
+			__DBG(("%s: contracted double row: %d -> %d\n", __func__, bb->box.y1, bb->box.y2));
+			return;
+		}
+	}
+
+	span_thread_add_box(sna, op, box, AREA_TO_ALPHA(coverage));
 }
 
 static void
@@ -1733,20 +1754,28 @@ span_thread_clipped_box(struct sna *sna,
 			const BoxRec *box,
 			int coverage)
 {
-	pixman_region16_t region;
+	struct span_thread_boxes *b = (struct span_thread_boxes *)op;
+	const BoxRec *c;
 
 	__DBG(("%s: %d -> %d @ %f\n", __FUNCTION__, box->x1, box->x2,
 	       AREA_TO_ALPHA(coverage)));
 
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	if (region_num_rects(&region)) {
-		span_thread_add_boxes(sna, op,
-				      region_rects(&region),
-				      region_num_rects(&region),
-				      AREA_TO_ALPHA(coverage));
+	b->clip_start =
+		find_clip_box_for_y(b->clip_start, b->clip_end, box->y1);
+
+	c = b->clip_start;
+	while (c != b->clip_end) {
+		BoxRec clipped;
+
+		if (box->y2 <= c->y1)
+			break;
+
+		clipped = *box;
+		if (!box_intersect(&clipped, c++))
+			continue;
+
+		span_thread_add_box(sna, op, &clipped, AREA_TO_ALPHA(coverage));
 	}
-	pixman_region_fini(&region);
 }
 
 static span_func_t
@@ -1777,6 +1806,16 @@ thread_choose_span(struct sna_composite_spans_op *tmp,
 	return span;
 }
 
+inline static void
+span_thread_boxes_init(struct span_thread_boxes *boxes,
+		       const struct sna_composite_spans_op *op,
+		       const RegionRec *clip)
+{
+	boxes->op = op;
+	region_get_boxes(clip, &boxes->clip_start, &boxes->clip_end);
+	boxes->num_boxes = 0;
+}
+
 static void
 span_thread(void *arg)
 {
@@ -1789,8 +1828,7 @@ span_thread(void *arg)
 	if (!tor_init(&tor, &thread->extents, 2*thread->ntrap))
 		return;
 
-	boxes.op = thread->op;
-	boxes.num_boxes = 0;
+	span_thread_boxes_init(&boxes, thread->op, thread->clip);
 
 	y1 = thread->extents.y1 - thread->draw_y;
 	y2 = thread->extents.y2 - thread->draw_y;
@@ -2190,6 +2228,52 @@ static void _tor_blt_src(struct inplace *in, const BoxRec *box, uint8_t v)
 	} while (--h);
 }
 
+struct clipped_span {
+	span_func_t span;
+	const BoxRec *clip_start, *clip_end;
+};
+
+static void
+tor_blt_clipped(struct sna *sna,
+		struct sna_composite_spans_op *op,
+		pixman_region16_t *clip,
+		const BoxRec *box,
+		int coverage)
+{
+	struct clipped_span *cs = (struct clipped_span *)clip; /* not a real region: callers pass a struct clipped_span through this argument */
+	const BoxRec *c;
+
+	cs->clip_start = /* the returned start is stored back, so the next call resumes from it */
+		find_clip_box_for_y(cs->clip_start, cs->clip_end, box->y1);
+
+	c = cs->clip_start;
+	while (c != cs->clip_end) {
+		BoxRec clipped;
+
+		if (box->y2 <= c->y1) /* this clip box starts at or below the span's bottom edge: stop scanning */
+			break;
+
+		clipped = *box;
+		if (!box_intersect(&clipped, c++)) /* c advances whether or not the boxes overlap */
+			continue;
+
+		cs->span(sna, op, NULL, &clipped, coverage); /* forward the clipped box to the wrapped span function, with no clip */
+	}
+}
+
+inline static span_func_t
+clipped_span(struct clipped_span *cs,
+	     span_func_t span,
+	     const RegionRec *clip)
+{
+	if (clip->data) {
+		cs->span = span;
+		region_get_boxes(clip, &cs->clip_start, &cs->clip_end);
+		span = tor_blt_clipped;
+	}
+	return span;
+}
+
 static void
 tor_blt_src(struct sna *sna,
 	    struct sna_composite_spans_op *op,
@@ -2203,25 +2287,6 @@ tor_blt_src(struct sna *sna,
 }
 
 static void
-tor_blt_src_clipped(struct sna *sna,
-		    struct sna_composite_spans_op *op,
-		    pixman_region16_t *clip,
-		    const BoxRec *box,
-		    int coverage)
-{
-	pixman_region16_t region;
-	int n;
-
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	n = region_num_rects(&region);
-	box = region_rects(&region);
-	while (n--)
-		tor_blt_src(sna, op, NULL, box++, coverage);
-	pixman_region_fini(&region);
-}
-
-static void
 tor_blt_in(struct sna *sna,
 	   struct sna_composite_spans_op *op,
 	   pixman_region16_t *clip,
@@ -2253,25 +2318,6 @@ tor_blt_in(struct sna *sna,
 }
 
 static void
-tor_blt_in_clipped(struct sna *sna,
-		   struct sna_composite_spans_op *op,
-		   pixman_region16_t *clip,
-		   const BoxRec *box,
-		   int coverage)
-{
-	pixman_region16_t region;
-	int n;
-
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	n = region_num_rects(&region);
-	box = region_rects(&region);
-	while (n--)
-		tor_blt_in(sna, op, NULL, box++, coverage);
-	pixman_region_fini(&region);
-}
-
-static void
 tor_blt_add(struct sna *sna,
 	    struct sna_composite_spans_op *op,
 	    pixman_region16_t *clip,
@@ -2310,25 +2356,6 @@ tor_blt_add(struct sna *sna,
 }
 
 static void
-tor_blt_add_clipped(struct sna *sna,
-		    struct sna_composite_spans_op *op,
-		    pixman_region16_t *clip,
-		    const BoxRec *box,
-		    int coverage)
-{
-	pixman_region16_t region;
-	int n;
-
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	n = region_num_rects(&region);
-	box = region_rects(&region);
-	while (n--)
-		tor_blt_add(sna, op, NULL, box++, coverage);
-	pixman_region_fini(&region);
-}
-
-static void
 tor_blt_lerp32(struct sna *sna,
 	       struct sna_composite_spans_op *op,
 	       pixman_region16_t *clip,
@@ -2383,25 +2410,6 @@ tor_blt_lerp32(struct sna *sna,
 	}
 }
 
-static void
-tor_blt_lerp32_clipped(struct sna *sna,
-		       struct sna_composite_spans_op *op,
-		       pixman_region16_t *clip,
-		       const BoxRec *box,
-		       int coverage)
-{
-	pixman_region16_t region;
-	int n;
-
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	n = region_num_rects(&region);
-	box = region_rects(&region);
-	while (n--)
-		tor_blt_lerp32(sna, op, NULL, box++, coverage);
-	pixman_region_fini(&region);
-}
-
 struct pixman_inplace {
 	pixman_image_t *image, *source, *mask;
 	uint32_t color;
@@ -2431,24 +2439,6 @@ pixmask_span_solid(struct sna *sna,
 			       pi->dx + box->x1, pi->dy + box->y1,
 			       box->x2 - box->x1, box->y2 - box->y1);
 }
-static void
-pixmask_span_solid__clipped(struct sna *sna,
-			    struct sna_composite_spans_op *op,
-			    pixman_region16_t *clip,
-			    const BoxRec *box,
-			    int coverage)
-{
-	pixman_region16_t region;
-	int n;
-
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	n = region_num_rects(&region);
-	box = region_rects(&region);
-	while (n--)
-		pixmask_span_solid(sna, op, NULL, box++, coverage);
-	pixman_region_fini(&region);
-}
 
 static void
 pixmask_span(struct sna *sna,
@@ -2471,24 +2461,6 @@ pixmask_span(struct sna *sna,
 			       pi->dx + box->x1, pi->dy + box->y1,
 			       box->x2 - box->x1, box->y2 - box->y1);
 }
-static void
-pixmask_span__clipped(struct sna *sna,
-		      struct sna_composite_spans_op *op,
-		      pixman_region16_t *clip,
-		      const BoxRec *box,
-		      int coverage)
-{
-	pixman_region16_t region;
-	int n;
-
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	n = region_num_rects(&region);
-	box = region_rects(&region);
-	while (n--)
-		pixmask_span(sna, op, NULL, box++, coverage);
-	pixman_region_fini(&region);
-}
 
 struct inplace_x8r8g8b8_thread {
 	xTrapezoid *traps;
@@ -2507,6 +2479,7 @@ static void inplace_x8r8g8b8_thread(void *arg)
 	struct inplace_x8r8g8b8_thread *thread = arg;
 	struct tor tor;
 	span_func_t span;
+	struct clipped_span clipped;
 	RegionPtr clip;
 	int y1, y2, n;
 
@@ -2537,12 +2510,11 @@ static void inplace_x8r8g8b8_thread(void *arg)
 		inplace.stride = pixmap->devKind;
 		inplace.color = thread->color;
 
-		if (clip->data)
-			span = tor_blt_lerp32_clipped;
-		else
-			span = tor_blt_lerp32;
+		span = clipped_span(&clipped, tor_blt_lerp32, clip);
 
-		tor_render(NULL, &tor, (void*)&inplace, clip, span, false);
+		tor_render(NULL, &tor,
+			   (void*)&inplace, (void*)&clipped,
+			   span, false);
 	} else if (thread->is_solid) {
 		struct pixman_inplace pi;
 
@@ -2555,12 +2527,11 @@ static void inplace_x8r8g8b8_thread(void *arg)
 						     1, 1, pi.bits, 0);
 		pixman_image_set_repeat(pi.source, PIXMAN_REPEAT_NORMAL);
 
-		if (clip->data)
-			span = pixmask_span_solid__clipped;
-		else
-			span = pixmask_span_solid;
+		span = clipped_span(&clipped, pixmask_span_solid, clip);
 
-		tor_render(NULL, &tor, (void*)&pi, clip, span, false);
+		tor_render(NULL, &tor,
+			   (void*)&pi, (void *)&clipped,
+			   span, false);
 
 		pixman_image_unref(pi.source);
 		pixman_image_unref(pi.image);
@@ -2579,12 +2550,11 @@ static void inplace_x8r8g8b8_thread(void *arg)
 		pi.bits = pixman_image_get_data(pi.mask);
 		pi.op = thread->op;
 
-		if (clip->data)
-			span = pixmask_span__clipped;
-		else
-			span = pixmask_span;
+		span = clipped_span(&clipped, pixmask_span, clip);
 
-		tor_render(NULL, &tor, (void*)&pi, clip, span, false);
+		tor_render(NULL, &tor,
+			   (void*)&pi, (void *)&clipped,
+			   span, false);
 
 		pixman_image_unref(pi.mask);
 		pixman_image_unref(pi.source);
@@ -2698,6 +2668,7 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
 	if (num_threads == 1) {
 		struct tor tor;
 		span_func_t span;
+		struct clipped_span clipped;
 
 		if (!tor_init(&tor, &region.extents, 2*ntrap))
 			return true;
@@ -2723,17 +2694,15 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
 			inplace.stride = pixmap->devKind;
 			inplace.color = color;
 
-			if (dst->pCompositeClip->data)
-				span = tor_blt_lerp32_clipped;
-			else
-				span = tor_blt_lerp32;
+			span = clipped_span(&clipped, tor_blt_lerp32, dst->pCompositeClip);
 
 			DBG(("%s: render inplace op=%d, color=%08x\n",
 			     __FUNCTION__, op, color));
 
 			if (sigtrap_get() == 0) {
-				tor_render(NULL, &tor, (void*)&inplace,
-					   dst->pCompositeClip, span, false);
+				tor_render(NULL, &tor,
+					   (void*)&inplace, (void*)&clipped,
+					   span, false);
 				sigtrap_put();
 			}
 		} else if (is_solid) {
@@ -2748,15 +2717,12 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
 							     1, 1, pi.bits, 0);
 			pixman_image_set_repeat(pi.source, PIXMAN_REPEAT_NORMAL);
 
-			if (dst->pCompositeClip->data)
-				span = pixmask_span_solid__clipped;
-			else
-				span = pixmask_span_solid;
+			span = clipped_span(&clipped, pixmask_span_solid, dst->pCompositeClip);
 
 			if (sigtrap_get() == 0) {
-				tor_render(NULL, &tor, (void*)&pi,
-					   dst->pCompositeClip, span,
-					   false);
+				tor_render(NULL, &tor,
+					   (void*)&pi, (void*)&clipped,
+					   span, false);
 				sigtrap_put();
 			}
 
@@ -2777,15 +2743,12 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
 			pi.bits = pixman_image_get_data(pi.mask);
 			pi.op = op;
 
-			if (dst->pCompositeClip->data)
-				span = pixmask_span__clipped;
-			else
-				span = pixmask_span;
+			span = clipped_span(&clipped, pixmask_span, dst->pCompositeClip);
 
 			if (sigtrap_get() == 0) {
-				tor_render(NULL, &tor, (void*)&pi,
-					   dst->pCompositeClip, span,
-					   false);
+				tor_render(NULL, &tor,
+					   (void*)&pi, (void*)&clipped,
+					   span, false);
 				sigtrap_put();
 			}
 
@@ -2847,9 +2810,9 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
 
 struct inplace_thread {
 	xTrapezoid *traps;
-	RegionPtr clip;
 	span_func_t span;
 	struct inplace inplace;
+	struct clipped_span clipped;
 	BoxRec extents;
 	int dx, dy;
 	int draw_x, draw_y;
@@ -2874,8 +2837,9 @@ static void inplace_thread(void *arg)
 		tor_add_trapezoid(&tor, &thread->traps[n], thread->dx, thread->dy);
 	}
 
-	tor_render(NULL, &tor, (void*)&thread->inplace,
-		   thread->clip, thread->span, thread->unbounded);
+	tor_render(NULL, &tor,
+		   (void*)&thread->inplace, (void*)&thread->clipped,
+		   thread->span, thread->unbounded);
 
 	tor_fini(&tor);
 }
@@ -2889,6 +2853,7 @@ imprecise_trapezoid_span_inplace(struct sna *sna,
 				 bool fallback)
 {
 	struct inplace inplace;
+	struct clipped_span clipped;
 	span_func_t span;
 	PixmapPtr pixmap;
 	struct sna_pixmap *priv;
@@ -3005,21 +2970,12 @@ imprecise_trapezoid_span_inplace(struct sna *sna,
 	     region.extents.x2, region.extents.y2));
 
 	if (op == PictOpSrc) {
-		if (dst->pCompositeClip->data)
-			span = tor_blt_src_clipped;
-		else
-			span = tor_blt_src;
+		span = tor_blt_src;
 	} else if (op == PictOpIn) {
-		if (dst->pCompositeClip->data)
-			span = tor_blt_in_clipped;
-		else
-			span = tor_blt_in;
+		span = tor_blt_in;
 	} else {
 		assert(op == PictOpAdd);
-		if (dst->pCompositeClip->data)
-			span = tor_blt_add_clipped;
-		else
-			span = tor_blt_add;
+		span = tor_blt_add;
 	}
 
 	DBG(("%s: move-to-cpu\n", __FUNCTION__));
@@ -3037,6 +2993,8 @@ imprecise_trapezoid_span_inplace(struct sna *sna,
 	inplace.stride = pixmap->devKind;
 	inplace.opacity = color >> 24;
 
+	span = clipped_span(&clipped, span, dst->pCompositeClip);
+
 	num_threads = 1;
 	if ((flags & COMPOSITE_SPANS_RECTILINEAR) == 0)
 		num_threads = sna_use_threads(region.extents.x2 - region.extents.x1,
@@ -3057,8 +3015,9 @@ imprecise_trapezoid_span_inplace(struct sna *sna,
 		}
 
 		if (sigtrap_get() == 0) {
-			tor_render(NULL, &tor, (void*)&inplace,
-				   dst->pCompositeClip, span, unbounded);
+			tor_render(NULL, &tor,
+				   (void*)&inplace, (void *)&clipped,
+				   span, unbounded);
 			sigtrap_put();
 		}
 
@@ -3075,8 +3034,8 @@ imprecise_trapezoid_span_inplace(struct sna *sna,
 		threads[0].traps = traps;
 		threads[0].ntrap = ntrap;
 		threads[0].inplace = inplace;
+		threads[0].clipped = clipped;
 		threads[0].extents = region.extents;
-		threads[0].clip = dst->pCompositeClip;
 		threads[0].span = span;
 		threads[0].unbounded = unbounded;
 		threads[0].dx = dx;
@@ -3707,8 +3666,7 @@ tristrip_thread(void *arg)
 	if (!tor_init(&tor, &thread->extents, 2*thread->count))
 		return;
 
-	boxes.op = thread->op;
-	boxes.num_boxes = 0;
+	span_thread_boxes_init(&boxes, thread->op, thread->clip);
 
 	cw = 0; ccw = 1;
 	polygon_add_line(tor.polygon,
@@ -3874,7 +3832,7 @@ imprecise_tristrip_span_converter(struct sna *sna,
 				break;
 		} while (1);
 		polygon_add_line(tor.polygon,
-				 &points[cw], &points[2+ccw],
+				 &points[cw], &points[ccw],
 				 dx, dy);
 		assert(tor.polygon->num_edges <= 2*count);
 
diff --git a/src/sna/sna_trapezoids_mono.c b/src/sna/sna_trapezoids_mono.c
index 808703a..29cb58f 100644
--- a/src/sna/sna_trapezoids_mono.c
+++ b/src/sna/sna_trapezoids_mono.c
@@ -72,6 +72,7 @@ struct mono {
 	struct sna *sna;
 	struct sna_composite_op op;
 	pixman_region16_t clip;
+	const BoxRec *clip_start, *clip_end;
 
 	fastcall void (*span)(struct mono *, int, int, BoxPtr);
 
@@ -474,6 +475,34 @@ mono_span__fast(struct mono *c, int x1, int x2, BoxPtr box)
 	c->op.box(c->sna, &c->op, box);
 }
 
+fastcall static void
+mono_span__clipped(struct mono *c, int x1, int x2, BoxPtr box)
+{
+	const BoxRec *b;
+
+	__DBG(("%s [%d, %d]\n", __FUNCTION__, x1, x2));
+
+	c->clip_start =
+		find_clip_box_for_y(c->clip_start, c->clip_end, box->y1);
+
+	b = c->clip_start;
+	while (b != c->clip_end) {
+		BoxRec clipped;
+
+		if (box->y2 <= b->y1)
+			break;
+
+		clipped.x1 = x1;
+		clipped.x2 = x2;
+		clipped.y1 = box->y1;
+		clipped.y2 = box->y2;
+		if (!box_intersect(&clipped, b++))
+			continue;
+
+		c->op.box(c->sna, &c->op, &clipped);
+	}
+}
+
 struct mono_span_thread_boxes {
 	const struct sna_composite_op *op;
 #define MONO_SPAN_MAX_BOXES (8192/sizeof(BoxRec))
@@ -482,40 +511,45 @@ struct mono_span_thread_boxes {
 };
 
 inline static void
-thread_mono_span_add_boxes(struct mono *c, const BoxRec *box, int count)
+thread_mono_span_add_box(struct mono *c, const BoxRec *box)
 {
 	struct mono_span_thread_boxes *b = c->op.priv;
 
-	assert(count > 0 && count <= MONO_SPAN_MAX_BOXES);
-	if (unlikely(b->num_boxes + count > MONO_SPAN_MAX_BOXES)) {
+	if (unlikely(b->num_boxes == MONO_SPAN_MAX_BOXES)) {
 		b->op->thread_boxes(c->sna, b->op, b->boxes, b->num_boxes);
 		b->num_boxes = 0;
 	}
 
-	memcpy(b->boxes + b->num_boxes, box, count*sizeof(BoxRec));
-	b->num_boxes += count;
+	b->boxes[b->num_boxes++] = *box;
 	assert(b->num_boxes <= MONO_SPAN_MAX_BOXES);
 }
 
 fastcall static void
 thread_mono_span_clipped(struct mono *c, int x1, int x2, BoxPtr box)
 {
-	pixman_region16_t region;
+	const BoxRec *b;
 
 	__DBG(("%s [%d, %d]\n", __FUNCTION__, x1, x2));
 
-	box->x1 = x1;
-	box->x2 = x2;
+	c->clip_start =
+		find_clip_box_for_y(c->clip_start, c->clip_end, box->y1);
 
-	assert(c->clip.data);
+	b = c->clip_start;
+	while (b != c->clip_end) {
+		BoxRec clipped;
 
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, &c->clip);
-	if (region_num_rects(&region))
-		thread_mono_span_add_boxes(c,
-					   region_rects(&region),
-					   region_num_rects(&region));
-	pixman_region_fini(&region);
+		if (box->y2 <= b->y1)
+			break;
+
+		clipped.x1 = x1;
+		clipped.x2 = x2;
+		clipped.y1 = box->y1;
+		clipped.y2 = box->y2;
+		if (!box_intersect(&clipped, b++))
+			continue;
+
+		thread_mono_span_add_box(c, &clipped);
+	}
 }
 
 fastcall static void
@@ -525,7 +559,7 @@ thread_mono_span(struct mono *c, int x1, int x2, BoxPtr box)
 
 	box->x1 = x1;
 	box->x2 = x2;
-	thread_mono_span_add_boxes(c, box, 1);
+	thread_mono_span_add_box(c, box);
 }
 
 inline static void
@@ -717,6 +751,7 @@ mono_span_thread(void *arg)
 		if (RegionNil(&mono.clip))
 			return;
 	}
+	region_get_boxes(&mono.clip, &mono.clip_start, &mono.clip_end);
 
 	boxes.op = thread->op;
 	boxes.num_boxes = 0;
@@ -891,9 +926,12 @@ mono_trapezoids_span_converter(struct sna *sna,
 
 	if (mono.clip.data == NULL && mono.op.damage == NULL)
 		mono.span = mono_span__fast;
+	else if (mono.clip.data != NULL && mono.op.damage == NULL)
+		mono.span = mono_span__clipped;
 	else
 		mono.span = mono_span;
 
+	region_get_boxes(&mono.clip, &mono.clip_start, &mono.clip_end);
 	mono_render(&mono);
 	mono.op.done(mono.sna, &mono.op);
 	mono_fini(&mono);
@@ -939,6 +977,7 @@ mono_trapezoids_span_converter(struct sna *sna,
 					       mono.clip.extents.x2 - mono.clip.extents.x1,
 					       mono.clip.extents.y2 - mono.clip.extents.y1,
 					       COMPOSITE_PARTIAL, memset(&mono.op, 0, sizeof(mono.op)))) {
+			region_get_boxes(&mono.clip, &mono.clip_start, &mono.clip_end);
 			mono_render(&mono);
 			mono.op.done(mono.sna, &mono.op);
 		}
diff --git a/src/sna/sna_trapezoids_precise.c b/src/sna/sna_trapezoids_precise.c
index 9187ab4..f1532d3 100644
--- a/src/sna/sna_trapezoids_precise.c
+++ b/src/sna/sna_trapezoids_precise.c
@@ -1023,6 +1023,16 @@ tor_init(struct tor *converter, const BoxRec *box, int num_edges)
 static void
 tor_add_trapezoid(struct tor *tor, const xTrapezoid *t, int dx, int dy)
 {
+	if (!xTrapezoidValid(t)) {
+		__DBG(("%s: skipping invalid trapezoid: top=%d, bottom=%d, left=(%d, %d), (%d, %d), right=(%d, %d), (%d, %d)\n",
+		       __FUNCTION__,
+		       t->top, t->bottom,
+		       t->left.p1.x, t->left.p1.y,
+		       t->left.p2.x, t->left.p2.y,
+		       t->right.p1.x, t->right.p1.y,
+		       t->right.p2.x, t->right.p2.y));
+		return;
+	}
 	polygon_add_edge(tor->polygon, t, &t->left, 1, dx, dy);
 	polygon_add_edge(tor->polygon, t, &t->right, -1, dx, dy);
 }
@@ -1635,31 +1645,28 @@ struct span_thread {
 #define SPAN_THREAD_MAX_BOXES (8192/sizeof(struct sna_opacity_box))
 struct span_thread_boxes {
 	const struct sna_composite_spans_op *op;
+	const BoxRec *clip_start, *clip_end;
 	int num_boxes;
 	struct sna_opacity_box boxes[SPAN_THREAD_MAX_BOXES];
 };
 
-static void span_thread_add_boxes(struct sna *sna, void *data,
-				  const BoxRec *box, int count, float alpha)
+static void span_thread_add_box(struct sna *sna, void *data,
+				const BoxRec *box, float alpha)
 {
 	struct span_thread_boxes *b = data;
 
-	__DBG(("%s: adding %d boxes with alpha=%f\n",
-	       __FUNCTION__, count, alpha));
+	__DBG(("%s: adding box with alpha=%f\n",
+	       __FUNCTION__, alpha));
 
-	assert(count > 0 && count <= SPAN_THREAD_MAX_BOXES);
-	if (unlikely(b->num_boxes + count > SPAN_THREAD_MAX_BOXES)) {
-		DBG(("%s: flushing %d boxes, adding %d\n", __FUNCTION__, b->num_boxes, count));
-		assert(b->num_boxes <= SPAN_THREAD_MAX_BOXES);
+	if (unlikely(b->num_boxes == SPAN_THREAD_MAX_BOXES)) {
+		DBG(("%s: flushing %d boxes\n", __FUNCTION__, b->num_boxes));
 		b->op->thread_boxes(sna, b->op, b->boxes, b->num_boxes);
 		b->num_boxes = 0;
 	}
 
-	do {
-		b->boxes[b->num_boxes].box = *box++;
-		b->boxes[b->num_boxes].alpha = alpha;
-		b->num_boxes++;
-	} while (--count);
+	b->boxes[b->num_boxes].box = *box;
+	b->boxes[b->num_boxes].alpha = alpha;
+	b->num_boxes++;
 	assert(b->num_boxes <= SPAN_THREAD_MAX_BOXES);
 }
 
@@ -1670,8 +1677,22 @@ span_thread_box(struct sna *sna,
 		const BoxRec *box,
 		int coverage)
 {
+	struct span_thread_boxes *b = (struct span_thread_boxes *)op;
+
 	__DBG(("%s: %d -> %d @ %d\n", __FUNCTION__, box->x1, box->x2, coverage));
-	span_thread_add_boxes(sna, op, box, 1, AREA_TO_FLOAT(coverage));
+	if (b->num_boxes) {
+		struct sna_opacity_box *bb = &b->boxes[b->num_boxes-1];
+		if (bb->box.x1 == box->x1 &&
+		    bb->box.x2 == box->x2 &&
+		    bb->box.y2 == box->y1 &&
+		    bb->alpha == AREA_TO_FLOAT(coverage)) {
+			bb->box.y2 = box->y2;
+			__DBG(("%s: contracted double row: %d -> %d\n", __func__, bb->box.y1, bb->box.y2));
+			return;
+		}
+	}
+
+	span_thread_add_box(sna, op, box, AREA_TO_FLOAT(coverage));
 }
 
 static void
@@ -1681,20 +1702,28 @@ span_thread_clipped_box(struct sna *sna,
 			const BoxRec *box,
 			int coverage)
 {
-	pixman_region16_t region;
+	struct span_thread_boxes *b = (struct span_thread_boxes *)op;
+	const BoxRec *c;
 
 	__DBG(("%s: %d -> %d @ %f\n", __FUNCTION__, box->x1, box->x2,
 	       AREA_TO_FLOAT(coverage)));
 
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	if (region_num_rects(&region)) {
-		span_thread_add_boxes(sna, op,
-				      region_rects(&region),
-				      region_num_rects(&region),
-				      AREA_TO_FLOAT(coverage));
+	b->clip_start =
+		find_clip_box_for_y(b->clip_start, b->clip_end, box->y1);
+
+	c = b->clip_start;
+	while (c != b->clip_end) {
+		BoxRec clipped;
+
+		if (box->y2 <= c->y1)
+			break;
+
+		clipped = *box;
+		if (!box_intersect(&clipped, c++))
+			continue;
+
+		span_thread_add_box(sna, op, &clipped, AREA_TO_FLOAT(coverage));
 	}
-	pixman_region_fini(&region);
 }
 
 static span_func_t
@@ -1712,7 +1741,7 @@ thread_choose_span(struct sna_composite_spans_op *tmp,
 
 	assert(!is_mono(dst, maskFormat));
 	assert(tmp->thread_boxes);
-	DBG(("%s: clipped? %d\n", __FUNCTION__, clip->data != NULL));
+	DBG(("%s: clipped? %d x %d\n", __FUNCTION__, clip->data != NULL, region_num_rects(clip)));
 	if (clip->data)
 		span = span_thread_clipped_box;
 	else
@@ -1721,6 +1750,17 @@ thread_choose_span(struct sna_composite_spans_op *tmp,
 	return span;
 }
 
+inline static void
+span_thread_boxes_init(struct span_thread_boxes *boxes,
+		       const struct sna_composite_spans_op *op,
+		       const RegionRec *clip)
+{
+	boxes->op = op;
+	boxes->clip_start = region_rects(clip);
+	boxes->clip_end = boxes->clip_start + region_num_rects(clip);
+	boxes->num_boxes = 0;
+}
+
 static void
 span_thread(void *arg)
 {
@@ -1733,8 +1773,7 @@ span_thread(void *arg)
 	if (!tor_init(&tor, &thread->extents, 2*thread->ntrap))
 		return;
 
-	boxes.op = thread->op;
-	boxes.num_boxes = 0;
+	span_thread_boxes_init(&boxes, thread->op, thread->clip);
 
 	y1 = thread->extents.y1 - thread->draw_y;
 	y2 = thread->extents.y2 - thread->draw_y;
@@ -2183,6 +2222,52 @@ static force_inline uint8_t coverage_opacity(int coverage, uint8_t opacity)
 	return opacity == 255 ? coverage : mul_8_8(coverage, opacity);
 }
 
+struct clipped_span {
+	span_func_t span;
+	const BoxRec *clip_start, *clip_end;
+};
+
+static void
+tor_blt_clipped(struct sna *sna,
+		struct sna_composite_spans_op *op,
+		pixman_region16_t *clip,
+		const BoxRec *box,
+		int coverage)
+{
+	struct clipped_span *cs = (struct clipped_span *)clip;
+	const BoxRec *c;
+
+	cs->clip_start =
+		find_clip_box_for_y(cs->clip_start, cs->clip_end, box->y1);
+
+	c = cs->clip_start;
+	while (c != cs->clip_end) {
+		BoxRec clipped;
+
+		if (box->y2 <= c->y1)
+			break;
+
+		clipped = *box;
+		if (!box_intersect(&clipped, c++))
+			continue;
+
+		cs->span(sna, op, NULL, &clipped, coverage);
+	}
+}
+
+inline static span_func_t
+clipped_span(struct clipped_span *cs,
+	     span_func_t span,
+	     const RegionRec *clip)
+{
+	if (clip->data) {
+		cs->span = span;
+		region_get_boxes(clip, &cs->clip_start, &cs->clip_end);
+		span = tor_blt_clipped;
+	}
+	return span;
+}
+
 static void _tor_blt_src(struct inplace *in, const BoxRec *box, uint8_t v)
 {
 	uint8_t *ptr = in->ptr;
@@ -2218,25 +2303,6 @@ tor_blt_src(struct sna *sna,
 }
 
 static void
-tor_blt_src_clipped(struct sna *sna,
-		    struct sna_composite_spans_op *op,
-		    pixman_region16_t *clip,
-		    const BoxRec *box,
-		    int coverage)
-{
-	pixman_region16_t region;
-	int n;
-
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	n = region_num_rects(&region);
-	box = region_rects(&region);
-	while (n--)
-		tor_blt_src(sna, op, NULL, box++, coverage);
-	pixman_region_fini(&region);
-}
-
-static void
 tor_blt_in(struct sna *sna,
 	   struct sna_composite_spans_op *op,
 	   pixman_region16_t *clip,
@@ -2268,25 +2334,6 @@ tor_blt_in(struct sna *sna,
 }
 
 static void
-tor_blt_in_clipped(struct sna *sna,
-		   struct sna_composite_spans_op *op,
-		   pixman_region16_t *clip,
-		   const BoxRec *box,
-		   int coverage)
-{
-	pixman_region16_t region;
-	int n;
-
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	n = region_num_rects(&region);
-	box = region_rects(&region);
-	while (n--)
-		tor_blt_in(sna, op, NULL, box++, coverage);
-	pixman_region_fini(&region);
-}
-
-static void
 tor_blt_add(struct sna *sna,
 	    struct sna_composite_spans_op *op,
 	    pixman_region16_t *clip,
@@ -2325,25 +2372,6 @@ tor_blt_add(struct sna *sna,
 }
 
 static void
-tor_blt_add_clipped(struct sna *sna,
-		    struct sna_composite_spans_op *op,
-		    pixman_region16_t *clip,
-		    const BoxRec *box,
-		    int coverage)
-{
-	pixman_region16_t region;
-	int n;
-
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	n = region_num_rects(&region);
-	box = region_rects(&region);
-	while (n--)
-		tor_blt_add(sna, op, NULL, box++, coverage);
-	pixman_region_fini(&region);
-}
-
-static void
 tor_blt_lerp32(struct sna *sna,
 	       struct sna_composite_spans_op *op,
 	       pixman_region16_t *clip,
@@ -2396,25 +2424,6 @@ tor_blt_lerp32(struct sna *sna,
 	}
 }
 
-static void
-tor_blt_lerp32_clipped(struct sna *sna,
-		       struct sna_composite_spans_op *op,
-		       pixman_region16_t *clip,
-		       const BoxRec *box,
-		       int coverage)
-{
-	pixman_region16_t region;
-	int n;
-
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	n = region_num_rects(&region);
-	box = region_rects(&region);
-	while (n--)
-		tor_blt_lerp32(sna, op, NULL, box++, coverage);
-	pixman_region_fini(&region);
-}
-
 struct pixman_inplace {
 	pixman_image_t *image, *source, *mask;
 	uint32_t color;
@@ -2442,24 +2451,6 @@ pixmask_span_solid(struct sna *sna,
 			       pi->dx + box->x1, pi->dy + box->y1,
 			       box->x2 - box->x1, box->y2 - box->y1);
 }
-static void
-pixmask_span_solid__clipped(struct sna *sna,
-			    struct sna_composite_spans_op *op,
-			    pixman_region16_t *clip,
-			    const BoxRec *box,
-			    int coverage)
-{
-	pixman_region16_t region;
-	int n;
-
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	n = region_num_rects(&region);
-	box = region_rects(&region);
-	while (n--)
-		pixmask_span_solid(sna, op, NULL, box++, coverage);
-	pixman_region_fini(&region);
-}
 
 static void
 pixmask_span(struct sna *sna,
@@ -2480,24 +2471,6 @@ pixmask_span(struct sna *sna,
 			       pi->dx + box->x1, pi->dy + box->y1,
 			       box->x2 - box->x1, box->y2 - box->y1);
 }
-static void
-pixmask_span__clipped(struct sna *sna,
-		      struct sna_composite_spans_op *op,
-		      pixman_region16_t *clip,
-		      const BoxRec *box,
-		      int coverage)
-{
-	pixman_region16_t region;
-	int n;
-
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	n = region_num_rects(&region);
-	box = region_rects(&region);
-	while (n--)
-		pixmask_span(sna, op, NULL, box++, coverage);
-	pixman_region_fini(&region);
-}
 
 struct inplace_x8r8g8b8_thread {
 	xTrapezoid *traps;
@@ -2516,6 +2489,7 @@ static void inplace_x8r8g8b8_thread(void *arg)
 	struct inplace_x8r8g8b8_thread *thread = arg;
 	struct tor tor;
 	span_func_t span;
+	struct clipped_span clipped;
 	RegionPtr clip;
 	int y1, y2, n;
 
@@ -2546,12 +2520,11 @@ static void inplace_x8r8g8b8_thread(void *arg)
 		inplace.stride = pixmap->devKind;
 		inplace.color = thread->color;
 
-		if (clip->data)
-			span = tor_blt_lerp32_clipped;
-		else
-			span = tor_blt_lerp32;
+		span = clipped_span(&clipped, tor_blt_lerp32, clip);
 
-		tor_render(NULL, &tor, (void*)&inplace, clip, span, false);
+		tor_render(NULL, &tor,
+			   (void*)&inplace, (void *)&clipped,
+			   span, false);
 	} else if (thread->is_solid) {
 		struct pixman_inplace pi;
 
@@ -2564,10 +2537,7 @@ static void inplace_x8r8g8b8_thread(void *arg)
 						     1, 1, pi.bits, 0);
 		pixman_image_set_repeat(pi.source, PIXMAN_REPEAT_NORMAL);
 
-		if (clip->data)
-			span = pixmask_span_solid__clipped;
-		else
-			span = pixmask_span_solid;
+		span = clipped_span(&clipped, pixmask_span_solid, clip);
 
-		tor_render(NULL, &tor, (void*)&pi, clip, span, false);
+		tor_render(NULL, &tor, (void*)&pi, (void *)&clipped, span, false);
 
@@ -2588,12 +2558,11 @@ static void inplace_x8r8g8b8_thread(void *arg)
 		pi.bits = pixman_image_get_data(pi.mask);
 		pi.op = thread->op;
 
-		if (clip->data)
-			span = pixmask_span__clipped;
-		else
-			span = pixmask_span;
+		span = clipped_span(&clipped, pixmask_span, clip);
 
-		tor_render(NULL, &tor, (void*)&pi, clip, span, false);
+		tor_render(NULL, &tor,
+			   (void*)&pi, (void *)&clipped,
+			   span, false);
 
 		pixman_image_unref(pi.mask);
 		pixman_image_unref(pi.source);
@@ -2712,6 +2681,7 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
 	if (num_threads == 1) {
 		struct tor tor;
 		span_func_t span;
+		struct clipped_span clipped;
 
 		if (!tor_init(&tor, &region.extents, 2*ntrap))
 			return true;
@@ -2737,17 +2707,14 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
 			inplace.stride = pixmap->devKind;
 			inplace.color = color;
 
-			if (dst->pCompositeClip->data)
-				span = tor_blt_lerp32_clipped;
-			else
-				span = tor_blt_lerp32;
-
+			span = clipped_span(&clipped, tor_blt_lerp32, dst->pCompositeClip);
 			DBG(("%s: render inplace op=%d, color=%08x\n",
 			     __FUNCTION__, op, color));
 
 			if (sigtrap_get() == 0) {
-				tor_render(NULL, &tor, (void*)&inplace,
-					   dst->pCompositeClip, span, false);
+				tor_render(NULL, &tor,
+					   (void*)&inplace, (void*)&clipped,
+					   span, false);
 				sigtrap_put();
 			}
 		} else if (is_solid) {
@@ -2762,15 +2729,11 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
 							     1, 1, pi.bits, 0);
 			pixman_image_set_repeat(pi.source, PIXMAN_REPEAT_NORMAL);
 
-			if (dst->pCompositeClip->data)
-				span = pixmask_span_solid__clipped;
-			else
-				span = pixmask_span_solid;
-
+			span = clipped_span(&clipped, pixmask_span_solid, dst->pCompositeClip);
 			if (sigtrap_get() == 0) {
-				tor_render(NULL, &tor, (void*)&pi,
-					   dst->pCompositeClip, span,
-					   false);
+				tor_render(NULL, &tor,
+					   (void*)&pi, (void*)&clipped,
+					   span, false);
 				sigtrap_put();
 			}
 
@@ -2791,15 +2754,11 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
 			pi.bits = pixman_image_get_data(pi.mask);
 			pi.op = op;
 
-			if (dst->pCompositeClip->data)
-				span = pixmask_span__clipped;
-			else
-				span = pixmask_span;
-
+			span = clipped_span(&clipped, pixmask_span, dst->pCompositeClip);
 			if (sigtrap_get() == 0) {
-				tor_render(NULL, &tor, (void*)&pi,
-					   dst->pCompositeClip, span,
-					   false);
+				tor_render(NULL, &tor,
+					   (void*)&pi, (void *)&clipped,
+					   span, false);
 				sigtrap_put();
 			}
 
@@ -2861,9 +2820,9 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
 
 struct inplace_thread {
 	xTrapezoid *traps;
-	RegionPtr clip;
 	span_func_t span;
 	struct inplace inplace;
+	struct clipped_span clipped;
 	BoxRec extents;
 	int dx, dy;
 	int draw_x, draw_y;
@@ -2888,8 +2847,9 @@ static void inplace_thread(void *arg)
 		tor_add_trapezoid(&tor, &thread->traps[n], thread->dx, thread->dy);
 	}
 
-	tor_render(NULL, &tor, (void*)&thread->inplace,
-		   thread->clip, thread->span, thread->unbounded);
+	tor_render(NULL, &tor,
+		   (void*)&thread->inplace, (void*)&thread->clipped,
+		   thread->span, thread->unbounded);
 
 	tor_fini(&tor);
 }
@@ -2903,6 +2863,7 @@ precise_trapezoid_span_inplace(struct sna *sna,
 			       bool fallback)
 {
 	struct inplace inplace;
+	struct clipped_span clipped;
 	span_func_t span;
 	PixmapPtr pixmap;
 	struct sna_pixmap *priv;
@@ -3020,21 +2981,12 @@ precise_trapezoid_span_inplace(struct sna *sna,
 	     dst->pCompositeClip->data != NULL));
 
 	if (op == PictOpSrc) {
-		if (dst->pCompositeClip->data)
-			span = tor_blt_src_clipped;
-		else
-			span = tor_blt_src;
+		span = tor_blt_src;
 	} else if (op == PictOpIn) {
-		if (dst->pCompositeClip->data)
-			span = tor_blt_in_clipped;
-		else
-			span = tor_blt_in;
+		span = tor_blt_in;
 	} else {
 		assert(op == PictOpAdd);
-		if (dst->pCompositeClip->data)
-			span = tor_blt_add_clipped;
-		else
-			span = tor_blt_add;
+		span = tor_blt_add;
 	}
 
 	DBG(("%s: move-to-cpu(dst)\n", __FUNCTION__));
@@ -3052,6 +3004,8 @@ precise_trapezoid_span_inplace(struct sna *sna,
 	inplace.stride = pixmap->devKind;
 	inplace.opacity = color >> 24;
 
+	span = clipped_span(&clipped, span, dst->pCompositeClip);
+
 	num_threads = 1;
 	if (!NO_GPU_THREADS &&
 	    (flags & COMPOSITE_SPANS_RECTILINEAR) == 0)
@@ -3074,8 +3028,9 @@ precise_trapezoid_span_inplace(struct sna *sna,
 		}
 
 		if (sigtrap_get() == 0) {
-			tor_render(NULL, &tor, (void*)&inplace,
-				   dst->pCompositeClip, span, unbounded);
+			tor_render(NULL, &tor,
+				   (void*)&inplace, (void *)&clipped,
+				   span, unbounded);
 			sigtrap_put();
 		}
 
@@ -3093,7 +3048,7 @@ precise_trapezoid_span_inplace(struct sna *sna,
 		threads[0].ntrap = ntrap;
 		threads[0].inplace = inplace;
 		threads[0].extents = region.extents;
-		threads[0].clip = dst->pCompositeClip;
+		threads[0].clipped = clipped;
 		threads[0].span = span;
 		threads[0].unbounded = unbounded;
 		threads[0].dx = dx;
@@ -3316,8 +3271,7 @@ tristrip_thread(void *arg)
 	if (!tor_init(&tor, &thread->extents, 2*thread->count))
 		return;
 
-	boxes.op = thread->op;
-	boxes.num_boxes = 0;
+	span_thread_boxes_init(&boxes, thread->op, thread->clip);
 
 	cw = 0; ccw = 1;
 	polygon_add_line(tor.polygon,
diff --git a/src/sna/sna_video.c b/src/sna/sna_video.c
index ed0e7b3..e2b11c3 100644
--- a/src/sna/sna_video.c
+++ b/src/sna/sna_video.c
@@ -591,6 +591,72 @@ use_gtt: /* copy data, must use GTT so that we keep the overlay uncached */
 	return true;
 }
 
+void sna_video_fill_colorkey(struct sna_video *video,
+			     const RegionRec *clip)
+{
+	struct sna *sna = video->sna;
+	PixmapPtr front = sna->front;
+	struct kgem_bo *bo = __sna_pixmap_get_bo(front);
+	uint8_t *dst, *tmp;
+	int w, width;
+
+	if (video->AlwaysOnTop || RegionEqual(&video->clip, (RegionPtr)clip))
+		return;
+
+	assert(bo);
+	if (!wedged(sna) &&
+	    sna_blt_fill_boxes(sna, GXcopy, bo,
+			       front->drawable.bitsPerPixel,
+			       video->color_key,
+			       region_rects(clip),
+			       region_num_rects(clip))) {
+		RegionCopy(&video->clip, (RegionPtr)clip);
+		return;
+	}
+
+	dst = kgem_bo_map__gtt(&sna->kgem, bo);
+	if (dst == NULL)
+		return;
+
+	w = front->drawable.bitsPerPixel/8;
+	width = (clip->extents.x2 - clip->extents.x1) * w;
+	tmp = malloc(width);
+	if (tmp == NULL)
+		return;
+
+	memcpy(tmp, &video->color_key, w);
+	while (2 * w < width) {
+		memcpy(tmp + w, tmp, w);
+		w *= 2;
+	}
+	if (w < width)
+		memcpy(tmp + w, tmp, width - w);
+
+	if (sigtrap_get() == 0) {
+		const BoxRec *box = region_rects(clip);
+		int n = region_num_rects(clip);
+
+		w = front->drawable.bitsPerPixel/8;
+		do {
+			int y = box->y1;
+			uint8_t *row = dst + y*bo->pitch + w*box->x1;
+
+			width = (box->x2 - box->x1) * w;
+			while (y < box->y2) {
+				memcpy(row, tmp, width);
+				row += bo->pitch;
+				y++;
+			}
+			box++;
+		} while (--n);
+		sigtrap_put();
+
+		RegionCopy(&video->clip, (RegionPtr)clip);
+	}
+
+	free(tmp);
+}
+
 XvAdaptorPtr sna_xv_adaptor_alloc(struct sna *sna)
 {
 	XvAdaptorPtr new_adaptors;
diff --git a/src/sna/sna_video.h b/src/sna/sna_video.h
index f21605f..dfb8c0c 100644
--- a/src/sna/sna_video.h
+++ b/src/sna/sna_video.h
@@ -193,6 +193,9 @@ bool
 sna_video_copy_data(struct sna_video *video,
 		    struct sna_video_frame *frame,
 		    const uint8_t *buf);
+void
+sna_video_fill_colorkey(struct sna_video *video,
+			const RegionRec *clip);
 
 void sna_video_buffer_fini(struct sna_video *video);
 
diff --git a/src/sna/sna_video_overlay.c b/src/sna/sna_video_overlay.c
index ac81f1a..d782113 100644
--- a/src/sna/sna_video_overlay.c
+++ b/src/sna/sna_video_overlay.c
@@ -130,7 +130,7 @@ static int sna_video_overlay_stop(ddStopVideo_ARGS)
 
 	DBG(("%s()\n", __FUNCTION__));
 
-	REGION_EMPTY(scrn->pScreen, &video->clip);
+	REGION_EMPTY(to_screen_from_sna(sna), &video->clip);
 
 	request.flags = 0;
 	(void)drmIoctl(sna->kgem.fd,
@@ -551,15 +551,7 @@ sna_video_overlay_put_image(ddPutImage_ARGS)
 	ret = Success;
 	if (sna_video_overlay_show
 	    (sna, video, &frame, crtc, &dstBox, src_w, src_h, drw_w, drw_h)) {
-		//xf86XVFillKeyHelperDrawable(draw, video->color_key, &clip);
-		if (!video->AlwaysOnTop && !RegionEqual(&video->clip, &clip) &&
-		    sna_blt_fill_boxes(sna, GXcopy,
-				       __sna_pixmap_get_bo(sna->front),
-				       sna->front->drawable.bitsPerPixel,
-				       video->color_key,
-				       region_rects(&clip),
-				       region_num_rects(&clip)))
-			RegionCopy(&video->clip, &clip);
+		sna_video_fill_colorkey(video, &clip);
 		sna_window_set_port((WindowPtr)draw, port);
 	} else {
 		DBG(("%s: failed to show video frame\n", __FUNCTION__));
diff --git a/src/sna/sna_video_sprite.c b/src/sna/sna_video_sprite.c
index 92230f9..9ce9879 100644
--- a/src/sna/sna_video_sprite.c
+++ b/src/sna/sna_video_sprite.c
@@ -527,14 +527,7 @@ off:
 			goto err;
 	}
 
-	if (!video->AlwaysOnTop && !RegionEqual(&video->clip, &clip) &&
-	    sna_blt_fill_boxes(sna, GXcopy,
-			       __sna_pixmap_get_bo(sna->front),
-			       sna->front->drawable.bitsPerPixel,
-			       video->color_key,
-			       region_rects(&clip),
-			       region_num_rects(&clip)))
-		RegionCopy(&video->clip, &clip);
+	sna_video_fill_colorkey(video, &clip);
 	sna_window_set_port((WindowPtr)draw, port);
 
 	return Success;
diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
index 9501193..cea8887 100644
--- a/src/sna/sna_video_textured.c
+++ b/src/sna/sna_video_textured.c
@@ -152,6 +152,9 @@ sna_video_textured_put_image(ddPutImage_ARGS)
 	bool flush = false;
 	bool ret;
 
+	if (wedged(sna))
+		return BadAlloc;
+
 	clip.extents.x1 = draw->x + drw_x;
 	clip.extents.y1 = draw->y + drw_y;
 	clip.extents.x2 = clip.extents.x1 + drw_w;
@@ -316,7 +319,7 @@ void sna_video_textured_setup(struct sna *sna, ScreenPtr screen)
 
 	if (!sna->render.video) {
 		xf86DrvMsg(sna->scrn->scrnIndex, X_INFO,
-			   "Textured video not supported on this hardware\n");
+			   "Textured video not supported on this hardware or backend\n");
 		return;
 	}
 
diff --git a/src/sna/xassert.h b/src/sna/xassert.h
index 1bcfd08..bac70b8 100644
--- a/src/sna/xassert.h
+++ b/src/sna/xassert.h
@@ -43,6 +43,17 @@
 	xorg_backtrace(); \
 	FatalError("%s:%d assertion '%s' failed\n", __func__, __LINE__, #E); \
 } while (0)
+
+#define warn_unless(E) do if (unlikely(!(E))) { \
+	static int __warn_once__; \
+	if (!__warn_once__) { \
+		xorg_backtrace(); \
+		ErrorF("%s:%d assertion '%s' failed\n", __func__, __LINE__, #E); \
+		__warn_once__ = 1; \
+	} \
+} while (0)
+#else
+#define warn_unless(E)
 #endif
 
 #endif /* __XASSERT_H__ */
diff --git a/src/uxa/i965_video.c b/src/uxa/i965_video.c
index 68e6fd3..438ab90 100644
--- a/src/uxa/i965_video.c
+++ b/src/uxa/i965_video.c
@@ -37,7 +37,6 @@
 #include "fourcc.h"
 
 #include "intel.h"
-#include "intel_xvmc.h"
 #include "intel_uxa.h"
 #include "i830_reg.h"
 #include "i965_reg.h"
diff --git a/src/uxa/intel.h b/src/uxa/intel.h
index 1b7e533..37b23e9 100644
--- a/src/uxa/intel.h
+++ b/src/uxa/intel.h
@@ -121,7 +121,6 @@ typedef struct intel_screen_private {
 
 	void *modes;
 	drm_intel_bo *front_buffer, *back_buffer;
-	unsigned int back_name;
 	long front_pitch, front_tiling;
 
 	dri_bufmgr *bufmgr;
@@ -285,8 +284,6 @@ typedef struct intel_screen_private {
 	Bool has_kernel_flush;
 	Bool needs_flush;
 
-	struct _DRI2FrameEvent *pending_flip[MAX_PIPES];
-
 	/* Broken-out options. */
 	OptionInfoPtr Options;
 
@@ -368,6 +365,7 @@ typedef void (*intel_drm_abort_proc)(ScrnInfoPtr scrn,
 
 extern uint32_t intel_drm_queue_alloc(ScrnInfoPtr scrn, xf86CrtcPtr crtc, void *data, intel_drm_handler_proc handler, intel_drm_abort_proc abort);
 extern void intel_drm_abort(ScrnInfoPtr scrn, Bool (*match)(void *data, void *match_data), void *match_data);
+extern void intel_drm_abort_seq(ScrnInfoPtr scrn, uint32_t seq);
 
 extern int intel_get_pipe_from_crtc_id(drm_intel_bufmgr *bufmgr, xf86CrtcPtr crtc);
 extern int intel_crtc_id(xf86CrtcPtr crtc);
@@ -408,7 +406,6 @@ typedef struct _DRI2FrameEvent {
 	ClientPtr client;
 	enum DRI2FrameEventType type;
 	int frame;
-	int pipe;
 
 	struct list drawable_resource, client_resource;
 
@@ -418,7 +415,12 @@ typedef struct _DRI2FrameEvent {
 	DRI2BufferPtr front;
 	DRI2BufferPtr back;
 
-	struct _DRI2FrameEvent *chain;
+	/* current scanout for triple buffer */
+	int old_width;
+	int old_height;
+	int old_pitch;
+	int old_tiling;
+	dri_bo *old_buffer;
 } DRI2FrameEventRec, *DRI2FrameEventPtr;
 
 extern Bool intel_do_pageflip(intel_screen_private *intel,
@@ -456,10 +458,6 @@ extern xf86CrtcPtr intel_covering_crtc(ScrnInfoPtr scrn, BoxPtr box,
 
 Bool I830DRI2ScreenInit(ScreenPtr pScreen);
 void I830DRI2CloseScreen(ScreenPtr pScreen);
-void I830DRI2FrameEventHandler(unsigned int frame, unsigned int tv_sec,
-			       unsigned int tv_usec, DRI2FrameEventPtr flip_info);
-void I830DRI2FlipEventHandler(unsigned int frame, unsigned int tv_sec,
-			      unsigned int tv_usec, DRI2FrameEventPtr flip_info);
 
 /* intel_dri3.c */
 Bool intel_dri3_screen_init(ScreenPtr screen);
diff --git a/src/uxa/intel_display.c b/src/uxa/intel_display.c
index 7b4d4e0..544382a 100644
--- a/src/uxa/intel_display.c
+++ b/src/uxa/intel_display.c
@@ -89,9 +89,11 @@ struct intel_mode {
 	struct list outputs;
 	struct list crtcs;
 
-	void *pageflip_data;
-	intel_pageflip_handler_proc pageflip_handler;
-	intel_pageflip_abort_proc pageflip_abort;
+	struct {
+		intel_pageflip_handler_proc handler;
+		intel_pageflip_abort_proc abort;
+		void *data;
+	} pageflip;
 
 	Bool delete_dp_12_displays;
 };
@@ -114,7 +116,6 @@ struct intel_crtc {
 	struct list link;
 	PixmapPtr scanout_pixmap;
 	uint32_t scanout_fb_id;
-	int32_t vblank_offset;
 	uint32_t msc_prev;
 	uint64_t msc_high;
 };
@@ -1492,6 +1493,7 @@ intel_output_init(ScrnInfoPtr scrn, struct intel_mode *mode, drmModeResPtr mode_
 			intel_output = output->driver_private;
 			intel_output->output_id = mode_res->connectors[num];
 			intel_output->mode_output = koutput;
+			RROutputChanged(output->randr_output, TRUE);
 			return;
 		}
 	}
@@ -1650,9 +1652,6 @@ intel_pageflip_abort(ScrnInfoPtr scrn, xf86CrtcPtr crtc, void *data);
 static void
 intel_pageflip_complete(struct intel_mode *mode);
 
-static void
-intel_drm_abort_seq (ScrnInfoPtr scrn, uint32_t seq);
-
 Bool
 intel_do_pageflip(intel_screen_private *intel,
 		  dri_bo *new_front,
@@ -1671,23 +1670,30 @@ intel_do_pageflip(intel_screen_private *intel,
 	uint32_t new_fb_id;
 	uint32_t flags;
 	uint32_t seq;
+	int err = 0;
 	int i;
 
 	/*
+	 * We only have a single length queue in the kernel, so any
+	 * attempt to schedule a second flip before processing the first
+	 * is a bug. Punt it back to the caller.
+	 */
+	if (mode->flip_count)
+		return FALSE;
+
+	/*
 	 * Create a new handle for the back buffer
 	 */
 	if (drmModeAddFB(mode->fd, scrn->virtualX, scrn->virtualY,
 			 scrn->depth, scrn->bitsPerPixel, pitch,
-			 new_front->handle, &new_fb_id))
+			 new_front->handle, &new_fb_id)) {
+		err = errno;
 		goto error_out;
+	}
 
 	drm_intel_bo_disable_reuse(new_front);
         intel_flush(intel);
 
-	mode->pageflip_data = pageflip_data;
-	mode->pageflip_handler = pageflip_handler;
-	mode->pageflip_abort = pageflip_abort;
-
 	/*
 	 * Queue flips on all enabled CRTCs
 	 * Note that if/when we get per-CRTC buffers, we'll have to update this.
@@ -1699,6 +1705,7 @@ intel_do_pageflip(intel_screen_private *intel,
 	 */
 	mode->fe_msc = 0;
 	mode->fe_usec = 0;
+	memset(&mode->pageflip, 0, sizeof(mode->pageflip));
 
 	flags = DRM_MODE_PAGE_FLIP_EVENT;
 	if (async)
@@ -1711,8 +1718,7 @@ intel_do_pageflip(intel_screen_private *intel,
 
 		flip = calloc(1, sizeof(struct intel_pageflip));
 		if (flip == NULL) {
-			xf86DrvMsg(scrn->scrnIndex, X_WARNING,
-				   "flip queue: carrier alloc failed.\n");
+			err = errno;
 			goto error_undo;
 		}
 
@@ -1724,33 +1730,30 @@ intel_do_pageflip(intel_screen_private *intel,
 
 		seq = intel_drm_queue_alloc(scrn, config->crtc[i], flip, intel_pageflip_handler, intel_pageflip_abort);
 		if (!seq) {
+			err = errno;
 			free(flip);
 			goto error_undo;
 		}
 
-again:
+		mode->flip_count++;
+
 		if (drmModePageFlip(mode->fd,
 				    crtc_id(crtc),
 				    new_fb_id,
 				    flags, (void *)(uintptr_t)seq)) {
-			if (intel_mode_read_drm_events(intel)) {
-				xf86DrvMsg(scrn->scrnIndex, X_WARNING,
-					   "flip queue retry\n");
-				goto again;
-			}
-			xf86DrvMsg(scrn->scrnIndex, X_WARNING,
-				   "flip queue failed: %s\n", strerror(errno));
-			if (seq)
-				intel_drm_abort_seq(scrn, seq);
-			free(flip);
+			err = errno;
+			intel_drm_abort_seq(scrn, seq);
 			goto error_undo;
 		}
-		mode->flip_count++;
 	}
 
 	mode->old_fb_id = mode->fb_id;
 	mode->fb_id = new_fb_id;
 
+	mode->pageflip.data = pageflip_data;
+	mode->pageflip.handler = pageflip_handler;
+	mode->pageflip.abort = pageflip_abort;
+
 	if (!mode->flip_count)
 		intel_pageflip_complete(mode);
 
@@ -1765,7 +1768,7 @@ error_undo:
 
 error_out:
 	xf86DrvMsg(scrn->scrnIndex, X_WARNING, "Page flip failed: %s\n",
-		   strerror(errno));
+		   strerror(err));
 
 	mode->flip_count = 0;
 	return FALSE;
@@ -1839,7 +1842,7 @@ intel_drm_abort(ScrnInfoPtr scrn, Bool (*match)(void *data, void *match_data), v
 /*
  * Abort by drm queue sequence number
  */
-static void
+void
 intel_drm_abort_seq(ScrnInfoPtr scrn, uint32_t seq)
 {
 	struct intel_drm_queue *q;
@@ -1911,7 +1914,6 @@ intel_sequence_to_crtc_msc(xf86CrtcPtr crtc, uint32_t sequence)
 {
 	struct intel_crtc *intel_crtc = crtc->driver_private;
 
-        sequence += intel_crtc->vblank_offset;
         if ((int32_t) (sequence - intel_crtc->msc_prev) < -0x40000000)
                 intel_crtc->msc_high += 0x100000000L;
         intel_crtc->msc_prev = sequence;
@@ -1935,37 +1937,10 @@ intel_get_crtc_msc_ust(ScrnInfoPtr scrn, xf86CrtcPtr crtc, uint64_t *msc, uint64
         return 0;
 }
 
-/*
- * Convert a 64-bit adjusted MSC value into a 32-bit kernel sequence number,
- * removing the high 32 bits and subtracting out the vblank_offset term.
- *
- * This also updates the vblank_offset when it notices that the value should
- * change.
- */
-
-#define MAX_VBLANK_OFFSET       1000
-
 uint32_t
 intel_crtc_msc_to_sequence(ScrnInfoPtr scrn, xf86CrtcPtr crtc, uint64_t expect)
 {
-	struct intel_crtc *intel_crtc = crtc->driver_private;
-        uint64_t msc, ust;
-
-	if (intel_get_crtc_msc_ust(scrn, crtc, &msc, &ust) == 0) {
-		int64_t diff = expect - msc;
-
-		/* We're way off here, assume that the kernel has lost its mind
-		 * and smack the vblank back to something sensible
-		 */
-		if (diff < -MAX_VBLANK_OFFSET || diff > MAX_VBLANK_OFFSET) {
-			intel_crtc->vblank_offset += (int32_t) diff;
-			if (intel_crtc->vblank_offset > -MAX_VBLANK_OFFSET &&
-			    intel_crtc->vblank_offset < MAX_VBLANK_OFFSET)
-				intel_crtc->vblank_offset = 0;
-		}
-	}
-
-        return (uint32_t) (expect - intel_crtc->vblank_offset);
+        return (uint32_t)expect;
 }
 
 /*
@@ -1998,14 +1973,13 @@ intel_drm_handler(int fd, uint32_t frame, uint32_t sec, uint32_t usec, void *use
 static void
 intel_pageflip_complete(struct intel_mode *mode)
 {
-	/* Release framebuffer */
-	drmModeRmFB(mode->fd, mode->old_fb_id);
-
-	if (!mode->pageflip_handler)
+	if (!mode->pageflip.handler)
 		return;
 
-	mode->pageflip_handler(mode->fe_msc, mode->fe_usec,
-			       mode->pageflip_data);
+	/* Release framebuffer */
+	drmModeRmFB(mode->fd, mode->old_fb_id);
+	mode->pageflip.handler(mode->fe_msc, mode->fe_usec,
+			       mode->pageflip.data);
 }
 
 /*
@@ -2045,6 +2019,7 @@ intel_pageflip_handler(ScrnInfoPtr scrn, xf86CrtcPtr crtc,
 
 	if (!mode)
 		return;
+
 	intel_pageflip_complete(mode);
 }
 
@@ -2060,13 +2035,12 @@ intel_pageflip_abort(ScrnInfoPtr scrn, xf86CrtcPtr crtc, void *data)
 	if (!mode)
 		return;
 
-	/* Release framebuffer */
-	drmModeRmFB(mode->fd, mode->old_fb_id);
-
-	if (!mode->pageflip_abort)
+	if (!mode->pageflip.abort)
 		return;
 
-	mode->pageflip_abort(mode->pageflip_data);
+	/* Release framebuffer */
+	drmModeRmFB(mode->fd, mode->old_fb_id);
+	mode->pageflip.abort(mode->pageflip.data);
 }
 
 /*
@@ -2522,6 +2496,7 @@ restart_destroy:
 		drmModeFreeConnector(intel_output->mode_output);
 		intel_output->mode_output = NULL;
 		intel_output->output_id = -1;
+		RROutputChanged(output->randr_output, TRUE);
 
 		changed = TRUE;
 		if (mode->delete_dp_12_displays) {
@@ -2552,10 +2527,8 @@ restart_destroy:
 		intel_output_init(scrn, intel->modes, mode_res, i, 1);
 	}
 
-	if (changed) {
-		RRSetChanged(xf86ScrnToScreen(scrn));
+	if (changed)
 		RRTellChanged(xf86ScrnToScreen(scrn));
-	}
 
 	drmModeFreeResources(mode_res);
 out:
diff --git a/src/uxa/intel_dri.c b/src/uxa/intel_dri.c
index f61c621..524826d 100644
--- a/src/uxa/intel_dri.c
+++ b/src/uxa/intel_dri.c
@@ -81,6 +81,47 @@ static DevPrivateKeyRec i830_client_key;
 static int i830_client_key;
 #endif
 
+static void I830DRI2FlipEventHandler(unsigned int frame,
+				     unsigned int tv_sec,
+				     unsigned int tv_usec,
+				     DRI2FrameEventPtr flip_info);
+
+static void I830DRI2FrameEventHandler(unsigned int frame,
+				      unsigned int tv_sec,
+				      unsigned int tv_usec,
+				      DRI2FrameEventPtr swap_info);
+
+static void
+i830_dri2_del_frame_event(DRI2FrameEventPtr info);
+
+static uint32_t pipe_select(int pipe)
+{
+	if (pipe > 1)
+		return pipe << DRM_VBLANK_HIGH_CRTC_SHIFT;
+	else if (pipe > 0)
+		return DRM_VBLANK_SECONDARY;
+	else
+		return 0;
+}
+
+static void
+intel_dri2_vblank_handler(ScrnInfoPtr scrn,
+                          xf86CrtcPtr crtc,
+                          uint64_t msc,
+                          uint64_t usec,
+                          void *data)
+{
+        I830DRI2FrameEventHandler((uint32_t) msc, usec / 1000000, usec % 1000000, data);
+}
+
+static void
+intel_dri2_vblank_abort(ScrnInfoPtr scrn,
+                        xf86CrtcPtr crtc,
+                        void *data)
+{
+        i830_dri2_del_frame_event(data);
+}
+
 static uint32_t pixmap_flink(PixmapPtr pixmap)
 {
 	struct intel_uxa_pixmap *priv = intel_uxa_get_pixmap_private(pixmap);
@@ -135,9 +176,6 @@ I830DRI2CreateBuffers(DrawablePtr drawable, unsigned int *attachments,
 		pixmap = NULL;
 		if (attachments[i] == DRI2BufferFrontLeft) {
 			pixmap = get_front_buffer(drawable);
-
-			if (pixmap == NULL)
-				drawable = &(get_drawable_pixmap(drawable)->drawable);
 		} else if (attachments[i] == DRI2BufferStencil && pDepthPixmap) {
 			pixmap = pDepthPixmap;
 			pixmap->refcnt++;
@@ -246,11 +284,8 @@ I830DRI2CreateBuffer(DrawablePtr drawable, unsigned int attachment,
 	}
 
 	pixmap = NULL;
-	if (attachment == DRI2BufferFrontLeft) {
+	if (attachment == DRI2BufferFrontLeft)
 		pixmap = get_front_buffer(drawable);
-		if (pixmap == NULL)
-			drawable = &(get_drawable_pixmap(drawable)->drawable);
-	}
 
 	if (pixmap == NULL) {
 		unsigned int hint = INTEL_CREATE_PIXMAP_DRI2;
@@ -673,6 +708,20 @@ i830_dri2_del_frame_event(DRI2FrameEventPtr info)
 	if (info->back)
 		I830DRI2DestroyBuffer(NULL, info->back);
 
+	if (info->old_buffer) {
+		/* Check that the old buffer still matches the front buffer
+		 * in case a mode change occurred before we woke up.
+		 */
+		if (info->intel->back_buffer == NULL &&
+		    info->old_width  == info->intel->scrn->virtualX &&
+		    info->old_height == info->intel->scrn->virtualY &&
+		    info->old_pitch  == info->intel->front_pitch &&
+		    info->old_tiling == info->intel->front_tiling)
+			info->intel->back_buffer = info->old_buffer;
+		else
+			dri_bo_unreference(info->old_buffer);
+	}
+
 	free(info);
 }
 
@@ -708,16 +757,14 @@ static void
 I830DRI2ExchangeBuffers(struct intel_screen_private *intel, DRI2BufferPtr front, DRI2BufferPtr back)
 {
 	I830DRI2BufferPrivatePtr front_priv, back_priv;
-	int tmp;
 	struct intel_uxa_pixmap *new_front;
 
 	front_priv = front->driverPrivate;
 	back_priv = back->driverPrivate;
 
 	/* Swap BO names so DRI works */
-	tmp = front->name;
 	front->name = back->name;
-	back->name = tmp;
+	back->name = pixmap_flink(front_priv->pixmap);
 
 	/* Swap pixmap bos */
 	new_front = intel_exchange_pixmap_buffers(intel,
@@ -753,87 +800,30 @@ I830DRI2FlipAbort(void *pageflip_data)
         i830_dri2_del_frame_event(info);
 }
 
-/*
- * Our internal swap routine takes care of actually exchanging, blitting, or
- * flipping buffers as necessary.
- */
 static Bool
-I830DRI2ScheduleFlip(struct intel_screen_private *intel,
-		     DrawablePtr draw,
-		     DRI2FrameEventPtr info)
+allocate_back_buffer(struct intel_screen_private *intel)
 {
-	I830DRI2BufferPrivatePtr priv = info->back->driverPrivate;
-	drm_intel_bo *new_back, *old_back;
-	int tmp_name;
-
-	if (!intel->use_triple_buffer) {
-		info->type = DRI2_SWAP;
-		if (!intel_do_pageflip(intel,
-				       get_pixmap_bo(priv),
-				       info->pipe, FALSE, info,
-                                       I830DRI2FlipComplete,
-                                       I830DRI2FlipAbort))
-			return FALSE;
-
-		I830DRI2ExchangeBuffers(intel, info->front, info->back);
-		return TRUE;
-	}
+	drm_intel_bo *bo;
+	int pitch;
+	uint32_t tiling;
 
-	if (intel->pending_flip[info->pipe]) {
-		assert(intel->pending_flip[info->pipe]->chain == NULL);
-		intel->pending_flip[info->pipe]->chain = info;
+	if (intel->back_buffer)
 		return TRUE;
-	}
 
-	if (intel->back_buffer == NULL) {
-		new_back = drm_intel_bo_alloc(intel->bufmgr, "front buffer",
-					      intel->front_buffer->size, 0);
-		if (new_back == NULL)
-			return FALSE;
-
-		if (intel->front_tiling != I915_TILING_NONE) {
-			uint32_t tiling = intel->front_tiling;
-			drm_intel_bo_set_tiling(new_back, &tiling, intel->front_pitch);
-			if (tiling != intel->front_tiling) {
-				drm_intel_bo_unreference(new_back);
-				return FALSE;
-			}
-		}
-
-		drm_intel_bo_disable_reuse(new_back);
-		dri_bo_flink(new_back, &intel->back_name);
-	} else {
-		new_back = intel->back_buffer;
-		intel->back_buffer = NULL;
-	}
+	bo = intel_allocate_framebuffer(intel->scrn,
+					intel->scrn->virtualX,
+					intel->scrn->virtualY,
+					intel->cpp,
+					&pitch, &tiling);
+	if (bo == NULL)
+		return FALSE;
 
-	old_back = get_pixmap_bo(priv);
-	if (!intel_do_pageflip(intel, old_back, info->pipe, FALSE, info, I830DRI2FlipComplete, I830DRI2FlipAbort)) {
-		intel->back_buffer = new_back;
+	if (pitch != intel->front_pitch || tiling != intel->front_tiling) {
+		drm_intel_bo_unreference(bo);
 		return FALSE;
 	}
-	info->type = DRI2_SWAP_CHAIN;
-	intel->pending_flip[info->pipe] = info;
-
-	priv = info->front->driverPrivate;
-
-	/* Exchange the current front-buffer with the fresh bo */
-
-	intel->back_buffer = intel->front_buffer;
-	drm_intel_bo_reference(intel->back_buffer);
-	intel_set_pixmap_bo(priv->pixmap, new_back);
-	drm_intel_bo_unreference(new_back);
-
-	tmp_name = info->front->name;
-	info->front->name = intel->back_name;
-	intel->back_name = tmp_name;
 
-	/* Then flip DRI2 pointers and update the screen pixmap */
-	I830DRI2ExchangeBuffers(intel, info->front, info->back);
-	DRI2SwapComplete(info->client, draw, 0, 0, 0,
-			 DRI2_EXCHANGE_COMPLETE,
-			 info->event_complete,
-			 info->event_data);
+	intel->back_buffer = bo;
 	return TRUE;
 }
 
@@ -889,8 +879,88 @@ can_exchange(DrawablePtr drawable, DRI2BufferPtr front, DRI2BufferPtr back)
 	return TRUE;
 }
 
-void I830DRI2FrameEventHandler(unsigned int frame, unsigned int tv_sec,
-			       unsigned int tv_usec, DRI2FrameEventPtr swap_info)
+static Bool
+queue_flip(struct intel_screen_private *intel,
+	   DrawablePtr draw,
+	   DRI2FrameEventPtr info)
+{
+	xf86CrtcPtr crtc = I830DRI2DrawableCrtc(draw);
+	I830DRI2BufferPrivatePtr priv = info->back->driverPrivate;
+	drm_intel_bo *old_back = get_pixmap_bo(priv);
+
+	if (crtc == NULL)
+		return FALSE;
+
+	if (!can_exchange(draw, info->front, info->back))
+		return FALSE;
+
+	if (!intel_do_pageflip(intel, old_back,
+			       intel_crtc_to_pipe(crtc),
+			       FALSE, info,
+			       I830DRI2FlipComplete, I830DRI2FlipAbort))
+		return FALSE;
+
+#if DRI2INFOREC_VERSION >= 6
+	if (intel->use_triple_buffer && allocate_back_buffer(intel)) {
+		info->old_width  = intel->scrn->virtualX;
+		info->old_height = intel->scrn->virtualY;
+		info->old_pitch  = intel->front_pitch;
+		info->old_tiling = intel->front_tiling;
+		info->old_buffer = intel->front_buffer;
+		dri_bo_reference(info->old_buffer);
+
+		priv = info->front->driverPrivate;
+		intel_set_pixmap_bo(priv->pixmap, intel->back_buffer);
+
+		dri_bo_unreference(intel->back_buffer);
+		intel->back_buffer = NULL;
+
+		DRI2SwapLimit(draw, 2);
+	} else
+		DRI2SwapLimit(draw, 1);
+#endif
+
+	/* Then flip DRI2 pointers and update the screen pixmap */
+	I830DRI2ExchangeBuffers(intel, info->front, info->back);
+	return TRUE;
+}
+
+static Bool
+queue_swap(struct intel_screen_private *intel,
+	   DrawablePtr draw,
+	   DRI2FrameEventPtr info)
+{
+	xf86CrtcPtr crtc = I830DRI2DrawableCrtc(draw);
+	drmVBlank vbl;
+
+	if (crtc == NULL)
+		return FALSE;
+
+	vbl.request.type =
+		DRM_VBLANK_RELATIVE |
+		DRM_VBLANK_EVENT |
+		pipe_select(intel_crtc_to_pipe(crtc));
+	vbl.request.sequence = 1;
+	vbl.request.signal =
+		intel_drm_queue_alloc(intel->scrn, crtc, info,
+				      intel_dri2_vblank_handler,
+				      intel_dri2_vblank_abort);
+	if (vbl.request.signal == 0)
+		return FALSE;
+
+	info->type = DRI2_SWAP;
+	if (drmWaitVBlank(intel->drmSubFD, &vbl)) {
+		intel_drm_abort_seq(intel->scrn, vbl.request.signal);
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+static void I830DRI2FrameEventHandler(unsigned int frame,
+				      unsigned int tv_sec,
+				      unsigned int tv_usec,
+				      DRI2FrameEventPtr swap_info)
 {
 	intel_screen_private *intel = swap_info->intel;
 	DrawablePtr drawable;
@@ -906,24 +976,22 @@ void I830DRI2FrameEventHandler(unsigned int frame, unsigned int tv_sec,
 		return;
 	}
 
-
 	switch (swap_info->type) {
 	case DRI2_FLIP:
 		/* If we can still flip... */
-		if (can_exchange(drawable, swap_info->front, swap_info->back) &&
-		    I830DRI2ScheduleFlip(intel, drawable, swap_info))
-			return;
-
-		/* else fall through to exchange/blit */
-	case DRI2_SWAP: {
-		I830DRI2FallbackBlitSwap(drawable,
-					 swap_info->front, swap_info->back);
-		DRI2SwapComplete(swap_info->client, drawable, frame, tv_sec, tv_usec,
-				 DRI2_BLIT_COMPLETE,
-				 swap_info->client ? swap_info->event_complete : NULL,
-				 swap_info->event_data);
-		break;
-	}
+		if (!queue_flip(intel, drawable, swap_info) &&
+		    !queue_swap(intel, drawable, swap_info)) {
+		case DRI2_SWAP:
+			I830DRI2FallbackBlitSwap(drawable,
+						 swap_info->front, swap_info->back);
+			DRI2SwapComplete(swap_info->client, drawable, frame, tv_sec, tv_usec,
+					 DRI2_BLIT_COMPLETE,
+					 swap_info->client ? swap_info->event_complete : NULL,
+					 swap_info->event_data);
+			break;
+		}
+		return;
+
 	case DRI2_WAITMSC:
 		if (swap_info->client)
 			DRI2WaitMSCComplete(swap_info->client, drawable,
@@ -939,12 +1007,13 @@ void I830DRI2FrameEventHandler(unsigned int frame, unsigned int tv_sec,
 	i830_dri2_del_frame_event(swap_info);
 }
 
-void I830DRI2FlipEventHandler(unsigned int frame, unsigned int tv_sec,
-			      unsigned int tv_usec, DRI2FrameEventPtr flip_info)
+static void I830DRI2FlipEventHandler(unsigned int frame,
+				     unsigned int tv_sec,
+				     unsigned int tv_usec,
+				     DRI2FrameEventPtr flip_info)
 {
 	struct intel_screen_private *intel = flip_info->intel;
 	DrawablePtr drawable;
-	DRI2FrameEventPtr chain;
 
 	drawable = NULL;
 	if (flip_info->drawable_id)
@@ -954,6 +1023,7 @@ void I830DRI2FlipEventHandler(unsigned int frame, unsigned int tv_sec,
 
 	/* We assume our flips arrive in order, so we don't check the frame */
 	switch (flip_info->type) {
+	case DRI2_FLIP:
 	case DRI2_SWAP:
 		if (!drawable)
 			break;
@@ -984,35 +1054,6 @@ void I830DRI2FlipEventHandler(unsigned int frame, unsigned int tv_sec,
 				 flip_info->event_data);
 		break;
 
-	case DRI2_SWAP_CHAIN:
-		assert(intel->pending_flip[flip_info->pipe] == flip_info);
-		intel->pending_flip[flip_info->pipe] = NULL;
-
-		chain = flip_info->chain;
-		if (chain) {
-			DrawablePtr chain_drawable = NULL;
-			if (chain->drawable_id)
-				 dixLookupDrawable(&chain_drawable,
-						   chain->drawable_id,
-						   serverClient,
-						   M_ANY, DixWriteAccess);
-			if (chain_drawable == NULL) {
-				i830_dri2_del_frame_event(chain);
-			} else if (!can_exchange(chain_drawable, chain->front, chain->back) ||
-				   !I830DRI2ScheduleFlip(intel, chain_drawable, chain)) {
-				I830DRI2FallbackBlitSwap(chain_drawable,
-							 chain->front,
-							 chain->back);
-
-				DRI2SwapComplete(chain->client, chain_drawable, frame, tv_sec, tv_usec,
-						 DRI2_BLIT_COMPLETE,
-						 chain->client ? chain->event_complete : NULL,
-						 chain->event_data);
-				i830_dri2_del_frame_event(chain);
-			}
-		}
-		break;
-
 	default:
 		xf86DrvMsg(intel->scrn->scrnIndex, X_WARNING,
 			   "%s: unknown vblank event received\n", __func__);
@@ -1023,38 +1064,6 @@ void I830DRI2FlipEventHandler(unsigned int frame, unsigned int tv_sec,
 	i830_dri2_del_frame_event(flip_info);
 }
 
-static uint32_t pipe_select(int pipe)
-{
-	if (pipe > 1)
-		return pipe << DRM_VBLANK_HIGH_CRTC_SHIFT;
-	else if (pipe > 0)
-		return DRM_VBLANK_SECONDARY;
-	else
-		return 0;
-}
-
-static void
-intel_dri2_vblank_handler(ScrnInfoPtr scrn,
-                          xf86CrtcPtr crtc,
-                          uint64_t msc,
-                          uint64_t usec,
-                          void *data)
-{
-        DRI2FrameEventPtr swap_info = data;
-
-        I830DRI2FrameEventHandler((uint32_t) msc, usec / 1000000, usec % 1000000, swap_info);
-}
-
-static void
-intel_dri2_vblank_abort(ScrnInfoPtr scrn,
-                        xf86CrtcPtr crtc,
-                        void *data)
-{
-        DRI2FrameEventPtr swap_info = data;
-
-        i830_dri2_del_frame_event(swap_info);
-}
-
 /*
  * ScheduleSwap is responsible for requesting a DRM vblank event for the
  * appropriate frame.
@@ -1089,7 +1098,6 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
         int pipe = crtc ? intel_crtc_to_pipe(crtc) : -1;
         int flip = 0;
 	DRI2FrameEventPtr swap_info = NULL;
-	enum DRI2FrameEventType swap_type = DRI2_SWAP;
 	uint64_t current_msc, current_ust;
         uint64_t request_msc;
         uint32_t seq;
@@ -1109,7 +1117,7 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 	swap_info->event_data = data;
 	swap_info->front = front;
 	swap_info->back = back;
-	swap_info->pipe = pipe;
+	swap_info->type = DRI2_SWAP;
 
 	if (!i830_dri2_add_frame_event(swap_info)) {
 	    free(swap_info);
@@ -1124,20 +1132,27 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 	if (ret)
 	    goto blit_fallback;
 
-	/* Flips need to be submitted one frame before */
+	/*
+	 * If we can, schedule the flip directly from here rather
+	 * than waiting for an event from the kernel for the current
+	 * (or a past) MSC.
+	 */
+	if (divisor == 0 &&
+	    current_msc >= *target_msc &&
+	    queue_flip(intel, draw, swap_info))
+		return TRUE;
+
 	if (can_exchange(draw, front, back)) {
-	    swap_type = DRI2_FLIP;
-	    flip = 1;
+		swap_info->type = DRI2_FLIP;
+		/* Flips need to be submitted one frame before */
+		if (*target_msc > 0)
+			--*target_msc;
+		flip = 1;
 	}
 
-	swap_info->type = swap_type;
-
-	/* Correct target_msc by 'flip' if swap_type == DRI2_FLIP.
-	 * Do it early, so handling of different timing constraints
-	 * for divisor, remainder and msc vs. target_msc works.
-	 */
-	if (*target_msc > 0)
-		*target_msc -= flip;
+#if DRI2INFOREC_VERSION >= 6
+	DRI2SwapLimit(draw, 1);
+#endif
 
 	/*
 	 * If divisor is zero, or current_msc is smaller than target_msc
@@ -1145,15 +1160,6 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 	 * the swap.
 	 */
 	if (divisor == 0 || current_msc < *target_msc) {
-		/*
-		 * If we can, schedule the flip directly from here rather
-		 * than waiting for an event from the kernel for the current
-		 * (or a past) MSC.
-		 */
-		if (flip && divisor == 0 && current_msc >= *target_msc &&
-		    I830DRI2ScheduleFlip(intel, draw, swap_info))
-			return TRUE;
-
 		vbl.request.type =
 			DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT | pipe_select(pipe);
 
@@ -1168,7 +1174,7 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 		 * current_msc to ensure we return a reasonable value back
 		 * to the caller. This makes swap_interval logic more robust.
 		 */
-		if (current_msc >= *target_msc)
+		if (current_msc > *target_msc)
 			*target_msc = current_msc;
 
                 seq = intel_drm_queue_alloc(scrn, crtc, swap_info, intel_dri2_vblank_handler, intel_dri2_vblank_abort);
@@ -1183,6 +1189,8 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 			xf86DrvMsg(scrn->scrnIndex, X_WARNING,
 				   "divisor 0 get vblank counter failed: %s\n",
 				   strerror(errno));
+			intel_drm_abort_seq(intel->scrn, seq);
+			swap_info = NULL;
 			goto blit_fallback;
 		}
 
@@ -1332,7 +1340,6 @@ I830DRI2ScheduleWaitMSC(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
 
 	if (!i830_dri2_add_frame_event(wait_info)) {
 	    free(wait_info);
-	    wait_info = NULL;
 	    goto out_complete;
 	}
 
@@ -1374,7 +1381,8 @@ I830DRI2ScheduleWaitMSC(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
 					   strerror(errno));
 				limit--;
 			}
-			goto out_free;
+			intel_drm_abort_seq(intel->scrn, seq);
+			goto out_complete;
 		}
 
 		wait_info->frame = intel_sequence_to_crtc_msc(crtc, vbl.reply.sequence);
@@ -1417,7 +1425,8 @@ I830DRI2ScheduleWaitMSC(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
 				   strerror(errno));
 			limit--;
 		}
-		goto out_free;
+		intel_drm_abort_seq(intel->scrn, seq);
+		goto out_complete;
 	}
 
 	wait_info->frame = intel_sequence_to_crtc_msc(crtc, vbl.reply.sequence);
@@ -1440,13 +1449,92 @@ static int has_i830_dri(void)
 	return access(DRI_DRIVER_PATH "/i830_dri.so", R_OK) == 0;
 }
 
-static const char *dri_driver_name(intel_screen_private *intel)
+static int
+namecmp(const char *s1, const char *s2)
+{
+	char c1, c2;
+
+	if (!s1 || *s1 == 0) {
+		if (!s2 || *s2 == 0)
+			return 0;
+		else
+			return 1;
+	}
+
+	while (*s1 == '_' || *s1 == ' ' || *s1 == '\t')
+		s1++;
+
+	while (*s2 == '_' || *s2 == ' ' || *s2 == '\t')
+		s2++;
+
+	c1 = isupper(*s1) ? tolower(*s1) : *s1;
+	c2 = isupper(*s2) ? tolower(*s2) : *s2;
+	while (c1 == c2) {
+		if (c1 == '\0')
+			return 0;
+
+		s1++;
+		while (*s1 == '_' || *s1 == ' ' || *s1 == '\t')
+			s1++;
+
+		s2++;
+		while (*s2 == '_' || *s2 == ' ' || *s2 == '\t')
+			s2++;
+
+		c1 = isupper(*s1) ? tolower(*s1) : *s1;
+		c2 = isupper(*s2) ? tolower(*s2) : *s2;
+	}
+
+	return c1 - c2;
+}
+
+static Bool is_level(const char **str)
+{
+	const char *s = *str;
+	char *end;
+	unsigned val;
+
+	if (s == NULL || *s == '\0')
+		return TRUE;
+
+	if (namecmp(s, "on") == 0)
+		return TRUE;
+	if (namecmp(s, "true") == 0)
+		return TRUE;
+	if (namecmp(s, "yes") == 0)
+		return TRUE;
+
+	if (namecmp(s, "0") == 0)
+		return TRUE;
+	if (namecmp(s, "off") == 0)
+		return TRUE;
+	if (namecmp(s, "false") == 0)
+		return TRUE;
+	if (namecmp(s, "no") == 0)
+		return TRUE;
+
+	val = strtoul(s, &end, 0);
+	if (val && *end == '\0')
+		return TRUE;
+	if (val && *end == ':')
+		*str = end + 1;
+	return FALSE;
+}
+
+static const char *options_get_dri(intel_screen_private *intel)
 {
 #if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,7,99,901,0)
-	const char *s = xf86GetOptValString(intel->Options, OPTION_DRI);
-	Bool dummy;
+	return xf86GetOptValString(intel->Options, OPTION_DRI);
+#else
+	return NULL;
+#endif
+}
 
-	if (s == NULL || xf86getBoolValue(&dummy, s)) {
+static const char *dri_driver_name(intel_screen_private *intel)
+{
+	const char *s = options_get_dri(intel);
+
+	if (is_level(&s)) {
 		if (INTEL_INFO(intel)->gen < 030)
 			return has_i830_dri() ? "i830" : "i915";
 		else if (INTEL_INFO(intel)->gen < 040)
@@ -1456,14 +1544,6 @@ static const char *dri_driver_name(intel_screen_private *intel)
 	}
 
 	return s;
-#else
-	if (INTEL_INFO(intel)->gen < 030)
-		return has_i830_dri() ? "i830" : "i915";
-	else if (INTEL_INFO(intel)->gen < 040)
-		return "i915";
-	else
-		return "i965";
-#endif
 }
 
 Bool I830DRI2ScreenInit(ScreenPtr screen)
@@ -1544,7 +1624,7 @@ Bool I830DRI2ScreenInit(ScreenPtr screen)
 	info.numDrivers = 2;
 	info.driverNames = driverNames;
 	driverNames[0] = info.driverName;
-	driverNames[1] = info.driverName;
+	driverNames[1] = "va_gl";
 #endif
 
 	return DRI2ScreenInit(screen, &info);
diff --git a/src/uxa/intel_driver.c b/src/uxa/intel_driver.c
index 2793da5..6e64b8c 100644
--- a/src/uxa/intel_driver.c
+++ b/src/uxa/intel_driver.c
@@ -237,24 +237,17 @@ static Bool I830GetEarlyOptions(ScrnInfoPtr scrn)
 	return TRUE;
 }
 
-static Bool intel_option_cast_string_to_bool(intel_screen_private *intel,
-					     int id, Bool val)
-{
-#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,7,99,901,0)
-	xf86getBoolValue(&val, xf86GetOptValString(intel->Options, id));
-	return val;
-#else
-	return val;
-#endif
-}
-
 static void intel_check_dri_option(ScrnInfoPtr scrn)
 {
 	intel_screen_private *intel = intel_get_screen_private(scrn);
+	unsigned level;
 
 	intel->dri2 = intel->dri3 = DRI_NONE;
-	if (!intel_option_cast_string_to_bool(intel, OPTION_DRI, TRUE))
-		intel->dri2 = intel->dri3 = DRI_DISABLED;
+	level = intel_option_cast_to_unsigned(intel->Options, OPTION_DRI, DEFAULT_DRI_LEVEL);
+	if (level < 3)
+		intel->dri3 = DRI_DISABLED;
+	if (level < 2)
+		intel->dri2 = DRI_DISABLED;
 
 	if (scrn->depth != 16 && scrn->depth != 24 && scrn->depth != 30) {
 		xf86DrvMsg(scrn->scrnIndex, X_CONFIG,
@@ -372,7 +365,7 @@ static Bool can_accelerate_blt(struct intel_screen_private *intel)
 		return FALSE;
 
 	if (xf86ReturnOptValBool(intel->Options, OPTION_ACCEL_DISABLE, FALSE) ||
-	    !intel_option_cast_string_to_bool(intel, OPTION_ACCEL_METHOD, TRUE)) {
+	    !intel_option_cast_to_bool(intel->Options, OPTION_ACCEL_METHOD, TRUE)) {
 		xf86DrvMsg(intel->scrn->scrnIndex, X_CONFIG,
 			   "Disabling hardware acceleration.\n");
 		return FALSE;
@@ -735,6 +728,8 @@ intel_flush_callback(CallbackListPtr *list,
 }
 
 #if HAVE_UDEV
+#include <sys/stat.h>
+
 static void
 I830HandleUEvents(int fd, void *closure)
 {
diff --git a/src/uxa/intel_present.c b/src/uxa/intel_present.c
index d20043f..a7f904c 100644
--- a/src/uxa/intel_present.c
+++ b/src/uxa/intel_present.c
@@ -343,29 +343,33 @@ intel_present_unflip(ScreenPtr screen, uint64_t event_id)
 {
 	ScrnInfoPtr                             scrn = xf86ScreenToScrn(screen);
 	intel_screen_private                    *intel = intel_get_screen_private(scrn);
-	struct intel_present_vblank_event       *event;
 	PixmapPtr                               pixmap = screen->GetScreenPixmap(screen);
+	struct intel_present_vblank_event       *event = NULL;
 	dri_bo                                  *bo;
-	Bool                                    ret;
 
 	if (!intel_present_check_flip(NULL, screen->root, pixmap, true))
-		return;
+		goto fail;
 
 	bo = intel_get_pixmap_bo(pixmap);
 	if (!bo)
-		return;
+		goto fail;
 
 	event = calloc(1, sizeof(struct intel_present_vblank_event));
 	if (!event)
-		return;
+		goto fail;
 
 	event->event_id = event_id;
 
-	ret = intel_do_pageflip(intel, bo, -1, FALSE, event, intel_present_flip_event, intel_present_flip_abort);
-	if (!ret) {
-		xf86DrvMsg(scrn->scrnIndex, X_ERROR,
-			   "present unflip failed\n");
-	}
+	if (!intel_do_pageflip(intel, bo, -1, FALSE, event,
+			       intel_present_flip_event,
+			       intel_present_flip_abort))
+		goto fail;
+
+	return;
+fail:
+	xf86SetDesiredModes(scrn);
+	present_event_notify(event_id, 0, 0);
+	free(event);
 }
 
 static present_screen_info_rec intel_present_screen_info = {
diff --git a/test/Makefile.am b/test/Makefile.am
index 66ed8eb..9b37222 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -28,6 +28,7 @@ stress_TESTS = \
 if DRI2
 stress_TESTS += \
 	dri2-race \
+	dri2-speed \
 	dri2-swap \
 	dri2-test \
 	$(NULL)
@@ -37,6 +38,7 @@ if X11_DRI3
 stress_TESTS += \
 	dri3-test \
 	present-test \
+	present-speed \
 	$(NULL)
 endif
 check_PROGRAMS = $(stress_TESTS)
diff --git a/test/dri2-race.c b/test/dri2-race.c
index 8862c84..c589f2b 100644
--- a/test/dri2-race.c
+++ b/test/dri2-race.c
@@ -5,6 +5,10 @@
 #include <X11/Xlib.h>
 #include <X11/Xutil.h>
 #include <X11/extensions/Xfixes.h>
+#include <X11/Xlib-xcb.h>
+#include <xcb/xcb.h>
+#include <xcb/xcbext.h>
+#include <xcb/dri2.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <string.h>
@@ -17,6 +21,16 @@
 
 #define COUNT 60
 
+static uint32_t upper_32_bits(uint64_t val)
+{
+	return val >> 32;
+}
+
+static uint32_t lower_32_bits(uint64_t val)
+{
+	return val & 0xffffffff;
+}
+
 static int dri2_open(Display *dpy)
 {
 	drm_auth_t auth;
@@ -41,15 +55,36 @@ static int dri2_open(Display *dpy)
 	return fd;
 }
 
-static void run(Display *dpy, int width, int height,
-		unsigned int *attachments, int nattachments,
-		const char *name)
+static void swap_buffers(Display *dpy, Window win, int divisor,
+			 unsigned int *attachments, int nattachments)
+{
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	unsigned int seq[2];
+
+	seq[0] = xcb_dri2_swap_buffers_unchecked(c, win,
+						 0, 0, 0, divisor, 0, 0).sequence;
+
+
+	seq[1] = xcb_dri2_get_buffers_unchecked(c, win,
+						nattachments, nattachments,
+						attachments).sequence;
+
+	xcb_flush(c);
+	xcb_discard_reply(c, seq[0]);
+	xcb_discard_reply(c, seq[1]);
+}
+
+static void race_window(Display *dpy, int width, int height,
+			unsigned int *attachments, int nattachments,
+			const char *name)
 {
 	Window win;
 	XSetWindowAttributes attr;
 	int count, loop;
 	DRI2Buffer *buffers;
 
+	printf("%s(%s)\n", __func__, name);
+
 	/* Be nasty and install a fullscreen window on top so that we
 	 * can guarantee we do not get clipped by children.
 	 */
@@ -75,13 +110,234 @@ static void run(Display *dpy, int width, int height,
 		for (count = 0; count < loop; count++)
 			DRI2SwapBuffers(dpy, win, 0, 0, 0);
 		XDestroyWindow(dpy, win);
+		printf("."); fflush(stdout);
+	} while (--loop);
+	printf("*\n");
+
+	loop = 100;
+	do {
+		win = XCreateWindow(dpy, DefaultRootWindow(dpy),
+				    0, 0, width, height, 0,
+				    DefaultDepth(dpy, DefaultScreen(dpy)),
+				    InputOutput,
+				    DefaultVisual(dpy, DefaultScreen(dpy)),
+				    CWOverrideRedirect, &attr);
+		XMapWindow(dpy, win);
+
+		DRI2CreateDrawable(dpy, win);
+
+		buffers = DRI2GetBuffers(dpy, win, &width, &height,
+					 attachments, nattachments, &count);
+		if (count != nattachments)
+			return;
+
+		free(buffers);
+		for (count = 0; count < loop; count++)
+			DRI2SwapBuffers(dpy, win, 0, 1, 0);
+		XDestroyWindow(dpy, win);
+		printf("."); fflush(stdout);
+	} while (--loop);
+	printf("*\n");
+
+	loop = 100;
+	do {
+		win = XCreateWindow(dpy, DefaultRootWindow(dpy),
+				    0, 0, width, height, 0,
+				    DefaultDepth(dpy, DefaultScreen(dpy)),
+				    InputOutput,
+				    DefaultVisual(dpy, DefaultScreen(dpy)),
+				    CWOverrideRedirect, &attr);
+		XMapWindow(dpy, win);
+
+		DRI2CreateDrawable(dpy, win);
+
+		buffers = DRI2GetBuffers(dpy, win, &width, &height,
+					 attachments, nattachments, &count);
+		if (count != nattachments)
+			return;
+
+		free(buffers);
+		for (count = 0; count < loop; count++)
+			swap_buffers(dpy, win, 0, attachments, nattachments);
+		XDestroyWindow(dpy, win);
+		printf("."); fflush(stdout);
+	} while (--loop);
+	printf("*\n");
+
+	loop = 100;
+	do {
+		win = XCreateWindow(dpy, DefaultRootWindow(dpy),
+				    0, 0, width, height, 0,
+				    DefaultDepth(dpy, DefaultScreen(dpy)),
+				    InputOutput,
+				    DefaultVisual(dpy, DefaultScreen(dpy)),
+				    CWOverrideRedirect, &attr);
+		XMapWindow(dpy, win);
+
+		DRI2CreateDrawable(dpy, win);
+
+		buffers = DRI2GetBuffers(dpy, win, &width, &height,
+					 attachments, nattachments, &count);
+		if (count != nattachments)
+			return;
+
+		free(buffers);
+		for (count = 0; count < loop; count++)
+			swap_buffers(dpy, win, 1, attachments, nattachments);
+		XDestroyWindow(dpy, win);
+		printf("."); fflush(stdout);
+	} while (--loop);
+	printf("*\n");
+
+	loop = 100;
+	do {
+		uint64_t ignore, msc;
+		xcb_connection_t *c = XGetXCBConnection(dpy);
+
+		win = XCreateWindow(dpy, DefaultRootWindow(dpy),
+				    0, 0, width, height, 0,
+				    DefaultDepth(dpy, DefaultScreen(dpy)),
+				    InputOutput,
+				    DefaultVisual(dpy, DefaultScreen(dpy)),
+				    CWOverrideRedirect, &attr);
+		XMapWindow(dpy, win);
+
+		DRI2CreateDrawable(dpy, win);
+		DRI2GetMSC(dpy, win, &ignore, &msc, &ignore);
+		for (count = 0; count < loop; count++)
+			xcb_discard_reply(c,
+					  xcb_dri2_wait_msc(c, win,
+							    upper_32_bits(msc + count + 1),
+							    lower_32_bits(msc + count + 1),
+							    0, 1, 0, 0).sequence);
+		XFlush(dpy);
+		XDestroyWindow(dpy, win);
+		printf("."); fflush(stdout);
 	} while (--loop);
+	printf("*\n");
 
 	XSync(dpy, 1);
 	sleep(2);
 	XSync(dpy, 1);
 }
 
+static void race_client(int width, int height,
+			unsigned int *attachments, int nattachments,
+			const char *name)
+{
+	XSetWindowAttributes attr;
+	int count, loop;
+
+	printf("%s(%s)\n", __func__, name);
+
+	/* Be nasty and install a fullscreen window on top so that we
+	 * can guarantee we do not get clipped by children.
+	 */
+	attr.override_redirect = 1;
+	loop = 100;
+	do {
+		Display *dpy = XOpenDisplay(NULL);
+		Window win = XCreateWindow(dpy, DefaultRootWindow(dpy),
+					   0, 0, width, height, 0,
+					   DefaultDepth(dpy, DefaultScreen(dpy)),
+					   InputOutput,
+					   DefaultVisual(dpy, DefaultScreen(dpy)),
+					   CWOverrideRedirect, &attr);
+
+		XMapWindow(dpy, win);
+
+		DRI2CreateDrawable(dpy, win);
+		free(DRI2GetBuffers(dpy, win, &width, &height,
+				    attachments, nattachments, &count));
+		if (count != nattachments)
+			return;
+
+		for (count = 0; count < loop; count++)
+			DRI2SwapBuffers(dpy, win, 0, 0, 0);
+		XCloseDisplay(dpy);
+		printf("."); fflush(stdout);
+	} while (--loop);
+	printf("*\n");
+
+	loop = 100;
+	do {
+		Display *dpy = XOpenDisplay(NULL);
+		Window win = XCreateWindow(dpy, DefaultRootWindow(dpy),
+					   0, 0, width, height, 0,
+					   DefaultDepth(dpy, DefaultScreen(dpy)),
+					   InputOutput,
+					   DefaultVisual(dpy, DefaultScreen(dpy)),
+					   CWOverrideRedirect, &attr);
+
+		XMapWindow(dpy, win);
+
+		DRI2CreateDrawable(dpy, win);
+		free(DRI2GetBuffers(dpy, win, &width, &height,
+				    attachments, nattachments, &count));
+		if (count != nattachments)
+			return;
+
+		for (count = 0; count < loop; count++)
+			swap_buffers(dpy, win, 0, attachments, nattachments);
+		XCloseDisplay(dpy);
+		printf("."); fflush(stdout);
+	} while (--loop);
+	printf("*\n");
+
+	loop = 100;
+	do {
+		Display *dpy = XOpenDisplay(NULL);
+		Window win = XCreateWindow(dpy, DefaultRootWindow(dpy),
+					   0, 0, width, height, 0,
+					   DefaultDepth(dpy, DefaultScreen(dpy)),
+					   InputOutput,
+					   DefaultVisual(dpy, DefaultScreen(dpy)),
+					   CWOverrideRedirect, &attr);
+
+		XMapWindow(dpy, win);
+
+		DRI2CreateDrawable(dpy, win);
+		free(DRI2GetBuffers(dpy, win, &width, &height,
+				    attachments, nattachments, &count));
+		if (count != nattachments)
+			return;
+
+		for (count = 0; count < loop; count++)
+			swap_buffers(dpy, win, 1, attachments, nattachments);
+		XCloseDisplay(dpy);
+		printf("."); fflush(stdout);
+	} while (--loop);
+	printf("*\n");
+
+	loop = 100;
+	do {
+		uint64_t ignore, msc;
+		Display *dpy = XOpenDisplay(NULL);
+		xcb_connection_t *c = XGetXCBConnection(dpy);
+		Window win = XCreateWindow(dpy, DefaultRootWindow(dpy),
+					   0, 0, width, height, 0,
+					   DefaultDepth(dpy, DefaultScreen(dpy)),
+					   InputOutput,
+					   DefaultVisual(dpy, DefaultScreen(dpy)),
+					   CWOverrideRedirect, &attr);
+
+		XMapWindow(dpy, win);
+
+		DRI2CreateDrawable(dpy, win);
+		DRI2GetMSC(dpy, win, &ignore, &msc, &ignore);
+		for (count = 0; count < loop; count++)
+			xcb_discard_reply(c,
+					  xcb_dri2_wait_msc(c, win,
+							    upper_32_bits(msc + count + 1),
+							    lower_32_bits(msc + count + 1),
+							    0, 1, 0, 0).sequence);
+		XFlush(dpy);
+		XCloseDisplay(dpy);
+		printf("."); fflush(stdout);
+	} while (--loop);
+	printf("*\n");
+}
+
 int main(void)
 {
 	Display *dpy;
@@ -101,13 +357,17 @@ int main(void)
 
 	width = WidthOfScreen(DefaultScreenOfDisplay(dpy));
 	height = HeightOfScreen(DefaultScreenOfDisplay(dpy));
-	run(dpy, width, height, attachments, 1, "fullscreen");
-	run(dpy, width, height, attachments, 2, "fullscreen (with front)");
+	race_window(dpy, width, height, attachments, 1, "fullscreen");
+	race_window(dpy, width, height, attachments, 2, "fullscreen (with front)");
+	race_client(width, height, attachments, 1, "fullscreen");
+	race_client(width, height, attachments, 2, "fullscreen (with front)");
 
 	width /= 2;
 	height /= 2;
-	run(dpy, width, height, attachments, 1, "windowed");
-	run(dpy, width, height, attachments, 2, "windowed (with front)");
+	race_window(dpy, width, height, attachments, 1, "windowed");
+	race_window(dpy, width, height, attachments, 2, "windowed (with front)");
+	race_client(width, height, attachments, 1, "windowed");
+	race_client(width, height, attachments, 2, "windowed (with front)");
 
 	return 0;
 }
diff --git a/test/dri2-speed.c b/test/dri2-speed.c
new file mode 100644
index 0000000..bd7b6e1
--- /dev/null
+++ b/test/dri2-speed.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <X11/Xlib.h>
+#include <X11/Xatom.h>
+#include <X11/Xlib-xcb.h>
+#include <X11/Xutil.h>
+#include <X11/Xlibint.h>
+#include <X11/extensions/dpms.h>
+#include <X11/extensions/randr.h>
+#include <X11/extensions/Xrandr.h>
+#include <xcb/xcb.h>
+#include <xcb/dri2.h>
+#include <xf86drm.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <setjmp.h>
+#include <signal.h>
+
+#include "dri2.h"
+
+static int _x_error_occurred;
+
+static int
+_check_error_handler(Display     *display,
+		     XErrorEvent *event)
+{
+	printf("X11 error from display %s, serial=%ld, error=%d, req=%d.%d\n",
+	       DisplayString(display),
+	       event->serial,
+	       event->error_code,
+	       event->request_code,
+	       event->minor_code);
+	_x_error_occurred++;
+	return False; /* ignored */
+}
+
+static double elapsed(const struct timespec *start,
+		      const struct timespec *end)
+{
+	return 1e6*(end->tv_sec - start->tv_sec) + (end->tv_nsec - start->tv_nsec)/1000;
+}
+
+static void run(Display *dpy, Window win, const char *name)
+{
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	struct timespec start, end;
+	int n, completed = 0;
+
+	_x_error_occurred = 0;
+
+	clock_gettime(CLOCK_MONOTONIC, &start);
+	do {
+		for (n = 0; n < 1000; n++) {
+			unsigned int attachments[] = { DRI2BufferBackLeft };
+			unsigned int seq[2];
+
+			seq[0] = xcb_dri2_swap_buffers_unchecked(c, win,
+								 0, 0, 0, 0, 0, 0).sequence;
+
+
+			seq[1] = xcb_dri2_get_buffers_unchecked(c, win,
+								1, 1, attachments).sequence;
+
+			xcb_flush(c);
+			xcb_discard_reply(c, seq[0]);
+			xcb_discard_reply(c, seq[1]);
+			completed++;
+		}
+		clock_gettime(CLOCK_MONOTONIC, &end);
+	} while (end.tv_sec < start.tv_sec + 10);
+
+	XSync(dpy, True);
+	if (_x_error_occurred)
+		abort();
+
+	printf("%s: Completed %d swaps in %.1fs, %.3fus each (%.1f FPS)\n",
+	       name, completed, elapsed(&start, &end) / 1000000,
+	       elapsed(&start, &end) / completed,
+	       completed / (elapsed(&start, &end) / 1000000));
+}
+
+static inline XRRScreenResources *_XRRGetScreenResourcesCurrent(Display *dpy, Window window)
+{
+	XRRScreenResources *res;
+
+	res = XRRGetScreenResourcesCurrent(dpy, window);
+	if (res == NULL)
+		res = XRRGetScreenResources(dpy, window);
+
+	return res;
+}
+
+static XRRModeInfo *lookup_mode(XRRScreenResources *res, int id)
+{
+	int i;
+
+	for (i = 0; i < res->nmode; i++) {
+		if (res->modes[i].id == id)
+			return &res->modes[i];
+	}
+
+	return NULL;
+}
+
+static int dri2_open(Display *dpy)
+{
+	drm_auth_t auth;
+	char *driver, *device;
+	int fd;
+
+	if (!DRI2Connect(dpy, DefaultRootWindow(dpy), &driver, &device))
+		return -1;
+
+	printf ("Connecting to %s driver on %s\n", driver, device);
+
+	fd = open(device, O_RDWR);
+	if (fd < 0)
+		return -1;
+
+	if (drmIoctl(fd, DRM_IOCTL_GET_MAGIC, &auth))
+		return -1;
+
+	if (!DRI2Authenticate(dpy, DefaultRootWindow(dpy), auth.magic))
+		return -1;
+
+	return fd;
+}
+
+static void fullscreen(Display *dpy, Window win)
+{
+	Atom atom = XInternAtom(dpy, "_NET_WM_STATE_FULLSCREEN", False);
+	XChangeProperty(dpy, win,
+			XInternAtom(dpy, "_NET_WM_STATE", False),
+			XA_ATOM, 32, PropModeReplace,
+			(unsigned char *)&atom, 1);
+}
+
+int main(void)
+{
+	Display *dpy;
+	Window root, win;
+	XRRScreenResources *res;
+	XRRCrtcInfo **original_crtc;
+	XSetWindowAttributes attr;
+	int i, j, fd;
+
+	attr.override_redirect = 1;
+
+	dpy = XOpenDisplay(NULL);
+	if (dpy == NULL)
+		return 77;
+
+	fd = dri2_open(dpy);
+	if (fd < 0)
+		return 77;
+
+	if (DPMSQueryExtension(dpy, &i, &i))
+		DPMSDisable(dpy);
+
+	root = DefaultRootWindow(dpy);
+
+	signal(SIGALRM, SIG_IGN);
+	XSetErrorHandler(_check_error_handler);
+
+	res = NULL;
+	if (XRRQueryVersion(dpy, &i, &i))
+		res = _XRRGetScreenResourcesCurrent(dpy, root);
+	if (res == NULL)
+		return 77;
+
+	original_crtc = malloc(sizeof(XRRCrtcInfo *)*res->ncrtc);
+	for (i = 0; i < res->ncrtc; i++)
+		original_crtc[i] = XRRGetCrtcInfo(dpy, res, res->crtcs[i]);
+
+	printf("noutput=%d, ncrtc=%d\n", res->noutput, res->ncrtc);
+	for (i = 0; i < res->ncrtc; i++)
+		XRRSetCrtcConfig(dpy, res, res->crtcs[i], CurrentTime,
+				 0, 0, None, RR_Rotate_0, NULL, 0);
+
+	DRI2CreateDrawable(dpy, root);
+	DRI2SwapInterval(dpy, root, 0);
+	run(dpy, root, "off");
+
+	for (i = 0; i < res->noutput; i++) {
+		XRROutputInfo *output;
+		XRRModeInfo *mode;
+
+		output = XRRGetOutputInfo(dpy, res, res->outputs[i]);
+		if (output == NULL)
+			continue;
+		/* An output with no modes has no modes[0] to read; leave mode NULL. */
+		mode = NULL;
+		if (res->nmode && output->nmode)
+			mode = lookup_mode(res, output->modes[0]);
+
+		for (j = 0; mode && j < 2*output->ncrtc; j++) {
+			int c = j;
+			if (c >= output->ncrtc)
+				c = 2*output->ncrtc - j - 1;
+
+			printf("[%d, %d] -- OUTPUT:%ld, CRTC:%ld: %dx%d\n",
+			       i, c, (long)res->outputs[i], (long)output->crtcs[c],
+			       mode->width, mode->height);
+			XRRSetCrtcConfig(dpy, res, output->crtcs[c], CurrentTime,
+					 0, 0, output->modes[0], RR_Rotate_0, &res->outputs[i], 1);
+
+			run(dpy, root, "root");
+
+			win = XCreateWindow(dpy, root,
+					    0, 0, mode->width, mode->height, 0,
+					    DefaultDepth(dpy, DefaultScreen(dpy)),
+					    InputOutput,
+					    DefaultVisual(dpy, DefaultScreen(dpy)),
+					    CWOverrideRedirect, &attr);
+			DRI2CreateDrawable(dpy, win);
+			DRI2SwapInterval(dpy, win, 0);
+			fullscreen(dpy, win);
+			XMapWindow(dpy, win);
+			run(dpy, win, "fullscreen");
+			XDestroyWindow(dpy, win);
+
+			win = XCreateWindow(dpy, root,
+					    0, 0, mode->width, mode->height, 0,
+					    DefaultDepth(dpy, DefaultScreen(dpy)),
+					    InputOutput,
+					    DefaultVisual(dpy, DefaultScreen(dpy)),
+					    CWOverrideRedirect, &attr);
+			DRI2CreateDrawable(dpy, win);
+			DRI2SwapInterval(dpy, win, 0);
+			XMapWindow(dpy, win);
+			run(dpy, win, "windowed");
+			XDestroyWindow(dpy, win);
+
+			win = XCreateWindow(dpy, root,
+					    0, 0, mode->width/2, mode->height/2, 0,
+					    DefaultDepth(dpy, DefaultScreen(dpy)),
+					    InputOutput,
+					    DefaultVisual(dpy, DefaultScreen(dpy)),
+					    CWOverrideRedirect, &attr);
+			DRI2CreateDrawable(dpy, win);
+			DRI2SwapInterval(dpy, win, 0);
+			XMapWindow(dpy, win);
+			run(dpy, win, "half");
+			XDestroyWindow(dpy, win);
+
+			XRRSetCrtcConfig(dpy, res, output->crtcs[c], CurrentTime,
+					 0, 0, None, RR_Rotate_0, NULL, 0);
+		}
+
+		XRRFreeOutputInfo(output);
+	}
+
+	for (i = 0; i < res->ncrtc; i++)
+		XRRSetCrtcConfig(dpy, res, res->crtcs[i], CurrentTime,
+				 original_crtc[i]->x,
+				 original_crtc[i]->y,
+				 original_crtc[i]->mode,
+				 original_crtc[i]->rotation,
+				 original_crtc[i]->outputs,
+				 original_crtc[i]->noutput);
+
+	if (DPMSQueryExtension(dpy, &i, &i))
+		DPMSEnable(dpy);
+	return 0;
+}
diff --git a/test/dri2-test.c b/test/dri2-test.c
index dd4179f..d44ed99 100644
--- a/test/dri2-test.c
+++ b/test/dri2-test.c
@@ -6,6 +6,10 @@
 #include <X11/Xutil.h>
 #include <X11/extensions/Xfixes.h>
 #include <X11/extensions/Xrandr.h>
+#include <X11/Xlib-xcb.h>
+#include <xcb/xcb.h>
+#include <xcb/xcbext.h>
+#include <xcb/dri2.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <string.h>
@@ -101,10 +105,29 @@ static uint64_t check_msc(Display *dpy, Window win, uint64_t last_msc)
 	return current_msc;
 }
 
+static void swap_buffers(xcb_connection_t *c, Window win,
+		unsigned int *attachments, int nattachments)
+{
+	unsigned int seq[2];
+
+	seq[0] = xcb_dri2_swap_buffers_unchecked(c, win,
+						 0, 0, 0, 0, 0, 0).sequence;
+
+
+	seq[1] = xcb_dri2_get_buffers_unchecked(c, win,
+						nattachments, nattachments,
+						attachments).sequence;
+
+	xcb_flush(c);
+	xcb_discard_reply(c, seq[0]);
+	xcb_discard_reply(c, seq[1]);
+}
+
 static void run(Display *dpy, int width, int height,
 		unsigned int *attachments, int nattachments,
 		const char *name)
 {
+	xcb_connection_t *c = XGetXCBConnection(dpy);
 	Window win;
 	XSetWindowAttributes attr;
 	int count;
@@ -132,15 +155,17 @@ static void run(Display *dpy, int width, int height,
 	if (count != nattachments)
 		return;
 
+	swap_buffers(c, win, attachments, nattachments);
 	msc = check_msc(dpy, win, msc);
 	clock_gettime(CLOCK_MONOTONIC, &start);
 	for (count = 0; count < COUNT; count++)
-		DRI2SwapBuffers(dpy, win, 0, 0, 0);
+		swap_buffers(c, win, attachments, nattachments);
 	msc = check_msc(dpy, win, msc);
 	clock_gettime(CLOCK_MONOTONIC, &end);
 	printf("%d %s (%dx%d) swaps in %fs.\n",
 	       count, name, width, height, elapsed(&start, &end));
 
+	swap_buffers(c, win, attachments, nattachments);
 	msc = check_msc(dpy, win, msc);
 	clock_gettime(CLOCK_MONOTONIC, &start);
 	for (count = 0; count < COUNT; count++)
@@ -153,10 +178,11 @@ static void run(Display *dpy, int width, int height,
 
 	DRI2SwapInterval(dpy, win, 0);
 
+	swap_buffers(c, win, attachments, nattachments);
 	msc = check_msc(dpy, win, msc);
 	clock_gettime(CLOCK_MONOTONIC, &start);
 	for (count = 0; count < COUNT; count++)
-		DRI2SwapBuffers(dpy, win, 0, 0, 0);
+		swap_buffers(c, win, attachments, nattachments);
 	msc = check_msc(dpy, win, msc);
 	clock_gettime(CLOCK_MONOTONIC, &end);
 	printf("%d %s (%dx%d) vblank=0 swaps in %fs.\n",
diff --git a/test/present-speed.c b/test/present-speed.c
new file mode 100644
index 0000000..1d3411b
--- /dev/null
+++ b/test/present-speed.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <X11/Xlib.h>
+#include <X11/Xatom.h>
+#include <X11/Xlib-xcb.h>
+#include <X11/xshmfence.h>
+#include <X11/Xutil.h>
+#include <X11/Xlibint.h>
+#include <X11/extensions/dpms.h>
+#include <X11/extensions/randr.h>
+#include <X11/extensions/Xrandr.h>
+#include <xcb/xcb.h>
+#include <xcb/present.h>
+#include <xf86drm.h>
+#include <i915_drm.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <setjmp.h>
+#include <signal.h>
+
+static int _x_error_occurred;
+static uint32_t stamp;
+
+static int
+_check_error_handler(Display     *display,
+		     XErrorEvent *event)
+{
+	printf("X11 error from display %s, serial=%ld, error=%d, req=%d.%d\n",
+	       DisplayString(display),
+	       event->serial,
+	       event->error_code,
+	       event->request_code,
+	       event->minor_code);
+	_x_error_occurred++;
+	return False; /* ignored */
+}
+
+static void *setup_msc(Display *dpy, Window win)
+{
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	uint32_t id = xcb_generate_id(c);
+
+	xcb_present_select_input(c, id, win, XCB_PRESENT_EVENT_MASK_COMPLETE_NOTIFY);
+	return xcb_register_for_special_xge(c, &xcb_present_id, id, &stamp);
+}
+
+static void teardown_msc(Display *dpy, void *q)
+{
+	xcb_unregister_for_special_event(XGetXCBConnection(dpy), q);
+}
+
+static double elapsed(const struct timespec *start,
+		      const struct timespec *end)
+{
+	return 1e6*(end->tv_sec - start->tv_sec) + (end->tv_nsec - start->tv_nsec)/1000;
+}
+
+static void run(Display *dpy, Window win, const char *name)
+{
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	struct timespec start, end;
+	Pixmap pixmap[4];
+	int busy[4];
+	Window root;
+	unsigned int width, height;
+	unsigned border, depth;
+	int i, j, n, back = 0;
+	int completed = 0;
+	void *Q;
+
+	XGetGeometry(dpy, win,
+		     &root, &i, &j, &width, &height, &border, &depth);
+
+	_x_error_occurred = 0;
+
+	for (n = 0; n < 4; n++) {
+		pixmap[n] = XCreatePixmap(dpy, win, width, height, depth);
+		busy[n] = 0;
+	}
+
+	Q = setup_msc(dpy, win);
+	clock_gettime(CLOCK_MONOTONIC, &start);
+	do {
+		for (n = 0; n < 1000; n++) {
+			Pixmap p = 0;
+			for (i = 0; i < 4; i++) {
+				j = (back + i) % 4;
+				if (!busy[j]) {
+					p = pixmap[j];
+					break;
+				}
+			}
+			if (p == 0) {
+				xcb_present_complete_notify_event_t *ce;
+				xcb_generic_event_t *ev;
+
+				ev = xcb_wait_for_special_event(c, Q);
+				if (ev == NULL)
+					abort();
+
+				do {
+					ce = (xcb_present_complete_notify_event_t *)ev;
+					if (ce->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP) {
+						completed++;
+						busy[ce->serial] = 0;
+						if (p == 0)
+							p = pixmap[j = ce->serial];
+					}
+					free(ev);
+				} while ((ev = xcb_poll_for_special_event(c, Q)));
+			}
+
+			back = j;
+			busy[back] = 1;
+
+			xcb_present_pixmap(c, win, p, back++,
+					   0, /* valid */
+					   0, /* update */
+					   0, /* x_off */
+					   0, /* y_off */
+					   None,
+					   None, /* wait fence */
+					   None,
+					   XCB_PRESENT_OPTION_ASYNC,
+					   0, /* target msc */
+					   0, /* divisor */
+					   0, /* remainder */
+					   0, NULL);
+			xcb_flush(c);
+		}
+		clock_gettime(CLOCK_MONOTONIC, &end);
+	} while (end.tv_sec < start.tv_sec + 10);
+
+	for (n = 0; n < 4; n++)
+		XFreePixmap(dpy, pixmap[n]);
+
+	XSync(dpy, True);
+	teardown_msc(dpy, Q);
+	if (_x_error_occurred)
+		abort();
+
+	printf("%s: Completed %d presents in %.1fs, %.3fus each (%.1f FPS)\n",
+	       name, completed, elapsed(&start, &end) / 1000000,
+	       elapsed(&start, &end) / completed,
+	       completed / (elapsed(&start, &end) / 1000000));
+}
+
+static int has_present(Display *dpy)
+{
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	xcb_generic_error_t *error = NULL;
+	void *reply;
+
+	reply = xcb_present_query_version_reply(c,
+						xcb_present_query_version(c,
+									  XCB_PRESENT_MAJOR_VERSION,
+									  XCB_PRESENT_MINOR_VERSION),
+						&error);
+
+	free(reply);
+	free(error);
+	if (reply == NULL) {
+		fprintf(stderr, "Present not supported on %s\n", DisplayString(dpy));
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline XRRScreenResources *_XRRGetScreenResourcesCurrent(Display *dpy, Window window)
+{
+	XRRScreenResources *res;
+
+	res = XRRGetScreenResourcesCurrent(dpy, window);
+	if (res == NULL)
+		res = XRRGetScreenResources(dpy, window);
+
+	return res;
+}
+
+static XRRModeInfo *lookup_mode(XRRScreenResources *res, int id)
+{
+	int i;
+
+	for (i = 0; i < res->nmode; i++) {
+		if (res->modes[i].id == id)
+			return &res->modes[i];
+	}
+
+	return NULL;
+}
+
+static void fullscreen(Display *dpy, Window win)
+{
+	Atom atom = XInternAtom(dpy, "_NET_WM_STATE_FULLSCREEN", False);
+	XChangeProperty(dpy, win,
+			XInternAtom(dpy, "_NET_WM_STATE", False),
+			XA_ATOM, 32, PropModeReplace,
+			(unsigned char *)&atom, 1);
+}
+
+int main(void)
+{
+	Display *dpy;
+	Window root, win;
+	XRRScreenResources *res;
+	XRRCrtcInfo **original_crtc;
+	XSetWindowAttributes attr;
+	int i, j;
+
+	attr.override_redirect = 1;
+
+	dpy = XOpenDisplay(NULL);
+	if (dpy == NULL)
+		return 77;
+
+	if (!has_present(dpy))
+		return 77;
+
+	if (DPMSQueryExtension(dpy, &i, &i))
+		DPMSDisable(dpy);
+
+	root = DefaultRootWindow(dpy);
+
+	signal(SIGALRM, SIG_IGN);
+	XSetErrorHandler(_check_error_handler);
+
+	res = NULL;
+	if (XRRQueryVersion(dpy, &i, &i))
+		res = _XRRGetScreenResourcesCurrent(dpy, root);
+	if (res == NULL)
+		return 77;
+
+	original_crtc = malloc(sizeof(XRRCrtcInfo *)*res->ncrtc);
+	for (i = 0; i < res->ncrtc; i++)
+		original_crtc[i] = XRRGetCrtcInfo(dpy, res, res->crtcs[i]);
+
+	printf("noutput=%d, ncrtc=%d\n", res->noutput, res->ncrtc);
+	for (i = 0; i < res->ncrtc; i++)
+		XRRSetCrtcConfig(dpy, res, res->crtcs[i], CurrentTime,
+				 0, 0, None, RR_Rotate_0, NULL, 0);
+
+	run(dpy, root, "off");
+
+	for (i = 0; i < res->noutput; i++) {
+		XRROutputInfo *output;
+		XRRModeInfo *mode;
+
+		output = XRRGetOutputInfo(dpy, res, res->outputs[i]);
+		if (output == NULL)
+			continue;
+		/* An output with no modes has no modes[0] to read; leave mode NULL. */
+		mode = NULL;
+		if (res->nmode && output->nmode)
+			mode = lookup_mode(res, output->modes[0]);
+
+		for (j = 0; mode && j < 2*output->ncrtc; j++) {
+			int c = j;
+			if (c >= output->ncrtc)
+				c = 2*output->ncrtc - j - 1;
+
+			printf("[%d, %d] -- OUTPUT:%ld, CRTC:%ld: %dx%d\n",
+			       i, c, (long)res->outputs[i], (long)output->crtcs[c],
+			       mode->width, mode->height);
+			XRRSetCrtcConfig(dpy, res, output->crtcs[c], CurrentTime,
+					 0, 0, output->modes[0], RR_Rotate_0, &res->outputs[i], 1);
+
+			run(dpy, root, "root");
+
+			win = XCreateWindow(dpy, root,
+					    0, 0, mode->width, mode->height, 0,
+					    DefaultDepth(dpy, DefaultScreen(dpy)),
+					    InputOutput,
+					    DefaultVisual(dpy, DefaultScreen(dpy)),
+					    CWOverrideRedirect, &attr);
+			fullscreen(dpy, win);
+			XMapWindow(dpy, win);
+			run(dpy, win, "fullscreen");
+			XDestroyWindow(dpy, win);
+
+			win = XCreateWindow(dpy, root,
+					    0, 0, mode->width, mode->height, 0,
+					    DefaultDepth(dpy, DefaultScreen(dpy)),
+					    InputOutput,
+					    DefaultVisual(dpy, DefaultScreen(dpy)),
+					    CWOverrideRedirect, &attr);
+			XMapWindow(dpy, win);
+			run(dpy, win, "windowed");
+			XDestroyWindow(dpy, win);
+
+			win = XCreateWindow(dpy, root,
+					    0, 0, mode->width/2, mode->height/2, 0,
+					    DefaultDepth(dpy, DefaultScreen(dpy)),
+					    InputOutput,
+					    DefaultVisual(dpy, DefaultScreen(dpy)),
+					    CWOverrideRedirect, &attr);
+			XMapWindow(dpy, win);
+			run(dpy, win, "half");
+			XDestroyWindow(dpy, win);
+
+			XRRSetCrtcConfig(dpy, res, output->crtcs[c], CurrentTime,
+					 0, 0, None, RR_Rotate_0, NULL, 0);
+		}
+
+		XRRFreeOutputInfo(output);
+	}
+
+	for (i = 0; i < res->ncrtc; i++)
+		XRRSetCrtcConfig(dpy, res, res->crtcs[i], CurrentTime,
+				 original_crtc[i]->x,
+				 original_crtc[i]->y,
+				 original_crtc[i]->mode,
+				 original_crtc[i]->rotation,
+				 original_crtc[i]->outputs,
+				 original_crtc[i]->noutput);
+
+	if (DPMSQueryExtension(dpy, &i, &i))
+		DPMSEnable(dpy);
+	return 0;
+}
diff --git a/test/present-test.c b/test/present-test.c
index 6b562eb..a4cadc2 100644
--- a/test/present-test.c
+++ b/test/present-test.c
@@ -31,6 +31,7 @@
 #include <X11/xshmfence.h>
 #include <X11/Xutil.h>
 #include <X11/Xlibint.h>
+#include <X11/extensions/dpms.h>
 #include <X11/extensions/randr.h>
 #include <X11/extensions/Xrandr.h>
 #include <X11/extensions/Xrender.h>
@@ -44,6 +45,8 @@
 #endif
 #include <xcb/xcb.h>
 #include <xcb/present.h>
+#include <xcb/xfixes.h>
+#include <xcb/dri3.h>
 #include <xf86drm.h>
 #include <i915_drm.h>
 
@@ -134,12 +137,14 @@ static void *setup_msc(Display *dpy,  Window win)
 	return q;
 }
 
-static uint64_t check_msc(Display *dpy, Window win, void *q, uint64_t last_msc)
+static uint64_t check_msc(Display *dpy, Window win, void *q, uint64_t last_msc, uint64_t *ust)
 {
 	xcb_connection_t *c = XGetXCBConnection(dpy);
+	static uint32_t serial = 1;
 	uint64_t msc = 0;
+	int complete = 0;
 
-	xcb_present_notify_msc(c, win, 0, 0, 0, 0);
+	xcb_present_notify_msc(c, win, serial ^ 0xcc00ffee, 0, 0, 0);
 	xcb_flush(c);
 
 	do {
@@ -151,25 +156,75 @@ static uint64_t check_msc(Display *dpy, Window win, void *q, uint64_t last_msc)
 			break;
 
 		ce = (xcb_present_complete_notify_event_t *)ev;
-		if (ce->kind != XCB_PRESENT_COMPLETE_KIND_PIXMAP)
+		if (ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC &&
+		    ce->serial == (serial ^ 0xcc00ffee)) {
 			msc = ce->msc;
+			if (ust)
+				*ust = ce->ust;
+			complete = 1;
+		}
 		free(ev);
-	} while (msc == 0);
+	} while (!complete);
 
-	if (msc < last_msc) {
+	if ((int64_t)(msc - last_msc) < 0) {
 		printf("Invalid MSC: was %llu, now %llu\n",
 		       (long long)last_msc, (long long)msc);
 	}
 
+	if (++serial == 0)
+		serial = 1;
+
 	return msc;
 }
 
+static uint64_t msc_interval(Display *dpy, Window win, void *q)
+{
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	uint64_t msc, ust;
+	int complete = 0;
+	/* Returns the average UST delta per MSC between two notifies requested 10 MSCs apart. */
+	msc = check_msc(dpy, win, q, 0, NULL);
+	/* The first notify targets the current MSC, the second one targets 10 MSCs later. */
+	xcb_present_notify_msc(c, win, 0xc0ffee00, msc, 0, 0);
+	xcb_present_notify_msc(c, win, 0xc0ffee01, msc + 10, 0, 0);
+	xcb_flush(c);
+	/* Accumulate (second - first) in msc/ust; the subtraction and addition work in either order. */
+	ust = msc = 0;
+	do {
+		xcb_present_complete_notify_event_t *ce;
+		xcb_generic_event_t *ev;
+
+		ev = xcb_wait_for_special_event(c, q);
+		if (ev == NULL)
+			break;
+
+		ce = (xcb_present_complete_notify_event_t *)ev;
+		if (ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC &&
+		    ce->serial == 0xc0ffee00) {
+			msc -= ce->msc;
+			ust -= ce->ust;
+			complete++;
+		}
+		if (ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC &&
+		    ce->serial == 0xc0ffee01) {
+			msc += ce->msc;
+			ust += ce->ust;
+			complete++;
+		}
+		free(ev);
+	} while (complete != 2);
+	/* Rounded division; a zero MSC delta (no events, or identical MSCs) yields 0 instead of trapping. */
+	return msc ? (ust + msc/2) / msc : 0;
+}
+
 static void teardown_msc(Display *dpy, void *q)
 {
 	xcb_unregister_for_special_event(XGetXCBConnection(dpy), q);
 }
+
 static int test_whole(Display *dpy)
 {
+	xcb_connection_t *c = XGetXCBConnection(dpy);
 	Pixmap pixmap;
 	struct dri3_fence fence;
 	Window root;
@@ -189,9 +244,7 @@ static int test_whole(Display *dpy)
 	xshmfence_reset(fence.addr);
 
 	pixmap = XCreatePixmap(dpy, root, width, height, depth);
-	xcb_present_pixmap(XGetXCBConnection(dpy),
-			   root, pixmap,
-			   0, /* sbc */
+	xcb_present_pixmap(c, root, pixmap, 0,
 			   0, /* valid */
 			   0, /* update */
 			   0, /* x_off */
@@ -207,9 +260,7 @@ static int test_whole(Display *dpy)
 	XFreePixmap(dpy, pixmap);
 
 	pixmap = XCreatePixmap(dpy, root, width, height, depth);
-	xcb_present_pixmap(XGetXCBConnection(dpy),
-			   root, pixmap,
-			   0, /* sbc */
+	xcb_present_pixmap(c, root, pixmap, 0,
 			   0, /* valid */
 			   0, /* update */
 			   0, /* x_off */
@@ -234,6 +285,889 @@ static int test_whole(Display *dpy)
 	return ret;
 }
 
+static uint64_t flush_flips(Display *dpy, Window win, Pixmap pixmap, void *Q, uint64_t *ust)
+{
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	uint64_t msc;
+	int complete;
+	/* Present pixmap 60 MSCs ahead of the current MSC and wait for that present to complete. */
+	msc = check_msc(dpy, win, Q, 0, NULL);
+	xcb_present_pixmap(c, win, pixmap,
+			   0xdeadbeef, /* serial */
+			   0, /* valid */
+			   0, /* update */
+			   0, /* x_off */
+			   0, /* y_off */
+			   None,
+			   None, /* wait fence */
+			   None,
+			   XCB_PRESENT_OPTION_NONE,
+			   msc + 60, /* target msc */
+			   0, /* divisor */
+			   0, /* remainder */
+			   0, NULL);
+	xcb_flush(c);
+	complete = 0;
+	do {
+		xcb_present_complete_notify_event_t *ce;
+		xcb_generic_event_t *ev;
+
+		ev = xcb_wait_for_special_event(c, Q);
+		if (ev == NULL)
+			break;
+
+		ce = (xcb_present_complete_notify_event_t *)ev;
+		complete = (ce->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP &&
+			    ce->serial == 0xdeadbeef);
+		free(ev);
+	} while (!complete);
+	XSync(dpy, True);
+	/* Sample the MSC again (checked against the earlier value); its UST is stored via ust if non-NULL. */
+	return check_msc(dpy, win, Q, msc, ust);
+}
+
+static int test_double(Display *dpy, void *Q)
+{
+#define COUNT (15*60)
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	Pixmap pixmap;
+	Window root;
+	unsigned int width, height;
+	unsigned border, depth;
+	int x, y, n, ret;
+	struct {
+		uint64_t msc, ust;
+	} frame[COUNT+1] = {{0, 0}}; /* zeroed: entries stay unset if the event wait fails */
+	int offset = 0;
+	/* Present the same pixmap COUNT+1 times back to back, recording each completion's msc/ust. */
+	XGetGeometry(dpy, DefaultRootWindow(dpy),
+		     &root, &x, &y, &width, &height, &border, &depth);
+
+	printf("Testing whole screen flip double buffering: %dx%d\n", width, height);
+	_x_error_occurred = 0;
+
+	pixmap = XCreatePixmap(dpy, root, width, height, depth);
+	flush_flips(dpy, root, pixmap, Q, NULL);
+	for (n = 0; n <= COUNT; n++) {
+		int complete;
+
+		xcb_present_pixmap(c, root, pixmap, n,
+				   0, /* valid */
+				   0, /* update */
+				   0, /* x_off */
+				   0, /* y_off */
+				   None,
+				   None, /* wait fence */
+				   None,
+				   XCB_PRESENT_OPTION_NONE,
+				   0, /* target msc */
+				   0, /* divisor */
+				   0, /* remainder */
+				   0, NULL);
+		xcb_flush(c);
+
+		complete = 0;
+		do {
+			xcb_present_complete_notify_event_t *ce;
+			xcb_generic_event_t *ev;
+
+			ev = xcb_wait_for_special_event(c, Q);
+			if (ev == NULL)
+				break;
+
+			ce = (xcb_present_complete_notify_event_t *)ev;
+			if (ce->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP &&
+			    ce->serial == n) {
+				frame[n].msc = ce->msc;
+				frame[n].ust = ce->ust;
+				complete = 1;
+			}
+			free(ev);
+		} while (!complete);
+	}
+	XFreePixmap(dpy, pixmap);
+
+	XSync(dpy, True);
+	ret = !!_x_error_occurred;
+	/* COUNT presents are expected to span exactly COUNT MSCs; otherwise list where the drift changes. */
+	if (frame[COUNT].msc - frame[0].msc != COUNT) {
+		printf("Expected %d frames interval, %d elapsed instead\n",
+		       COUNT, (int)(frame[COUNT].msc - frame[0].msc));
+		for (n = 0; n <= COUNT; n++) {
+			if (frame[n].msc - frame[0].msc != n + offset) {
+				printf("frame[%d]: msc=%03lld, ust=%lld\n", n,
+				       (long long)(frame[n].msc - frame[0].msc),
+				       (long long)(frame[n].ust - frame[0].ust));
+				offset = frame[n].msc - frame[0].msc - n;
+				ret++;
+			}
+		}
+	}
+
+	return ret;
+}
+
+static int test_future(Display *dpy, void *Q)
+{
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	Pixmap pixmap;
+	struct dri3_fence fence;
+	Window root;
+	unsigned int width, height;
+	unsigned border, depth;
+	int x, y, ret = 0, n;
+	uint64_t msc, ust;
+	int complete;
+	int early = 0, late = 0;
+	int earliest = 0, latest = 0;
+	uint64_t interval;
+	/* Queue ten flips 15*60 MSCs apart plus a 0xdeadbeef sentinel, then check when each completes. */
+	XGetGeometry(dpy, DefaultRootWindow(dpy),
+		     &root, &x, &y, &width, &height, &border, &depth);
+	/* No present below uses the fence; failing to create one skips the test with a 0 return. */
+	if (dri3_create_fence(dpy, root, &fence))
+		return 0;
+
+	printf("Testing whole screen flips into the future: %dx%d\n", width, height);
+	_x_error_occurred = 0;
+
+	interval = msc_interval(dpy, root, Q);
+
+	pixmap = XCreatePixmap(dpy, root, width, height, depth);
+	msc = flush_flips(dpy, root, pixmap, Q, &ust);
+	for (n = 1; n <= 10; n++)
+		xcb_present_pixmap(c, root, pixmap,
+				   n, /* serial */
+				   0, /* valid */
+				   0, /* update */
+				   0, /* x_off */
+				   0, /* y_off */
+				   None,
+				   None, /* wait fence */
+				   None,
+				   XCB_PRESENT_OPTION_NONE,
+				   msc + 60 + n*15*60, /* target msc */
+				   0, /* divisor */
+				   0, /* remainder */
+				   0, NULL);
+	xcb_present_pixmap(c, root, pixmap,
+			   0xdeadbeef, /* serial */
+			   0, /* valid */
+			   0, /* update */
+			   0, /* x_off */
+			   0, /* y_off */
+			   None,
+			   None, /* wait fence */
+			   None,
+			   XCB_PRESENT_OPTION_NONE,
+			   msc + 60 + n*15*60, /* target msc */
+			   0, /* divisor */
+			   0, /* remainder */
+			   0, NULL);
+	xcb_flush(c);
+	/* n is 11 after the loop, so the sentinel is queued one step beyond the last numbered flip. */
+	complete = 0;
+	do {
+		xcb_present_complete_notify_event_t *ce;
+		xcb_generic_event_t *ev;
+
+		ev = xcb_wait_for_special_event(c, Q);
+		if (ev == NULL)
+			break;
+
+		ce = (xcb_present_complete_notify_event_t *)ev;
+		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP);
+
+		if (ce->serial == 0xdeadbeef) {
+			int64_t time;
+			/* Compare the sentinel's UST with the start UST plus the expected number of intervals. */
+			time = ce->ust - (ust + (60 + 15*60*n) * interval);
+			if (time < -(int64_t)interval) {
+				fprintf(stderr,
+					"\tflips completed too early by %lldms\n",
+					(long long)(-time / 1000));
+			} else if (time > (int64_t)interval) {
+				fprintf(stderr,
+					"\tflips completed too late by %lldms\n",
+					(long long)(time / 1000));
+			}
+			complete = 1;
+		} else {
+			int diff = (int64_t)(ce->msc - (15*60*ce->serial + msc + 60));
+			if (diff < 0) {
+				if (-diff > earliest) {
+					fprintf(stderr, "\tframe %d displayed early by %d frames\n", ce->serial, -diff);
+					earliest = -diff;
+				}
+				early++;
+				ret++;
+			} else if (diff > 0) {
+				if (diff > latest) {
+					fprintf(stderr, "\tframe %d displayed late by %d frames\n", ce->serial, diff);
+					latest = diff;
+				}
+				late++;
+				ret++;
+			}
+		}
+		free(ev);
+	} while (!complete);
+	XFreePixmap(dpy, pixmap); /* previously never released */
+	if (early)
+		printf("\t%d frames shown too early (worst %d)!\n", early, earliest);
+	if (late)
+		printf("\t%d frames shown too late (worst %d)!\n", late, latest);
+	dri3_fence_free(dpy, &fence); /* pairs with dri3_create_fence() above */
+	ret += !!_x_error_occurred;
+
+	return ret;
+}
+
+static int test_exhaustion(Display *dpy, void *Q)
+{
+#define N_VBLANKS 256 /* kernel event queue length: 128 vblanks */
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	Pixmap pixmap;
+	struct dri3_fence fence[2];
+	Window root;
+	xcb_xfixes_region_t region;
+	unsigned int width, height;
+	unsigned border, depth;
+	int x, y, ret = 0, n;
+	uint64_t target, final;
+	/* Queue 2*N_VBLANKS+1 presents, then use two fences to check when the first and last targets land. */
+	XGetGeometry(dpy, DefaultRootWindow(dpy),
+		     &root, &x, &y, &width, &height, &border, &depth);
+
+	if (dri3_create_fence(dpy, root, &fence[0]) ||
+	    dri3_create_fence(dpy, root, &fence[1]))
+		return 0;
+
+	printf("Testing whole screen flips with long vblank queues: %dx%d\n", width, height);
+	_x_error_occurred = 0;
+	/* The region is created with zero rectangles and passed as the valid/update area below. */
+	region = xcb_generate_id(c);
+	xcb_xfixes_create_region(c, region, 0, NULL);
+
+	pixmap = XCreatePixmap(dpy, root, width, height, depth);
+	xshmfence_reset(fence[0].addr);
+	xshmfence_reset(fence[1].addr);
+	target = check_msc(dpy, root, Q, 0, NULL);
+	for (n = N_VBLANKS; n--; )
+		xcb_present_pixmap(c, root, pixmap, 0,
+				   0, /* valid */
+				   region, /* update */
+				   0, /* x_off */
+				   0, /* y_off */
+				   None,
+				   None, /* wait fence */
+				   None,
+				   XCB_PRESENT_OPTION_NONE,
+				   target + N_VBLANKS, /* target msc */
+				   1, /* divisor */
+				   0, /* remainder */
+				   0, NULL);
+	xcb_present_pixmap(c, root, pixmap, 0,
+			   region, /* valid */
+			   region, /* update */
+			   0, /* x_off */
+			   0, /* y_off */
+			   None,
+			   None, /* wait fence */
+			   fence[0].xid,
+			   XCB_PRESENT_OPTION_NONE,
+			   target, /* target msc */
+			   0, /* divisor */
+			   0, /* remainder */
+			   0, NULL);
+	for (n = 1; n < N_VBLANKS; n++)
+		xcb_present_pixmap(c, root, pixmap, 0,
+				   region, /* valid */
+				   region, /* update */
+				   0, /* x_off */
+				   0, /* y_off */
+				   None,
+				   None, /* wait fence */
+				   None,
+				   XCB_PRESENT_OPTION_NONE,
+				   target + n, /* target msc */
+				   0, /* divisor */
+				   0, /* remainder */
+				   0, NULL);
+	xcb_present_pixmap(c, root, pixmap, 0,
+			   region, /* valid */
+			   region, /* update */
+			   0, /* x_off */
+			   0, /* y_off */
+			   None,
+			   None, /* wait fence */
+			   fence[1].xid,
+			   XCB_PRESENT_OPTION_NONE,
+			   target + N_VBLANKS, /* target msc */
+			   0, /* divisor */
+			   0, /* remainder */
+			   0, NULL);
+	xcb_flush(c);
+	/* fence[0] was attached to the present targeting `target`; one MSC of lateness is tolerated. */
+	ret += !!xshmfence_await(fence[0].addr);
+	final = check_msc(dpy, root, Q, 0, NULL);
+	if (final < target) {
+		printf("\tFirst flip too early, MSC was %llu, expected %llu\n",
+		       (long long)final, (long long)target);
+		ret++;
+	} else if (final > target + 1) {
+		printf("\tFirst flip too late, MSC was %llu, expected %llu\n",
+		       (long long)final, (long long)target);
+		ret++;
+	}
+	/* fence[1] was attached to the present targeting target + N_VBLANKS. */
+	ret += !!xshmfence_await(fence[1].addr);
+	final = check_msc(dpy, root, Q, 0, NULL);
+	if (final < target + N_VBLANKS) {
+		printf("\tLast flip too early, MSC was %llu, expected %llu\n",
+		       (long long)final, (long long)(target + N_VBLANKS));
+		ret++;
+	} else if (final > target + N_VBLANKS + 1) {
+		printf("\tLast flip too late, MSC was %llu, expected %llu\n",
+		       (long long)final, (long long)(target + N_VBLANKS));
+		ret++;
+	}
+
+	flush_flips(dpy, root, pixmap, Q, NULL);
+
+	XFreePixmap(dpy, pixmap);
+	xcb_xfixes_destroy_region(c, region);
+	dri3_fence_free(dpy, &fence[1]);
+	dri3_fence_free(dpy, &fence[0]);
+
+	XSync(dpy, True);
+	ret += !!_x_error_occurred;
+
+	return ret;
+#undef N_VBLANKS
+}
+
+static int test_accuracy(Display *dpy, void *Q)
+{
+#define N_VBLANKS (60 * 120) /* ~2 minutes */
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	Pixmap pixmap;
+	Window root;
+	unsigned int width, height;
+	unsigned border, depth;
+	int x, y, ret = 0, n;
+	uint64_t target;
+	int early = 0, late = 0;
+	int earliest = 0, latest = 0;
+	int complete;
+	/* Queue one present for each of N_VBLANKS+1 consecutive MSCs and compare every completion's msc. */
+	XGetGeometry(dpy, DefaultRootWindow(dpy),
+		     &root, &x, &y, &width, &height, &border, &depth);
+
+	printf("Testing whole screen flip accuracy: %dx%d\n", width, height);
+	_x_error_occurred = 0;
+
+	pixmap = XCreatePixmap(dpy, root, width, height, depth);
+	target = flush_flips(dpy, root, pixmap, Q, NULL);
+	for (n = 0; n <= N_VBLANKS; n++)
+		xcb_present_pixmap(c, root, pixmap,
+				   n, /* serial */
+				   0, /* valid */
+				   0, /* update */
+				   0, /* x_off */
+				   0, /* y_off */
+				   None,
+				   None, /* wait fence */
+				   None,
+				   XCB_PRESENT_OPTION_NONE,
+				   target + 60 + n, /* target msc */
+				   0, /* divisor */
+				   0, /* remainder */
+				   0, NULL);
+	xcb_present_pixmap(c, root, pixmap,
+			   0xdeadbeef, /* serial */
+			   0, /* valid */
+			   0, /* update */
+			   0, /* x_off */
+			   0, /* y_off */
+			   None,
+			   None, /* wait fence */
+			   None,
+			   XCB_PRESENT_OPTION_NONE,
+			   target + 60 + n, /* target msc */
+			   0, /* divisor */
+			   0, /* remainder */
+			   0, NULL);
+	xcb_flush(c);
+	/* Serial n targeted target + 60 + n, so any non-zero diff counts as an early or late frame. */
+	complete = 0;
+	do {
+		xcb_present_complete_notify_event_t *ce;
+		xcb_generic_event_t *ev;
+
+		ev = xcb_wait_for_special_event(c, Q);
+		if (ev == NULL)
+			break;
+
+		ce = (xcb_present_complete_notify_event_t *)ev;
+		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP);
+
+		if (ce->serial != 0xdeadbeef) {
+			int diff = (int64_t)(ce->msc - (target + ce->serial + 60));
+			if (diff < 0) {
+				if (-diff > earliest) {
+					fprintf(stderr, "\tframe %d displayed early by %d frames\n", ce->serial, -diff);
+					earliest = -diff;
+				}
+				early++;
+				ret++;
+			} else if (diff > 0) {
+				if (diff > latest) {
+					fprintf(stderr, "\tframe %d displayed late by %d frames\n", ce->serial, diff);
+					latest = diff;
+				}
+				late++;
+				ret++;
+			}
+		} else
+			complete = 1;
+		free(ev);
+	} while (!complete);
+
+	if (early)
+		printf("\t%d frames shown too early (worst %d)!\n", early, earliest);
+	if (late)
+		printf("\t%d frames shown too late (worst %d)!\n", late, latest);
+
+	XFreePixmap(dpy, pixmap);
+
+	XSync(dpy, True);
+	ret += !!_x_error_occurred;
+
+	return ret;
+#undef N_VBLANKS
+}
+
+static int test_modulus(Display *dpy, void *Q)
+{
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	Pixmap pixmap;
+	Window root;
+	unsigned int width, height;
+	unsigned border, depth;
+	xcb_xfixes_region_t region;
+	int x, y, ret = 0;
+	uint64_t target;
+	int early = 0, late = 0;
+	int earliest = 0, latest = 0;
+	int complete;
+	/* Queue a present for every (divisor x, remainder y) pair with x in 1..7 and check when each completes. */
+	XGetGeometry(dpy, DefaultRootWindow(dpy),
+		     &root, &x, &y, &width, &height, &border, &depth);
+
+	printf("Testing whole screen flip modulus: %dx%d\n", width, height);
+	_x_error_occurred = 0;
+
+	region = xcb_generate_id(c);
+	xcb_xfixes_create_region(c, region, 0, NULL);
+
+	pixmap = XCreatePixmap(dpy, root, width, height, depth);
+	target = flush_flips(dpy, root, pixmap, Q, NULL);
+	for (x = 1; x <= 7; x++) {
+		for (y = 0; y < x; y++) {
+			xcb_present_pixmap(c, root, pixmap,
+					   y << 16 | x, /* serial */
+					   region, /* valid */
+					   region, /* update */
+					   0, /* x_off */
+					   0, /* y_off */
+					   None,
+					   None, /* wait fence */
+					   None,
+					   XCB_PRESENT_OPTION_NONE,
+					   0, /* target msc */
+					   x, /* divisor */
+					   y, /* remainder */
+					   0, NULL);
+		}
+	}
+	xcb_present_pixmap(c, root, pixmap,
+			   0xdeadbeef, /* serial */
+			   0, /* valid */
+			   0, /* update */
+			   0, /* x_off */
+			   0, /* y_off */
+			   None,
+			   None, /* wait fence */
+			   None,
+			   XCB_PRESENT_OPTION_NONE,
+			   target + 2*x, /* target msc */
+			   0, /* divisor */
+			   0, /* remainder */
+			   0, NULL);
+	xcb_flush(c);
+
+	complete = 0;
+	do {
+		xcb_present_complete_notify_event_t *ce;
+		xcb_generic_event_t *ev;
+
+		ev = xcb_wait_for_special_event(c, Q);
+		if (ev == NULL)
+			break;
+
+		ce = (xcb_present_complete_notify_event_t *)ev;
+		if (ce->kind != XCB_PRESENT_COMPLETE_KIND_PIXMAP)
+			break; /* NOTE(review): ev is not freed on this path */
+
+		assert(ce->serial);
+		if (ce->serial != 0xdeadbeef) {
+			uint64_t msc;
+			int diff;
+			/* The serial carries the divisor in its low 16 bits and the remainder above them. */
+			x = ce->serial & 0xffff;
+			y = ce->serial >> 16;
+			/* Expected msc: the first value after target that is congruent to y modulo x. */
+			msc = target;
+			msc -= target % x;
+			msc += y;
+			if (msc <= target)
+				msc += x;
+
+			diff = (int64_t)(ce->msc - msc);
+			if (diff < 0) {
+				if (-diff > earliest) {
+					fprintf(stderr, "\tframe (%d, %d) displayed early by %d frames\n", y, x, -diff);
+					earliest = -diff;
+				}
+				early++;
+				ret++;
+			} else if (diff > 0) {
+				if (diff > latest) {
+					fprintf(stderr, "\tframe (%d, %d) displayed late by %d frames\n", y, x, diff);
+					latest = diff;
+				}
+				late++;
+				ret++;
+			}
+		} else
+			complete = 1;
+		free(ev);
+	} while (!complete);
+
+	if (early)
+		printf("\t%d frames shown too early (worst %d)!\n", early, earliest);
+	if (late)
+		printf("\t%d frames shown too late (worst %d)!\n", late, latest);
+
+	XFreePixmap(dpy, pixmap);
+	xcb_xfixes_destroy_region(c, region);
+
+	XSync(dpy, True);
+	ret += !!_x_error_occurred;
+
+	return ret;
+}
+
+static int test_future_msc(Display *dpy, void *Q)
+{
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	Window root = DefaultRootWindow(dpy);
+	int ret = 0, n;
+	uint64_t msc, ust;
+	int complete;
+	int early = 0, late = 0;
+	int earliest = 0, latest = 0;
+	uint64_t interval;
+	/* Queue ten notifies 15*60 MSCs apart plus a 0xdeadbeef sentinel, then check when each fires. */
+	printf("Testing notifies into the future\n");
+	_x_error_occurred = 0;
+
+	interval = msc_interval(dpy, root, Q);
+	msc = check_msc(dpy, root, Q, 0, &ust);
+	/* The loop leaves n at 11, which places the sentinel one step after notify 10. */
+	for (n = 1; n <= 10; n++)
+		xcb_present_notify_msc(c, root, n, msc + 60 + n*15*60, 0, 0);
+	xcb_present_notify_msc(c, root, 0xdeadbeef, msc + 60 + n*15*60, 0, 0);
+	xcb_flush(c);
+
+	complete = 0;
+	do {
+		xcb_present_complete_notify_event_t *ce;
+		xcb_generic_event_t *ev;
+
+		ev = xcb_wait_for_special_event(c, Q);
+		if (ev == NULL)
+			break;
+
+		ce = (xcb_present_complete_notify_event_t *)ev;
+		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC);
+
+		if (ce->serial == 0xdeadbeef) {
+			int64_t time;
+			/* Compare the sentinel's UST with the start UST plus the expected number of intervals. */
+			time = ce->ust - (ust + (60 + 15*60*n) * interval);
+			if (time < -(int64_t)interval) {
+				fprintf(stderr,
+					"\tnotifies completed too early by %lldms\n",
+					(long long)(-time / 1000));
+			} else if (time > (int64_t)interval) {
+				fprintf(stderr,
+					"\tnotifies completed too late by %lldms\n",
+					(long long)(time / 1000));
+			}
+			complete = 1;
+		} else {
+			int diff = (int64_t)(ce->msc - (15*60*ce->serial + msc + 60));
+			if (diff < 0) {
+				if (-diff > earliest) {
+					fprintf(stderr, "\tnotify %d early by %d msc\n", ce->serial, -diff);
+					earliest = -diff;
+				}
+				early++;
+				ret++;
+			} else if (diff > 0) {
+				if (diff > latest) {
+					fprintf(stderr, "\tnotify %d late by %d msc\n", ce->serial, diff);
+					latest = diff;
+				}
+				late++;
+				ret++;
+			}
+		}
+		free(ev);
+	} while (!complete);
+
+	if (early)
+		printf("\t%d notifies too early (worst %d)!\n", early, earliest);
+	if (late)
+		printf("\t%d notifies too late (worst %d)!\n", late, latest);
+
+	XSync(dpy, True);
+	ret += !!_x_error_occurred;
+
+	return ret;
+}
+
+static int test_exhaustion_msc(Display *dpy, void *Q)
+{
+#define N_VBLANKS 256 /* kernel event queue length: 128 vblanks */
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	Window root = DefaultRootWindow(dpy);
+	int ret = 0, n, complete;
+	int earliest = 0, early = 0;
+	int latest = 0, late = 0;
+	uint64_t msc;
+	/* Queue N_VBLANKS notifies for msc + N_VBLANKS, then one notify for each of the next N_VBLANKS MSCs. */
+	printf("Testing notifies with long queues\n");
+	_x_error_occurred = 0;
+
+	msc = check_msc(dpy, root, Q, 0, NULL);
+	for (n = N_VBLANKS; n--; )
+		xcb_present_notify_msc(c, root, N_VBLANKS, msc + N_VBLANKS, 0, 0);
+	for (n = 1; n <= N_VBLANKS ; n++)
+		xcb_present_notify_msc(c, root, n, msc + n, 0, 0);
+	xcb_flush(c);
+	/* 2*N_VBLANKS notifies were queued; their events are counted down instead of using a sentinel. */
+	complete = 2*N_VBLANKS;
+	do {
+		xcb_present_complete_notify_event_t *ce;
+		xcb_generic_event_t *ev;
+		int diff;
+
+		ev = xcb_wait_for_special_event(c, Q);
+		if (ev == NULL)
+			break;
+
+		ce = (xcb_present_complete_notify_event_t *)ev;
+		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC);
+		/* Each serial equals its target's offset from msc, so diff is the error in MSCs. */
+		diff = (int64_t)(ce->msc - msc - ce->serial);
+		if (diff < 0) {
+			if (-diff > earliest) {
+				fprintf(stderr, "\tnotify %d early by %d msc\n",(int)ce->serial, -diff);
+				earliest = -diff;
+			}
+			early++;
+			ret++;
+		} else if (diff > 0) {
+			if (diff > latest) {
+				fprintf(stderr, "\tnotify %d late by %d msc\n", (int)ce->serial, diff);
+				latest = diff;
+			}
+			late++;
+			ret++;
+		}
+		free(ev);
+	} while (--complete);
+
+	if (early)
+		printf("\t%d notifies too early (worst %d)!\n", early, earliest);
+	if (late)
+		printf("\t%d notifies too late (worst %d)!\n", late, latest);
+
+	XSync(dpy, True);
+	ret += !!_x_error_occurred;
+
+	return ret;
+#undef N_VBLANKS
+}
+
+static int test_accuracy_msc(Display *dpy, void *Q)
+{
+#define N_VBLANKS (60 * 120) /* ~2 minutes */
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	Window root = DefaultRootWindow(dpy);
+	int ret = 0, n;
+	uint64_t msc;
+	int early = 0, late = 0;
+	int earliest = 0, latest = 0;
+	int complete;
+	/* Queue one notify for each of N_VBLANKS+1 consecutive MSCs and compare every reported msc. */
+	printf("Testing notify accuracy\n");
+	_x_error_occurred = 0;
+
+	msc = check_msc(dpy, root, Q, 0, NULL);
+	for (n = 0; n <= N_VBLANKS; n++)
+		xcb_present_notify_msc(c, root, n, msc + 60 + n, 0, 0);
+	xcb_present_notify_msc(c, root, 0xdeadbeef, msc + 60 + n, 0, 0);
+	xcb_flush(c);
+	/* Serial n targeted msc + 60 + n; the 0xdeadbeef notify, queued last, ends the loop. */
+	complete = 0;
+	do {
+		xcb_present_complete_notify_event_t *ce;
+		xcb_generic_event_t *ev;
+
+		ev = xcb_wait_for_special_event(c, Q);
+		if (ev == NULL)
+			break;
+
+		ce = (xcb_present_complete_notify_event_t *)ev;
+		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC);
+
+		if (ce->serial != 0xdeadbeef) {
+			int diff = (int64_t)(ce->msc - (msc + ce->serial + 60));
+			if (diff < 0) {
+				if (-diff > earliest) {
+					fprintf(stderr, "\tnotify %d early by %d msc\n", ce->serial, -diff);
+					earliest = -diff;
+				}
+				early++;
+				ret++;
+			} else if (diff > 0) {
+				if (diff > latest) {
+					fprintf(stderr, "\tnotify %d late by %d msc\n", ce->serial, diff);
+					latest = diff;
+				}
+				late++;
+				ret++;
+			}
+		} else
+			complete = 1;
+		free(ev);
+	} while (!complete);
+
+	if (early)
+		printf("\t%d notifies too early (worst %d)!\n", early, earliest);
+	if (late)
+		printf("\t%d notifies too late (worst %d)!\n", late, latest);
+
+	XSync(dpy, True);
+	ret += !!_x_error_occurred;
+
+	return ret;
+#undef N_VBLANKS
+}
+
+static int test_modulus_msc(Display *dpy, void *Q)
+{
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	Window root = DefaultRootWindow(dpy);
+	xcb_present_complete_notify_event_t *ce;
+	xcb_generic_event_t *ev;
+	int x, y, ret = 0;
+	uint64_t target;
+	int early = 0, late = 0;
+	int earliest = 0, latest = 0;
+	int complete;
+	/* Queue a notify for every (divisor x, remainder y) pair with x in 1..7 and check when each fires. */
+	printf("Testing notify modulus\n");
+	_x_error_occurred = 0;
+
+	target = check_msc(dpy, root, Q, 0, NULL);
+
+	xcb_present_notify_msc(c, root, 0, 0, 0, 0);
+	for (x = 1; x <= 7; x++) {
+		for (y = 0; y < x; y++) {
+			xcb_present_notify_msc(c, root, y << 16 | x, 0, x, y);
+		}
+	}
+	xcb_present_notify_msc(c, root, 0xdeadbeef, target + 2*x, 0, 0);
+	xcb_flush(c);
+	/* The first event is expected to be the serial-0 notify; its msc becomes the reference point. */
+	ev = xcb_wait_for_special_event(c, Q);
+	if (ev) {
+		ce = (xcb_present_complete_notify_event_t *)ev;
+		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC);
+		assert(ce->serial == 0);
+		target = ce->msc;
+		free(ev); /* this event was previously leaked */
+	}
+	complete = 0;
+	do {
+		ev = xcb_wait_for_special_event(c, Q);
+		if (ev == NULL)
+			break;
+
+		ce = (xcb_present_complete_notify_event_t *)ev;
+		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC);
+
+		assert(ce->serial);
+		if (ce->serial != 0xdeadbeef) {
+			uint64_t msc;
+			int diff;
+			/* The serial carries the divisor in its low 16 bits and the remainder above them. */
+			x = ce->serial & 0xffff;
+			y = ce->serial >> 16;
+			/* Expected msc: the first value after target that is congruent to y modulo x. */
+			msc = target;
+			msc -= target % x;
+			msc += y;
+			if (msc <= target)
+				msc += x;
+
+			diff = (int64_t)(ce->msc - msc);
+			if (diff < 0) {
+				if (-diff > earliest) {
+					fprintf(stderr, "\tnotify (%d, %d) early by %d msc (target %lld, reported %lld)\n", y, x, -diff, (long long)msc, (long long)ce->msc);
+					earliest = -diff;
+				}
+				early++;
+				ret++;
+			} else if (diff > 0) {
+				if (diff > latest) {
+					fprintf(stderr, "\tnotify (%d, %d) late by %d msc (target %lld, reported %lld)\n", y, x, diff, (long long)msc, (long long)ce->msc);
+					latest = diff;
+				}
+				late++;
+				ret++;
+			}
+		} else
+			complete = 1;
+		free(ev);
+	} while (!complete);
+
+	if (early)
+		printf("\t%d notifies too early (worst %d)!\n", early, earliest);
+	if (late)
+		printf("\t%d notifies too late (worst %d)!\n", late, latest);
+
+	XSync(dpy, True);
+	ret += !!_x_error_occurred;
+
+	return ret;
+}
+
 static inline XRRScreenResources *_XRRGetScreenResourcesCurrent(Display *dpy, Window window)
 {
 	XRRScreenResources *res;
@@ -279,8 +1213,6 @@ static int for_each_crtc(Display *dpy,
 	for (i = 0; i < res->ncrtc; i++)
 		original_crtc[i] = XRRGetCrtcInfo(dpy, res, res->crtcs[i]);
 
-	printf("noutput=%d, ncrtc=%d\n", res->noutput, res->ncrtc);
-
 	for (i = 0; i < res->noutput; i++) {
 		XRROutputInfo *output;
 		XRRModeInfo *mode;
@@ -322,7 +1254,7 @@ static int for_each_crtc(Display *dpy,
 	free(original_crtc);
 	XRRFreeScreenResources(res);
 
-	return j;
+	return err;
 }
 
 struct test_crtc {
@@ -335,6 +1267,7 @@ struct test_crtc {
 	uint64_t msc;
 };
 #define SYNC 0x1
+#define FUTURE 0x2
 
 static int __test_crtc(Display *dpy, RRCrtc crtc,
 		       int width, int height,
@@ -344,7 +1277,7 @@ static int __test_crtc(Display *dpy, RRCrtc crtc,
 	Pixmap pixmap;
 	int err = 0;
 
-	test->msc = check_msc(dpy, test->win, test->queue, test->msc);
+	test->msc = check_msc(dpy, test->win, test->queue, test->msc, NULL);
 
 	if (test->flags & SYNC)
 		xshmfence_reset(test->fence.addr);
@@ -361,16 +1294,14 @@ static int __test_crtc(Display *dpy, RRCrtc crtc,
 			   None, /* wait fence */
 			   test->flags & SYNC ? test->fence.xid : None,
 			   XCB_PRESENT_OPTION_NONE,
-			   0, /* target msc */
+			   test->msc, /* target msc */
 			   1, /* divisor */
 			   0, /* remainder */
 			   0, NULL);
-	XFreePixmap(dpy, pixmap);
-
 	if (test->flags & SYNC) {
-		pixmap = XCreatePixmap(dpy, test->win, width, height, test->depth);
+		Pixmap tmp = XCreatePixmap(dpy, test->win, width, height, test->depth);
 		xcb_present_pixmap(XGetXCBConnection(dpy),
-				   test->win, pixmap,
+				   test->win, tmp,
 				   1, /* sbc */
 				   0, /* valid */
 				   0, /* update */
@@ -380,16 +1311,17 @@ static int __test_crtc(Display *dpy, RRCrtc crtc,
 				   None, /* wait fence */
 				   None, /* sync fence */
 				   XCB_PRESENT_OPTION_NONE,
-				   1, /* target msc */
+				   test->msc + (test->flags & FUTURE ? 5 * 16 : 1), /* target msc */
 				   1, /* divisor */
 				   0, /* remainder */
 				   0, NULL);
-		XFreePixmap(dpy, pixmap);
+		XFreePixmap(dpy, tmp);
 		XFlush(dpy);
 		err += !!xshmfence_await(test->fence.addr);
 	}
+	XFreePixmap(dpy, pixmap);
 
-	test->msc = check_msc(dpy, test->win, test->queue, test->msc);
+	test->msc = check_msc(dpy, test->win, test->queue, test->msc, NULL);
 	return err;
 }
 
@@ -410,15 +1342,23 @@ static int test_crtc(Display *dpy, void *queue, uint64_t last_msc)
 
 	printf("Testing each crtc, without waiting for each flip\n");
 	test.flags = 0;
+	test.msc = check_msc(dpy, test.win, test.queue, test.msc, NULL);
 	err += for_each_crtc(dpy, __test_crtc, &test);
+	test.msc = check_msc(dpy, test.win, test.queue, test.msc, NULL);
 
 	printf("Testing each crtc, waiting for flips to complete\n");
 	test.flags = SYNC;
+	test.msc = check_msc(dpy, test.win, test.queue, test.msc, NULL);
 	err += for_each_crtc(dpy, __test_crtc, &test);
+	test.msc = check_msc(dpy, test.win, test.queue, test.msc, NULL);
 
-	test.msc = check_msc(dpy, test.win, test.queue, test.msc);
-	dri3_fence_free(dpy, &test.fence);
+	printf("Testing each crtc, with future flips\n");
+	test.flags = FUTURE | SYNC;
+	test.msc = check_msc(dpy, test.win, test.queue, test.msc, NULL);
+	err += for_each_crtc(dpy, __test_crtc, &test);
+	test.msc = check_msc(dpy, test.win, test.queue, test.msc, NULL);
 
+	dri3_fence_free(dpy, &test.fence);
 	XSync(dpy, True);
 	err += !!_x_error_occurred;
 
@@ -670,8 +1610,32 @@ fail:
 static int has_present(Display *dpy)
 {
 	xcb_connection_t *c = XGetXCBConnection(dpy);
-	xcb_present_query_version_reply_t *reply;
 	xcb_generic_error_t *error = NULL;
+	void *reply;
+
+	reply = xcb_xfixes_query_version_reply(c,
+					       xcb_xfixes_query_version(c,
+									XCB_XFIXES_MAJOR_VERSION,
+									XCB_XFIXES_MINOR_VERSION),
+					       &error);
+	free(reply);
+	free(error);
+	if (reply == NULL) {
+		fprintf(stderr, "XFixes not supported on %s\n", DisplayString(dpy));
+		return 0;
+	}
+
+	reply = xcb_dri3_query_version_reply(c,
+					     xcb_dri3_query_version(c,
+								    XCB_DRI3_MAJOR_VERSION,
+								    XCB_DRI3_MINOR_VERSION),
+					     &error);
+	free(reply);
+	free(error);
+	if (reply == NULL) {
+		fprintf(stderr, "DRI3 not supported on %s\n", DisplayString(dpy));
+		return 0;
+	}
 
 	reply = xcb_present_query_version_reply(c,
 						xcb_present_query_version(c,
@@ -681,14 +1645,19 @@ static int has_present(Display *dpy)
 
 	free(reply);
 	free(error);
+	if (reply == NULL) {
+		fprintf(stderr, "Present not supported on %s\n", DisplayString(dpy));
+		return 0;
+	}
 
-	return reply != NULL;
+	return 1;
 }
 
 int main(void)
 {
 	Display *dpy;
 	Window root;
+	int dummy;
 	int error = 0;
 	uint64_t last_msc;
 	void *queue;
@@ -700,27 +1669,59 @@ int main(void)
 	if (!has_present(dpy))
 		return 77;
 
+	if (DPMSQueryExtension(dpy, &dummy, &dummy))
+		DPMSDisable(dpy);
+
 	root = DefaultRootWindow(dpy);
 
 	signal(SIGALRM, SIG_IGN);
 	XSetErrorHandler(_check_error_handler);
 
 	queue = setup_msc(dpy, root);
-	last_msc = check_msc(dpy, root, queue, 0);
+	last_msc = check_msc(dpy, root, queue, 0, NULL);
+
+	error += test_future_msc(dpy, queue);
+	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
+
+	error += test_accuracy_msc(dpy, queue);
+	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
+
+	error += test_modulus_msc(dpy, queue);
+	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
+
+	error += test_exhaustion_msc(dpy, queue);
+	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
 
 	error += test_whole(dpy);
-	last_msc = check_msc(dpy, root, queue, last_msc);
+	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
+
+	error += test_double(dpy, queue);
+	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
+
+	error += test_future(dpy, queue);
+	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
+
+	error += test_accuracy(dpy, queue);
+	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
+
+	error += test_modulus(dpy, queue);
+	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
+
+	error += test_exhaustion(dpy, queue);
+	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
 
 	error += test_crtc(dpy, queue, last_msc);
-	last_msc = check_msc(dpy, root, queue, last_msc);
+	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
 
 	error += test_shm(dpy);
-	last_msc = check_msc(dpy, root, queue, last_msc);
+	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
 
 	error += test_dri3(dpy);
-	last_msc = check_msc(dpy, root, queue, last_msc);
+	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
 
 	teardown_msc(dpy, queue);
 
+	if (DPMSQueryExtension(dpy, &dummy, &dummy))
+		DPMSEnable(dpy);
 	return !!error;
 }
diff --git a/tools/Makefile.am b/tools/Makefile.am
index b5de2c9..20f8a00 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -26,11 +26,26 @@ AM_CFLAGS = \
 drivermandir = $(DRIVER_MAN_DIR)
 policydir = $(datarootdir)/polkit-1/actions
 
+noinst_PROGRAMS =
+
 if BUILD_TOOLS
 bin_PROGRAMS = intel-virtual-output
 driverman_DATA = intel-virtual-output.$(DRIVER_MAN_SUFFIX)
 endif
 
+if BUILD_TOOL_CURSOR
+noinst_PROGRAMS += cursor
+cursor_CFLAGS = $(TOOL_CURSOR_CFLAGS)
+cursor_LDADD = $(TOOL_CURSOR_LIBS)
+endif
+
+if X11_DRI3
+noinst_PROGRAMS += dri3info
+dri3info_SOURCES = dri3info.c
+dri3info_CFLAGS = $(X11_DRI3_CFLAGS) $(DRI_CFLAGS)
+dri3info_LDADD = $(X11_DRI3_LIBS) $(DRI_LIBS)
+endif
+
 if BUILD_BACKLIGHT_HELPER
 libexec_PROGRAMS = xf86-video-intel-backlight-helper
 nodist_policy_DATA = org.x.xf86-video-intel.backlight-helper.policy
diff --git a/tools/cursor.c b/tools/cursor.c
new file mode 100644
index 0000000..31d45d7
--- /dev/null
+++ b/tools/cursor.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <X11/Xlib.h>
+#include <X11/extensions/Xfixes.h>
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <png.h>
+
+/*
+ * Undo alpha premultiplication for one 8-bit colour channel, rounding to
+ * nearest. Fully transparent and fully opaque pixels are passed through,
+ * and the result is clamped in case a channel exceeds its alpha.
+ */
+static uint8_t unpremultiply(uint8_t channel, uint8_t alpha)
+{
+	unsigned value;
+
+	if (alpha == 0x00 || alpha == 0xff)
+		return channel;
+
+	value = (channel * 0xffu + alpha / 2) / alpha;
+	return value > 0xff ? 0xff : value;
+}
+
+/*
+ * Dump the current cursor image of the default display into cursor.png.
+ *
+ * Exit status: 0 on success, 1 if the display or the XFixes cursor image is
+ * unavailable, 2 if cursor.png cannot be opened, 3 if encoding fails.
+ */
+int main(int argc, char **argv)
+{
+	Display *dpy;
+	XFixesCursorImage *cur;
+	const unsigned long *src; /* XXX deep sigh */
+	int event_base, error_base;
+	unsigned x, y;
+	png_struct *png;
+	png_info *info = NULL;
+	png_byte **rows;
+	FILE *file;
+	int ret = 1;
+
+	dpy = XOpenDisplay(NULL);
+	if (dpy == NULL)
+		return 1;
+
+	if (!XFixesQueryExtension(dpy, &event_base, &error_base))
+		goto out_display;
+
+	cur = XFixesGetCursorImage(dpy);
+	if (cur == NULL)
+		goto out_display;
+
+	printf("Cursor on display '%s': %dx%d, (hotspot %dx%d)\n",
+	       DisplayString(dpy),
+	       cur->width, cur->height,
+	       cur->xhot, cur->yhot);
+
+	ret = 2;
+	file = fopen("cursor.png", "wb");
+	if (file == NULL)
+		goto out_cursor;
+
+	ret = 3;
+	/* Zeroed so that the cleanup path can free a partially built image. */
+	rows = calloc(cur->height, sizeof(*rows));
+	if (rows == NULL)
+		goto out_file;
+
+	src = cur->pixels;
+	for (y = 0; y < cur->height; y++) {
+		rows[y] = malloc(4 * (size_t)cur->width);
+		if (rows[y] == NULL)
+			goto out_rows;
+
+		for (x = 0; x < cur->width; x++) {
+			/* One premultiplied ARGB pixel per unsigned long */
+			uint32_t p = *src++;
+			uint8_t a = p >> 24;
+
+			/* PNG wants straight-alpha bytes in R, G, B, A order */
+			rows[y][4*x + 0] = unpremultiply(p >> 16, a);
+			rows[y][4*x + 1] = unpremultiply(p >> 8, a);
+			rows[y][4*x + 2] = unpremultiply(p >> 0, a);
+			rows[y][4*x + 3] = a;
+		}
+	}
+
+	png = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+	if (png == NULL)
+		goto out_rows;
+
+	info = png_create_info_struct(png);
+	if (info == NULL)
+		goto out_png;
+
+	/* libpng reports errors by longjmp; without this it would abort() */
+	if (setjmp(png_jmpbuf(png)))
+		goto out_png;
+
+	png_init_io(png, file);
+	png_set_IHDR(png, info,
+		     cur->width, cur->height, 8,
+		     PNG_COLOR_TYPE_RGB_ALPHA,
+		     PNG_INTERLACE_NONE,
+		     PNG_COMPRESSION_TYPE_DEFAULT,
+		     PNG_FILTER_TYPE_DEFAULT);
+	png_write_info(png, info);
+	png_write_image(png, rows);
+	png_write_end(png, NULL);
+	ret = 0;
+
+out_png:
+	png_destroy_write_struct(&png, &info);
+out_rows:
+	for (y = 0; y < cur->height; y++)
+		free(rows[y]);
+	free(rows);
+out_file:
+	if (fclose(file) && ret == 0)
+		ret = 3;
+out_cursor:
+	XFree(cur);
+out_display:
+	XCloseDisplay(dpy);
+	return ret;
+}
diff --git a/tools/dri3info.c b/tools/dri3info.c
new file mode 100644
index 0000000..6587411
--- /dev/null
+++ b/tools/dri3info.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <X11/Xlib.h>
+#include <X11/Xlib-xcb.h>
+#include <xcb/xcb.h>
+#include <xcb/dri3.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <drm.h>
+#include <xf86drm.h>
+
+/*
+ * Ask the X server for its DRM device via DRI3Open on the root window.
+ * Returns a file descriptor owned by the caller, or -1 if the request fails
+ * or the reply does not carry exactly one fd.
+ */
+static int dri3_open(Display *dpy)
+{
+	xcb_connection_t *c = XGetXCBConnection(dpy);
+	xcb_dri3_open_cookie_t cookie;
+	xcb_dri3_open_reply_t *reply;
+	int fd = -1;
+
+	cookie = xcb_dri3_open(c, RootWindow(dpy, DefaultScreen(dpy)), None);
+	reply = xcb_dri3_open_reply(c, cookie, NULL);
+	if (!reply)
+		return -1;
+
+	/* Copy the fd out before releasing the reply that stores it. */
+	if (reply->nfd == 1)
+		fd = xcb_dri3_open_reply_fds(c, reply)[0];
+
+	free(reply);
+	return fd;
+}
+
+/* Non-zero if path is the device node described by remote (mode + rdev). */
+static int same_device(const char *path, const struct stat *remote)
+{
+	struct stat local;
+
+	if (stat(path, &local))
+		return 0;
+
+	return local.st_mode == remote->st_mode &&
+	       local.st_rdev == remote->st_rdev;
+}
+
+/*
+ * Find the /dev/dri node that fd refers to and write its path into buf
+ * (at most len bytes, always NUL-terminated). Card and render nodes are
+ * probed independently, so a missing cardN does not hide renderD(128+N).
+ * Falls back to "unknown path" when nothing matches.
+ */
+static void get_device_path(int fd, char *buf, int len)
+{
+	struct stat remote;
+	int i;
+
+	if (fstat(fd, &remote))
+		goto out;
+
+	for (i = 0; i < 16; i++) {
+		snprintf(buf, len, "/dev/dri/card%d", i);
+		if (same_device(buf, &remote))
+			return;
+
+		snprintf(buf, len, "/dev/dri/renderD%d", i + 128);
+		if (same_device(buf, &remote))
+			return;
+	}
+
+out:
+	snprintf(buf, len, "%s", "unknown path");
+}
+
+/*
+ * Query the kernel driver name for fd via DRM_IOCTL_VERSION. The buffer is
+ * zeroed and only len - 1 bytes are offered to the kernel, so name is always
+ * NUL-terminated. The ioctl result is deliberately ignored (best effort).
+ */
+static void get_driver_name(int fd, char *name, int len)
+{
+	drm_version_t version;
+
+	if (len <= 0)
+		return;
+
+	memset(name, 0, len);
+	memset(&version, 0, sizeof(version));
+	version.name_len = len - 1;
+	version.name = name;
+
+	(void)drmIoctl(fd, DRM_IOCTL_VERSION, &version);
+}
+
+/* Print which DRM device and driver the given display exposes over DRI3. */
+static void info(const char *dpyname)
+{
+	Display *dpy;
+	int device;
+	char device_path[1024];
+	char driver_name[1024];
+
+	dpy = XOpenDisplay(dpyname);
+	if (dpy == NULL) {
+		printf("Unable to connect to display '%s'\n",
+		       dpyname ?: getenv("DISPLAY") ?: "unset");
+		return;
+	}
+
+	device = dri3_open(dpy);
+	if (device < 0) {
+		printf("Unable to connect to DRI3 on display '%s'\n",
+		       DisplayString(dpy));
+		goto out;
+	}
+
+	get_device_path(device, device_path, sizeof(device_path));
+	get_driver_name(device, driver_name, sizeof(driver_name));
+
+	printf("Connected to DRI3 on display '%s', using fd %d: matches %s, driver %s\n",
+	       DisplayString(dpy), device, device_path, driver_name);
+
+	close(device);
+out:
+	XCloseDisplay(dpy);
+}
+
+int main(int argc, char **argv)
+{
+	int i;
+
+	if (argc > 1) {
+		for (i = 1; i < argc; i++)
+			info(argv[i]);
+	} else
+		info(NULL);
+
+	return 0;
+}
diff --git a/tools/virtual.c b/tools/virtual.c
index 8e2b4a2..883cbf2 100644
--- a/tools/virtual.c
+++ b/tools/virtual.c
@@ -218,6 +218,13 @@ static inline XRRScreenResources *_XRRGetScreenResourcesCurrent(Display *dpy, Wi
 static int _x_error_occurred;
 
 static int
+_io_error_handler(Display *display) /* registered with XSetIOErrorHandler() in main() */
+{
+	fprintf(stderr, "XIO error on display %s\n", DisplayString(display));
+	abort(); /* never returns, so no return statement follows despite the int type */
+}
+
+static int
 _check_error_handler(Display     *display,
 		     XErrorEvent *event)
 {
@@ -320,6 +327,7 @@ can_use_shm(Display *dpy,
 #include <X11/Xlib-xcb.h>
 #include <X11/xshmfence.h>
 #include <xcb/xcb.h>
+#include <xcb/xcbext.h>
 #include <xcb/dri3.h>
 #include <xcb/sync.h>
 static Pixmap dri3_create_pixmap(Display *dpy,
@@ -357,6 +365,7 @@ static int dri3_query_version(Display *dpy, int *major, int *minor)
 {
 	xcb_connection_t *c = XGetXCBConnection(dpy);
 	xcb_dri3_query_version_reply_t *reply;
+	xcb_generic_error_t *error;
 
 	*major = *minor = -1;
 
@@ -364,7 +373,8 @@ static int dri3_query_version(Display *dpy, int *major, int *minor)
 					     xcb_dri3_query_version(c,
 								    XCB_DRI3_MAJOR_VERSION,
 								    XCB_DRI3_MINOR_VERSION),
-					     NULL);
+					     &error);
+	free(error);
 	if (reply == NULL)
 		return -1;
 
@@ -377,8 +387,13 @@ static int dri3_query_version(Display *dpy, int *major, int *minor)
 
 static int dri3_exists(Display *dpy)
 {
+	const xcb_query_extension_reply_t *ext;
 	int major, minor;
 
+	ext = xcb_get_extension_data(XGetXCBConnection(dpy), &xcb_dri3_id);
+	if (ext == NULL || !ext->present)
+		return 0;
+
 	if (dri3_query_version(dpy, &major, &minor) < 0)
 		return 0;
 
@@ -3228,6 +3243,7 @@ int main(int argc, char **argv)
 		return -ret;
 
 	XSetErrorHandler(_check_error_handler);
+	XSetIOErrorHandler(_io_error_handler);
 
 	ret = add_fd(&ctx, display_open(&ctx, src_name));
 	if (ret) {