Diffstat (limited to 'pixman')
-rw-r--r--  pixman/configure.ac                     |  48
-rw-r--r--  pixman/pixman/Makefile.am               |  16
-rw-r--r--  pixman/pixman/pixman-cpu.c              |  53
-rw-r--r--  pixman/pixman/pixman-mips-dspr2-asm.S   | 205
-rw-r--r--  pixman/pixman/pixman-mips-dspr2-asm.h   | 206
-rw-r--r--  pixman/pixman/pixman-mips-dspr2.c       |  84
-rw-r--r--  pixman/pixman/pixman-mips-dspr2.h       |  84
-rw-r--r--  pixman/pixman/pixman-mips-memcpy-asm.S  | 382
-rw-r--r--  pixman/pixman/pixman-mmx.c              | 370
-rw-r--r--  pixman/pixman/pixman-private.h          |   5
-rw-r--r--  pixman/test/lowlevel-blt-bench.c        |   2
11 files changed, 1274 insertions, 181 deletions
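Editor's note: the bulk of the new DSPr2 fast paths convert between r5g6b5 and a8r8g8b8 pixels. As a reading aid for the CONVERT_* assembly macros added in pixman-mips-dspr2-asm.h below, here is a scalar C sketch of the same arithmetic; the helper names are illustrative only and do not appear in the patch:

/*
 * Scalar model of the r5g6b5 <-> a8r8g8b8 conversions performed by the
 * CONVERT_1x0565_TO_1x8888 and CONVERT_1x8888_TO_1x0565 macros below.
 * Illustrative sketch only; these helpers are not pixman API.
 */
#include <stdint.h>

static uint32_t
convert_0565_to_8888 (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5)  & 0x3f;
    uint32_t b =  p        & 0x1f;

    /* Widen by replicating the top bits into the vacated low bits,
     * exactly what the assembly does with its shift/ext/or sequences. */
    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return 0xff000000 | (r << 16) | (g << 8) | b;
}

static uint16_t
convert_8888_to_0565 (uint32_t p)
{
    /* Keep only the top 5/6/5 bits of red, green and blue. */
    return ((p >> 8) & 0xf800) |
           ((p >> 5) & 0x07e0) |
           ((p >> 3) & 0x001f);
}

The 2x variants of the macros do the same work two pixels at a time: they pack a pair of 565 pixels into one 32-bit register and operate on both halves at once with the DSPr2 paired-halfword instructions (shll.ph, shrl.ph, shra.ph).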
diff --git a/pixman/configure.ac b/pixman/configure.ac index f39f43739..5eeb6a54e 100644 --- a/pixman/configure.ac +++ b/pixman/configure.ac @@ -294,6 +294,9 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ #if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4)) #error "Need GCC >= 3.4 for MMX intrinsics" #endif +#if defined(__clang__) +#error "clang chokes on the inline assembly in pixman-mmx.c" +#endif #include <mmintrin.h> int main () { __m64 v = _mm_cvtsi32_si64 (1); @@ -592,6 +595,51 @@ fi AM_CONDITIONAL(USE_ARM_IWMMXT, test $have_iwmmxt_intrinsics = yes) +dnl ========================================================================== +dnl Check if assembler is gas compatible and supports MIPS DSPr2 instructions + +have_mips_dspr2=no +AC_MSG_CHECKING(whether to use MIPS DSPr2 assembler) +xserver_save_CFLAGS=$CFLAGS +CFLAGS="-mdspr2 $CFLAGS" + +AC_COMPILE_IFELSE([[ +#if !(defined(__mips__) && __mips_isa_rev >= 2) +#error MIPS DSPr2 is currently only available on MIPS32r2 platforms. +#endif +int +main () +{ + int c = 0, a = 0, b = 0; + __asm__ __volatile__ ( + "precr.qb.ph %[c], %[a], %[b] \n\t" + : [c] "=r" (c) + : [a] "r" (a), [b] "r" (b) + ); + return c; +}]], have_mips_dspr2=yes) +CFLAGS=$xserver_save_CFLAGS + +AC_ARG_ENABLE(mips-dspr2, + [AC_HELP_STRING([--disable-mips-dspr2], + [disable MIPS DSPr2 fast paths])], + [enable_mips_dspr2=$enableval], [enable_mips_dspr2=auto]) + +if test $enable_mips_dspr2 = no ; then + have_mips_dspr2=disabled +fi + +if test $have_mips_dspr2 = yes ; then + AC_DEFINE(USE_MIPS_DSPR2, 1, [use MIPS DSPr2 assembly optimizations]) +fi + +AM_CONDITIONAL(USE_MIPS_DSPR2, test $have_mips_dspr2 = yes) + +AC_MSG_RESULT($have_mips_dspr2) +if test $enable_mips_dspr2 = yes && test $have_mips_dspr2 = no ; then + AC_MSG_ERROR([MIPS DSPr2 instructions not detected]) +fi + dnl ========================================================================================= dnl Check for GNU-style inline assembly support diff --git a/pixman/pixman/Makefile.am b/pixman/pixman/Makefile.am index 286b7cf36..fb7e04723 100644 --- a/pixman/pixman/Makefile.am +++ b/pixman/pixman/Makefile.am @@ -102,5 +102,21 @@ libpixman_1_la_LIBADD += libpixman-iwmmxt.la ASM_CFLAGS_IWMMXT=$(IWMMXT_CFLAGS) endif +# mips dspr2 code +if USE_MIPS_DSPR2 +noinst_LTLIBRARIES += libpixman-mips-dspr2.la +libpixman_mips_dspr2_la_SOURCES = \ + pixman-mips-dspr2.c \ + pixman-mips-dspr2.h \ + pixman-mips-dspr2-asm.S \ + pixman-mips-dspr2-asm.h \ + pixman-mips-memcpy-asm.S +libpixman_mips_dspr2_la_CFLAGS = $(DEP_CFLAGS) +libpixman_mips_dspr2_la_LIBADD = $(DEP_LIBS) +libpixman_1_la_LIBADD += libpixman-mips-dspr2.la + +ASM_CFLAGS_mips_dspr2= +endif + .c.s : $(libpixmaninclude_HEADERS) $(BUILT_SOURCES) $(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $< diff --git a/pixman/pixman/pixman-cpu.c b/pixman/pixman/pixman-cpu.c index 92942b217..fcf591a99 100644 --- a/pixman/pixman/pixman-cpu.c +++ b/pixman/pixman/pixman-cpu.c @@ -427,6 +427,54 @@ pixman_have_arm_iwmmxt (void) #endif /* USE_ARM_SIMD || USE_ARM_NEON || USE_ARM_IWMMXT */ +#if defined(USE_MIPS_DSPR2) + +#if defined (__linux__) /* linux ELF */ + +pixman_bool_t +pixman_have_mips_dspr2 (void) +{ + const char *search_string = "MIPS 74K"; + const char *file_name = "/proc/cpuinfo"; + /* Simple detection of MIPS DSP ASE (revision 2) at runtime for Linux. 
+ * It is based on /proc/cpuinfo, which reveals hardware configuration + * to user-space applications. According to MIPS (early 2010), no similar + * facility is universally available on the MIPS architectures, so it's up + * to individual OSes to provide such. + * + * Only currently available MIPS core that supports DSPr2 is 74K. + */ + + char cpuinfo_line[256]; + + FILE *f = NULL; + + if ((f = fopen (file_name, "r")) == NULL) + return FALSE; + + while (fgets (cpuinfo_line, sizeof (cpuinfo_line), f) != NULL) + { + if (strstr (cpuinfo_line, search_string) != NULL) + { + fclose (f); + return TRUE; + } + } + + fclose (f); + + /* Did not find string in the proc file. */ + return FALSE; +} + +#else /* linux ELF */ + +#define pixman_have_mips_dspr2() FALSE + +#endif /* linux ELF */ + +#endif /* USE_MIPS_DSPR2 */ + #if defined(USE_X86_MMX) || defined(USE_SSE2) /* The CPU detection code needs to be in a file not compiled with * "-mmmx -msse", as gcc would generate CMOV instructions otherwise @@ -696,6 +744,11 @@ _pixman_choose_implementation (void) imp = _pixman_implementation_create_arm_neon (imp); #endif +#ifdef USE_MIPS_DSPR2 + if (pixman_have_mips_dspr2 ()) + imp = _pixman_implementation_create_mips_dspr2 (imp); +#endif + #ifdef USE_VMX if (pixman_have_vmx ()) imp = _pixman_implementation_create_vmx (imp); diff --git a/pixman/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman/pixman-mips-dspr2-asm.S new file mode 100644 index 000000000..0a4c87e37 --- /dev/null +++ b/pixman/pixman/pixman-mips-dspr2-asm.S @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * Author: Nemanja Lukic (nlukic@mips.com) + */ + +#include "pixman-mips-dspr2-asm.h" + +LEAF_MIPS_DSPR2(pixman_composite_src_8888_0565_asm_mips) +/* + * a0 - dst (r5g6b5) + * a1 - src (a8r8g8b8) + * a2 - w + */ + + beqz a2, 3f + nop + addiu t1, a2, -1 + beqz t1, 2f + nop + li t4, 0xf800f800 + li t5, 0x07e007e0 + li t6, 0x001f001f +1: + lw t0, 0(a1) + lw t1, 4(a1) + addiu a1, a1, 8 + addiu a2, a2, -2 + + CONVERT_2x8888_TO_2x0565 t0, t1, t2, t3, t4, t5, t6, t7, t8 + + sh t2, 0(a0) + sh t3, 2(a0) + + addiu t2, a2, -1 + bgtz t2, 1b + addiu a0, a0, 4 +2: + beqz a2, 3f + nop + lw t0, 0(a1) + + CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3 + + sh t1, 0(a0) +3: + j ra + nop + +END(pixman_composite_src_8888_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_src_0565_8888_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (r5g6b5) + * a2 - w + */ + + beqz a2, 3f + nop + addiu t1, a2, -1 + beqz t1, 2f + nop + li t4, 0x07e007e0 + li t5, 0x001F001F +1: + lhu t0, 0(a1) + lhu t1, 2(a1) + addiu a1, a1, 4 + addiu a2, a2, -2 + + CONVERT_2x0565_TO_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9 + + sw t2, 0(a0) + sw t3, 4(a0) + + addiu t2, a2, -1 + bgtz t2, 1b + addiu a0, a0, 8 +2: + beqz a2, 3f + nop + lhu t0, 0(a1) + + CONVERT_1x0565_TO_1x8888 t0, t1, t2, t3 + + sw t1, 0(a0) +3: + j ra + nop + +END(pixman_composite_src_0565_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_src_x888_8888_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (x8r8g8b8) + * a2 - w + */ + + beqz a2, 4f + nop + li t9, 0xff000000 + srl t8, a2, 3 /* t1 = how many multiples of 8 src pixels */ + beqz t8, 3f /* branch if less than 8 src pixels */ + nop +1: + addiu t8, t8, -1 + beqz t8, 2f + addiu a2, a2, -8 + pref 0, 32(a1) + lw t0, 0(a1) + lw t1, 4(a1) + lw t2, 8(a1) + lw t3, 12(a1) + lw t4, 16(a1) + lw t5, 20(a1) + lw t6, 24(a1) + lw t7, 28(a1) + addiu a1, a1, 32 + or t0, t0, t9 + or t1, t1, t9 + or t2, t2, t9 + or t3, t3, t9 + or t4, t4, t9 + or t5, t5, t9 + or t6, t6, t9 + or t7, t7, t9 + pref 30, 32(a0) + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + b 1b + addiu a0, a0, 32 +2: + lw t0, 0(a1) + lw t1, 4(a1) + lw t2, 8(a1) + lw t3, 12(a1) + lw t4, 16(a1) + lw t5, 20(a1) + lw t6, 24(a1) + lw t7, 28(a1) + addiu a1, a1, 32 + or t0, t0, t9 + or t1, t1, t9 + or t2, t2, t9 + or t3, t3, t9 + or t4, t4, t9 + or t5, t5, t9 + or t6, t6, t9 + or t7, t7, t9 + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + beqz a2, 4f + addiu a0, a0, 32 +3: + lw t0, 0(a1) + addiu a1, a1, 4 + addiu a2, a2, -1 + or t1, t0, t9 + sw t1, 0(a0) + bnez a2, 3b + addiu a0, a0, 4 +4: + jr ra + nop + +END(pixman_composite_src_x888_8888_asm_mips) diff --git a/pixman/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman/pixman-mips-dspr2-asm.h new file mode 100644 index 000000000..e07cda470 --- /dev/null +++ b/pixman/pixman/pixman-mips-dspr2-asm.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Nemanja Lukic (nlukic@mips.com) + */ + +#ifndef PIXMAN_MIPS_DSPR2_ASM_H +#define PIXMAN_MIPS_DSPR2_ASM_H + +#define zero $0 +#define AT $1 +#define v0 $2 +#define v1 $3 +#define a0 $4 +#define a1 $5 +#define a2 $6 +#define a3 $7 +#define t0 $8 +#define t1 $9 +#define t2 $10 +#define t3 $11 +#define t4 $12 +#define t5 $13 +#define t6 $14 +#define t7 $15 +#define s0 $16 +#define s1 $17 +#define s2 $18 +#define s3 $19 +#define s4 $20 +#define s5 $21 +#define s6 $22 +#define s7 $23 +#define t8 $24 +#define t9 $25 +#define k0 $26 +#define k1 $27 +#define gp $28 +#define sp $29 +#define fp $30 +#define s8 $30 +#define ra $31 + +/* + * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2 + */ +#define LEAF_MIPS32R2(symbol) \ + .globl symbol; \ + .align 2; \ + .type symbol, @function; \ + .ent symbol, 0; \ +symbol: .frame sp, 0, ra; \ + .set push; \ + .set arch=mips32r2; \ + .set noreorder; \ + .set noat; + +/* + * LEAF_MIPS32R2 - declare leaf routine for MIPS DSPr2 + */ +#define LEAF_MIPS_DSPR2(symbol) \ +LEAF_MIPS32R2(symbol) \ + .set dspr2; + +/* + * END - mark end of function + */ +#define END(function) \ + .set pop; \ + .end function; \ + .size function,.-function + +/* + * Conversion of single r5g6b5 pixel (in_565) to single a8r8g8b8 pixel + * returned in (out_8888) register. Requires two temporary registers + * (scratch1 and scratch2). + */ +.macro CONVERT_1x0565_TO_1x8888 in_565, \ + out_8888, \ + scratch1, scratch2 + lui \out_8888, 0xff00 + sll \scratch1, \in_565, 0x3 + andi \scratch2, \scratch1, 0xff + ext \scratch1, \in_565, 0x2, 0x3 + or \scratch1, \scratch2, \scratch1 + or \out_8888, \out_8888, \scratch1 + + sll \scratch1, \in_565, 0x5 + andi \scratch1, \scratch1, 0xfc00 + srl \scratch2, \in_565, 0x1 + andi \scratch2, \scratch2, 0x300 + or \scratch2, \scratch1, \scratch2 + or \out_8888, \out_8888, \scratch2 + + andi \scratch1, \in_565, 0xf800 + srl \scratch2, \scratch1, 0x5 + andi \scratch2, \scratch2, 0xff00 + or \scratch1, \scratch1, \scratch2 + sll \scratch1, \scratch1, 0x8 + or \out_8888, \out_8888, \scratch1 +.endm + +/* + * Conversion of two r5g6b5 pixels (in1_565 and in2_565) to two a8r8g8b8 pixels + * returned in (out1_8888 and out2_8888) registers. Requires four scratch + * registers (scratch1 ... scratch4). It also requires maskG and maskB for + * color component extractions. 
These masks must have following values: + * li maskG, 0x07e007e0 + * li maskB, 0x001F001F + */ +.macro CONVERT_2x0565_TO_2x8888 in1_565, in2_565, \ + out1_8888, out2_8888, \ + maskG, maskB, \ + scratch1, scratch2, scratch3, scratch4 + sll \scratch1, \in1_565, 16 + or \scratch1, \scratch1, \in2_565 + lui \out2_8888, 0xff00 + ori \out2_8888, \out2_8888, 0xff00 + shrl.ph \scratch2, \scratch1, 11 + and \scratch3, \scratch1, \maskG + shra.ph \scratch4, \scratch2, 2 + shll.ph \scratch2, \scratch2, 3 + shll.ph \scratch3, \scratch3, 5 + or \scratch2, \scratch2, \scratch4 + shrl.qb \scratch4, \scratch3, 6 + or \out2_8888, \out2_8888, \scratch2 + or \scratch3, \scratch3, \scratch4 + and \scratch1, \scratch1, \maskB + shll.ph \scratch2, \scratch1, 3 + shra.ph \scratch4, \scratch1, 2 + or \scratch2, \scratch2, \scratch4 + or \scratch3, \scratch2, \scratch3 + precrq.ph.w \out1_8888, \out2_8888, \scratch3 + precr_sra.ph.w \out2_8888, \scratch3, 0 +.endm + +/* + * Conversion of single a8r8g8b8 pixel (in_8888) to single r5g6b5 pixel + * returned in (out_565) register. Requires two temporary registers + * (scratch1 and scratch2). + */ +.macro CONVERT_1x8888_TO_1x0565 in_8888, \ + out_565, \ + scratch1, scratch2 + ext \out_565, \in_8888, 0x3, 0x5 + srl \scratch1, \in_8888, 0x5 + andi \scratch1, \scratch1, 0x07e0 + srl \scratch2, \in_8888, 0x8 + andi \scratch2, \scratch2, 0xf800 + or \out_565, \out_565, \scratch1 + or \out_565, \out_565, \scratch2 +.endm + +/* + * Conversion of two a8r8g8b8 pixels (in1_8888 and in2_8888) to two r5g6b5 + * pixels returned in (out1_565 and out2_565) registers. Requires two temporary + * registers (scratch1 and scratch2). It also requires maskR, maskG and maskB + * for color component extractions. These masks must have following values: + * li maskR, 0xf800f800 + * li maskG, 0x07e007e0 + * li maskB, 0x001F001F + * Value of input register in2_8888 is lost. + */ +.macro CONVERT_2x8888_TO_2x0565 in1_8888, in2_8888, \ + out1_565, out2_565, \ + maskR, maskG, maskB, \ + scratch1, scratch2 + precrq.ph.w \scratch1, \in2_8888, \in1_8888 + precr_sra.ph.w \in2_8888, \in1_8888, 0 + shll.ph \scratch1, \scratch1, 8 + srl \in2_8888, \in2_8888, 3 + and \scratch2, \in2_8888, \maskB + and \scratch1, \scratch1, \maskR + srl \in2_8888, \in2_8888, 2 + and \out2_565, \in2_8888, \maskG + or \out2_565, \out2_565, \scratch2 + or \out1_565, \out2_565, \scratch1 + srl \out2_565, \out1_565, 16 +.endm + +#endif //PIXMAN_MIPS_DSPR2_ASM_H diff --git a/pixman/pixman/pixman-mips-dspr2.c b/pixman/pixman/pixman-mips-dspr2.c new file mode 100644 index 000000000..e331853b7 --- /dev/null +++ b/pixman/pixman/pixman-mips-dspr2.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. 
``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Nemanja Lukic (nlukic@mips.com) + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include "pixman-private.h" +#include "pixman-mips-dspr2.h" + +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_x888_8888, + uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_8888_0565, + uint32_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0565_8888, + uint16_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0565_0565, + uint16_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888, + uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888, + uint8_t, 3, uint8_t, 3) + +static const pixman_fast_path_t mips_dspr2_fast_paths[] = +{ + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mips_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mips_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mips_composite_src_8888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mips_composite_src_8888_0565), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mips_composite_src_8888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mips_composite_src_8888_0565), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, mips_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, mips_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, mips_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, mips_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mips_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mips_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mips_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mips_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mips_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mips_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, mips_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, mips_composite_src_0888_0888), + + { PIXMAN_OP_NONE }, +}; + +pixman_implementation_t * +_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback) +{ + pixman_implementation_t *imp = + _pixman_implementation_create (fallback, mips_dspr2_fast_paths); + + return imp; +} diff --git a/pixman/pixman/pixman-mips-dspr2.h b/pixman/pixman/pixman-mips-dspr2.h new file mode 100644 index 000000000..449c42a56 --- /dev/null +++ b/pixman/pixman/pixman-mips-dspr2.h @@ -0,0 +1,84 
@@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Nemanja Lukic (nlukic@mips.com) + */ + +#ifndef PIXMAN_MIPS_DSPR2_H +#define PIXMAN_MIPS_DSPR2_H + +#include "pixman-private.h" +#include "pixman-inlines.h" + +#define SKIP_ZERO_SRC 1 +#define SKIP_ZERO_MASK 2 +#define DO_FAST_MEMCPY 3 + +void +pixman_mips_fast_memcpy (void *dst, void *src, uint32_t n_bytes); + +/****************************************************************/ + +#define PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST(flags, name, \ + src_type, src_cnt, \ + dst_type, dst_cnt) \ +void \ +pixman_composite_##name##_asm_mips (dst_type *dst, \ + src_type *src, \ + int32_t w); \ + \ +static void \ +mips_composite_##name (pixman_implementation_t *imp, \ + pixman_composite_info_t *info) \ +{ \ + PIXMAN_COMPOSITE_ARGS (info); \ + dst_type *dst_line, *dst; \ + src_type *src_line, *src; \ + int32_t dst_stride, src_stride; \ + int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8; \ + \ + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type, \ + src_stride, src_line, src_cnt); \ + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type, \ + dst_stride, dst_line, dst_cnt); \ + \ + while (height--) \ + { \ + dst = dst_line; \ + dst_line += dst_stride; \ + src = src_line; \ + src_line += src_stride; \ + \ + if (flags == DO_FAST_MEMCPY) \ + pixman_mips_fast_memcpy (dst, src, width * bpp); \ + else \ + pixman_composite_##name##_asm_mips (dst, src, width); \ + } \ +} + +#endif //PIXMAN_MIPS_DSPR2_H diff --git a/pixman/pixman/pixman-mips-memcpy-asm.S b/pixman/pixman/pixman-mips-memcpy-asm.S new file mode 100644 index 000000000..9ad6da537 --- /dev/null +++ b/pixman/pixman/pixman-mips-memcpy-asm.S @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "pixman-mips-dspr2-asm.h" + +/* + * This routine could be optimized for MIPS64. The current code only + * uses MIPS32 instructions. + */ + +#ifdef EB +# define LWHI lwl /* high part is left in big-endian */ +# define SWHI swl /* high part is left in big-endian */ +# define LWLO lwr /* low part is right in big-endian */ +# define SWLO swr /* low part is right in big-endian */ +#else +# define LWHI lwr /* high part is right in little-endian */ +# define SWHI swr /* high part is right in little-endian */ +# define LWLO lwl /* low part is left in big-endian */ +# define SWLO swl /* low part is left in big-endian */ +#endif + +LEAF_MIPS32R2(pixman_mips_fast_memcpy) + + slti AT, a2, 8 + bne AT, zero, $last8 + move v0, a0 /* memcpy returns the dst pointer */ + +/* Test if the src and dst are word-aligned, or can be made word-aligned */ + xor t8, a1, a0 + andi t8, t8, 0x3 /* t8 is a0/a1 word-displacement */ + + bne t8, zero, $unaligned + negu a3, a0 + + andi a3, a3, 0x3 /* we need to copy a3 bytes to make a0/a1 aligned */ + beq a3, zero, $chk16w /* when a3=0 then the dst (a0) is word-aligned */ + subu a2, a2, a3 /* now a2 is the remining bytes count */ + + LWHI t8, 0(a1) + addu a1, a1, a3 + SWHI t8, 0(a0) + addu a0, a0, a3 + +/* Now the dst/src are mutually word-aligned with word-aligned addresses */ +$chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? 
*/ + /* t8 is the byte count after 64-byte chunks */ + + beq a2, t8, $chk8w /* if a2==t8, no 64-byte chunks */ + /* There will be at most 1 32-byte chunk after it */ + subu a3, a2, t8 /* subtract from a2 the reminder */ + /* Here a3 counts bytes in 16w chunks */ + addu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */ + + addu t0, a0, a2 /* t0 is the "past the end" address */ + +/* + * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past + * the "t0-32" address + * This means: for x=128 the last "safe" a0 address is "t0-160" + * Alternatively, for x=64 the last "safe" a0 address is "t0-96" + * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit + */ + subu t9, t0, 160 /* t9 is the "last safe pref 30, 128(a0)" address */ + + pref 0, 0(a1) /* bring the first line of src, addr 0 */ + pref 0, 32(a1) /* bring the second line of src, addr 32 */ + pref 0, 64(a1) /* bring the third line of src, addr 64 */ + pref 30, 32(a0) /* safe, as we have at least 64 bytes ahead */ +/* In case the a0 > t9 don't use "pref 30" at all */ + sgtu v1, a0, t9 + bgtz v1, $loop16w /* skip "pref 30, 64(a0)" for too short arrays */ + nop +/* otherwise, start with using pref30 */ + pref 30, 64(a0) +$loop16w: + pref 0, 96(a1) + lw t0, 0(a1) + bgtz v1, $skip_pref30_96 /* skip "pref 30, 96(a0)" */ + lw t1, 4(a1) + pref 30, 96(a0) /* continue setting up the dest, addr 96 */ +$skip_pref30_96: + lw t2, 8(a1) + lw t3, 12(a1) + lw t4, 16(a1) + lw t5, 20(a1) + lw t6, 24(a1) + lw t7, 28(a1) + pref 0, 128(a1) /* bring the next lines of src, addr 128 */ + + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + + lw t0, 32(a1) + bgtz v1, $skip_pref30_128 /* skip "pref 30, 128(a0)" */ + lw t1, 36(a1) + pref 30, 128(a0) /* continue setting up the dest, addr 128 */ +$skip_pref30_128: + lw t2, 40(a1) + lw t3, 44(a1) + lw t4, 48(a1) + lw t5, 52(a1) + lw t6, 56(a1) + lw t7, 60(a1) + pref 0, 160(a1) /* bring the next lines of src, addr 160 */ + + sw t0, 32(a0) + sw t1, 36(a0) + sw t2, 40(a0) + sw t3, 44(a0) + sw t4, 48(a0) + sw t5, 52(a0) + sw t6, 56(a0) + sw t7, 60(a0) + + addiu a0, a0, 64 /* adding 64 to dest */ + sgtu v1, a0, t9 + bne a0, a3, $loop16w + addiu a1, a1, 64 /* adding 64 to src */ + move a2, t8 + +/* Here we have src and dest word-aligned but less than 64-bytes to go */ + +$chk8w: + pref 0, 0x0(a1) + andi t8, a2, 0x1f /* is there a 32-byte chunk? */ + /* the t8 is the reminder count past 32-bytes */ + beq a2, t8, $chk1w /* when a2=t8, no 32-byte chunk */ + nop + + lw t0, 0(a1) + lw t1, 4(a1) + lw t2, 8(a1) + lw t3, 12(a1) + lw t4, 16(a1) + lw t5, 20(a1) + lw t6, 24(a1) + lw t7, 28(a1) + addiu a1, a1, 32 + + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + addiu a0, a0, 32 + +$chk1w: + andi a2, t8, 0x3 /* now a2 is the reminder past 1w chunks */ + beq a2, t8, $last8 + subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */ + addu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */ + +/* copying in words (4-byte chunks) */ +$wordCopy_loop: + lw t3, 0(a1) /* the first t3 may be equal t0 ... optimize? 
*/ + addiu a1, a1, 4 + addiu a0, a0, 4 + bne a0, a3, $wordCopy_loop + sw t3, -4(a0) + +/* For the last (<8) bytes */ +$last8: + blez a2, leave + addu a3, a0, a2 /* a3 is the last dst address */ +$last8loop: + lb v1, 0(a1) + addiu a1, a1, 1 + addiu a0, a0, 1 + bne a0, a3, $last8loop + sb v1, -1(a0) + +leave: j ra + nop + +/* + * UNALIGNED case + */ + +$unaligned: + /* got here with a3="negu a0" */ + andi a3, a3, 0x3 /* test if the a0 is word aligned */ + beqz a3, $ua_chk16w + subu a2, a2, a3 /* bytes left after initial a3 bytes */ + + LWHI v1, 0(a1) + LWLO v1, 3(a1) + addu a1, a1, a3 /* a3 may be here 1, 2 or 3 */ + SWHI v1, 0(a0) + addu a0, a0, a3 /* below the dst will be word aligned (NOTE1) */ + +$ua_chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */ + /* t8 is the byte count after 64-byte chunks */ + beq a2, t8, $ua_chk8w /* if a2==t8, no 64-byte chunks */ + /* There will be at most 1 32-byte chunk after it */ + subu a3, a2, t8 /* subtract from a2 the reminder */ + /* Here a3 counts bytes in 16w chunks */ + addu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */ + + addu t0, a0, a2 /* t0 is the "past the end" address */ + + subu t9, t0, 160 /* t9 is the "last safe pref 30, 128(a0)" address */ + + pref 0, 0(a1) /* bring the first line of src, addr 0 */ + pref 0, 32(a1) /* bring the second line of src, addr 32 */ + pref 0, 64(a1) /* bring the third line of src, addr 64 */ + pref 30, 32(a0) /* safe, as we have at least 64 bytes ahead */ +/* In case the a0 > t9 don't use "pref 30" at all */ + sgtu v1, a0, t9 + bgtz v1, $ua_loop16w /* skip "pref 30, 64(a0)" for too short arrays */ + nop +/* otherwise, start with using pref30 */ + pref 30, 64(a0) +$ua_loop16w: + pref 0, 96(a1) + LWHI t0, 0(a1) + LWLO t0, 3(a1) + LWHI t1, 4(a1) + bgtz v1, $ua_skip_pref30_96 + LWLO t1, 7(a1) + pref 30, 96(a0) /* continue setting up the dest, addr 96 */ +$ua_skip_pref30_96: + LWHI t2, 8(a1) + LWLO t2, 11(a1) + LWHI t3, 12(a1) + LWLO t3, 15(a1) + LWHI t4, 16(a1) + LWLO t4, 19(a1) + LWHI t5, 20(a1) + LWLO t5, 23(a1) + LWHI t6, 24(a1) + LWLO t6, 27(a1) + LWHI t7, 28(a1) + LWLO t7, 31(a1) + pref 0, 128(a1) /* bring the next lines of src, addr 128 */ + + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + + LWHI t0, 32(a1) + LWLO t0, 35(a1) + LWHI t1, 36(a1) + bgtz v1, $ua_skip_pref30_128 + LWLO t1, 39(a1) + pref 30, 128(a0) /* continue setting up the dest, addr 128 */ +$ua_skip_pref30_128: + LWHI t2, 40(a1) + LWLO t2, 43(a1) + LWHI t3, 44(a1) + LWLO t3, 47(a1) + LWHI t4, 48(a1) + LWLO t4, 51(a1) + LWHI t5, 52(a1) + LWLO t5, 55(a1) + LWHI t6, 56(a1) + LWLO t6, 59(a1) + LWHI t7, 60(a1) + LWLO t7, 63(a1) + pref 0, 160(a1) /* bring the next lines of src, addr 160 */ + + sw t0, 32(a0) + sw t1, 36(a0) + sw t2, 40(a0) + sw t3, 44(a0) + sw t4, 48(a0) + sw t5, 52(a0) + sw t6, 56(a0) + sw t7, 60(a0) + + addiu a0, a0, 64 /* adding 64 to dest */ + sgtu v1, a0, t9 + bne a0, a3, $ua_loop16w + addiu a1, a1, 64 /* adding 64 to src */ + move a2, t8 + +/* Here we have src and dest word-aligned but less than 64-bytes to go */ + +$ua_chk8w: + pref 0, 0x0(a1) + andi t8, a2, 0x1f /* is there a 32-byte chunk? 
*/ + /* the t8 is the reminder count */ + beq a2, t8, $ua_chk1w /* when a2=t8, no 32-byte chunk */ + + LWHI t0, 0(a1) + LWLO t0, 3(a1) + LWHI t1, 4(a1) + LWLO t1, 7(a1) + LWHI t2, 8(a1) + LWLO t2, 11(a1) + LWHI t3, 12(a1) + LWLO t3, 15(a1) + LWHI t4, 16(a1) + LWLO t4, 19(a1) + LWHI t5, 20(a1) + LWLO t5, 23(a1) + LWHI t6, 24(a1) + LWLO t6, 27(a1) + LWHI t7, 28(a1) + LWLO t7, 31(a1) + addiu a1, a1, 32 + + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + addiu a0, a0, 32 + +$ua_chk1w: + andi a2, t8, 0x3 /* now a2 is the reminder past 1w chunks */ + beq a2, t8, $ua_smallCopy + subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */ + addu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */ + +/* copying in words (4-byte chunks) */ +$ua_wordCopy_loop: + LWHI v1, 0(a1) + LWLO v1, 3(a1) + addiu a1, a1, 4 + addiu a0, a0, 4 /* note: dst=a0 is word aligned here, see NOTE1 */ + bne a0, a3, $ua_wordCopy_loop + sw v1, -4(a0) + +/* Now less than 4 bytes (value in a2) left to copy */ +$ua_smallCopy: + beqz a2, leave + addu a3, a0, a2 /* a3 is the last dst address */ +$ua_smallCopy_loop: + lb v1, 0(a1) + addiu a1, a1, 1 + addiu a0, a0, 1 + bne a0, a3, $ua_smallCopy_loop + sb v1, -1(a0) + + j ra + nop + +END(pixman_mips_fast_memcpy) diff --git a/pixman/pixman/pixman-mmx.c b/pixman/pixman/pixman-mmx.c index 8fd85776d..bcfeb56f0 100644 --- a/pixman/pixman/pixman-mmx.c +++ b/pixman/pixman/pixman-mmx.c @@ -353,9 +353,16 @@ static __inline__ uint32_t ldl_u(uint32_t *p) } static force_inline __m64 -load8888 (uint32_t v) +load8888 (const uint32_t *v) { - return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ()); + return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (*v), _mm_setzero_si64 ()); +} + +static force_inline __m64 +load8888u (const uint32_t *v) +{ + uint32_t l = ldl_u(v); + return load8888(&l); } static force_inline __m64 @@ -364,15 +371,12 @@ pack8888 (__m64 lo, __m64 hi) return _mm_packs_pu16 (lo, hi); } -#ifdef _MSC_VER -#define store8888(v) _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ())) -#else -static force_inline uint32_t -store8888 (__m64 v) +static force_inline void +store8888 (uint32_t *dest, __m64 v) { - return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ())); + v = pack8888 (v, _mm_setzero_si64()); + *dest = _mm_cvtsi64_si32 (v); } -#endif /* Expand 16 bits positioned at @pos (0-3) of a mmx register into * @@ -475,13 +479,6 @@ pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b) /* --------------- MMX code patch for fbcompose.c --------------------- */ -#ifdef _MSC_VER -#define combine(src, mask) \ - ((mask) ? 
\ - store8888 (pix_multiply (load8888 (*src), expand_alpha (load8888 (*mask)))) \ - : \ - *src) -#else static force_inline uint32_t combine (const uint32_t *src, const uint32_t *mask) { @@ -489,18 +486,17 @@ combine (const uint32_t *src, const uint32_t *mask) if (mask) { - __m64 m = load8888 (*mask); - __m64 s = load8888 (ssrc); + __m64 m = load8888 (mask); + __m64 s = load8888 (&ssrc); m = expand_alpha (m); s = pix_multiply (s, m); - ssrc = store8888 (s); + store8888 (&ssrc, s); } return ssrc; } -#endif static void mmx_combine_over_u (pixman_implementation_t *imp, @@ -524,9 +520,9 @@ mmx_combine_over_u (pixman_implementation_t *imp, else if (ssrc) { __m64 s, sa; - s = load8888 (ssrc); + s = load8888 (&ssrc); sa = expand_alpha (s); - *dest = store8888 (over (s, sa, load8888 (*dest))); + store8888 (dest, over (s, sa, load8888 (dest))); } ++dest; @@ -552,9 +548,9 @@ mmx_combine_over_reverse_u (pixman_implementation_t *imp, __m64 d, da; uint32_t s = combine (src, mask); - d = load8888 (*dest); + d = load8888 (dest); da = expand_alpha (d); - *dest = store8888 (over (d, da, load8888 (s))); + store8888 (dest, over (d, da, load8888 (&s))); ++dest; ++src; @@ -577,13 +573,14 @@ mmx_combine_in_u (pixman_implementation_t *imp, while (dest < end) { __m64 x, a; + uint32_t ssrc = combine (src, mask); - x = load8888 (combine (src, mask)); - a = load8888 (*dest); + x = load8888 (&ssrc); + a = load8888 (dest); a = expand_alpha (a); x = pix_multiply (x, a); - *dest = store8888 (x); + store8888 (dest, x); ++dest; ++src; @@ -606,12 +603,13 @@ mmx_combine_in_reverse_u (pixman_implementation_t *imp, while (dest < end) { __m64 x, a; + uint32_t ssrc = combine (src, mask); - x = load8888 (*dest); - a = load8888 (combine (src, mask)); + x = load8888 (dest); + a = load8888 (&ssrc); a = expand_alpha (a); x = pix_multiply (x, a); - *dest = store8888 (x); + store8888 (dest, x); ++dest; ++src; @@ -634,13 +632,14 @@ mmx_combine_out_u (pixman_implementation_t *imp, while (dest < end) { __m64 x, a; + uint32_t ssrc = combine (src, mask); - x = load8888 (combine (src, mask)); - a = load8888 (*dest); + x = load8888 (&ssrc); + a = load8888 (dest); a = expand_alpha (a); a = negate (a); x = pix_multiply (x, a); - *dest = store8888 (x); + store8888 (dest, x); ++dest; ++src; @@ -663,14 +662,15 @@ mmx_combine_out_reverse_u (pixman_implementation_t *imp, while (dest < end) { __m64 x, a; + uint32_t ssrc = combine (src, mask); - x = load8888 (*dest); - a = load8888 (combine (src, mask)); + x = load8888 (dest); + a = load8888 (&ssrc); a = expand_alpha (a); a = negate (a); x = pix_multiply (x, a); - *dest = store8888 (x); + store8888 (dest, x); ++dest; ++src; @@ -693,14 +693,15 @@ mmx_combine_atop_u (pixman_implementation_t *imp, while (dest < end) { __m64 s, da, d, sia; + uint32_t ssrc = combine (src, mask); - s = load8888 (combine (src, mask)); - d = load8888 (*dest); + s = load8888 (&ssrc); + d = load8888 (dest); sia = expand_alpha (s); sia = negate (sia); da = expand_alpha (d); s = pix_add_mul (s, da, d, sia); - *dest = store8888 (s); + store8888 (dest, s); ++dest; ++src; @@ -725,14 +726,15 @@ mmx_combine_atop_reverse_u (pixman_implementation_t *imp, while (dest < end) { __m64 s, dia, d, sa; + uint32_t ssrc = combine (src, mask); - s = load8888 (combine (src, mask)); - d = load8888 (*dest); + s = load8888 (&ssrc); + d = load8888 (dest); sa = expand_alpha (s); dia = expand_alpha (d); dia = negate (dia); s = pix_add_mul (s, dia, d, sa); - *dest = store8888 (s); + store8888 (dest, s); ++dest; ++src; @@ -755,15 +757,16 @@ 
mmx_combine_xor_u (pixman_implementation_t *imp, while (dest < end) { __m64 s, dia, d, sia; + uint32_t ssrc = combine (src, mask); - s = load8888 (combine (src, mask)); - d = load8888 (*dest); + s = load8888 (&ssrc); + d = load8888 (dest); sia = expand_alpha (s); dia = expand_alpha (d); sia = negate (sia); dia = negate (dia); s = pix_add_mul (s, dia, d, sia); - *dest = store8888 (s); + store8888 (dest, s); ++dest; ++src; @@ -786,11 +789,12 @@ mmx_combine_add_u (pixman_implementation_t *imp, while (dest < end) { __m64 s, d; + uint32_t ssrc = combine (src, mask); - s = load8888 (combine (src, mask)); - d = load8888 (*dest); + s = load8888 (&ssrc); + d = load8888 (dest); s = pix_add (s, d); - *dest = store8888 (s); + store8888 (dest, s); ++dest; ++src; @@ -814,20 +818,21 @@ mmx_combine_saturate_u (pixman_implementation_t *imp, { uint32_t s = combine (src, mask); uint32_t d = *dest; - __m64 ms = load8888 (s); - __m64 md = load8888 (d); + __m64 ms = load8888 (&s); + __m64 md = load8888 (&d); uint32_t sa = s >> 24; uint32_t da = ~d >> 24; if (sa > da) { - __m64 msa = load8888 (DIV_UN8 (da, sa) << 24); + uint32_t quot = DIV_UN8 (da, sa) << 24; + __m64 msa = load8888 ("); msa = expand_alpha (msa); ms = pix_multiply (ms, msa); } md = pix_add (md, ms); - *dest = store8888 (md); + store8888 (dest, md); ++src; ++dest; @@ -849,11 +854,11 @@ mmx_combine_src_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); s = pix_multiply (s, a); - *dest = store8888 (s); + store8888 (dest, s); ++src; ++mask; @@ -874,12 +879,12 @@ mmx_combine_over_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 sa = expand_alpha (s); - *dest = store8888 (in_over (s, sa, a, d)); + store8888 (dest, in_over (s, sa, a, d)); ++src; ++dest; @@ -900,12 +905,12 @@ mmx_combine_over_reverse_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); - *dest = store8888 (over (d, da, in (s, a))); + store8888 (dest, over (d, da, in (s, a))); ++src; ++dest; @@ -926,14 +931,14 @@ mmx_combine_in_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); s = pix_multiply (s, a); s = pix_multiply (s, da); - *dest = store8888 (s); + store8888 (dest, s); ++src; ++dest; @@ -954,14 +959,14 @@ mmx_combine_in_reverse_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 sa = expand_alpha (s); a = pix_multiply (a, sa); d = pix_multiply (d, a); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -982,15 +987,15 @@ mmx_combine_out_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha 
(d); da = negate (da); s = pix_multiply (s, a); s = pix_multiply (s, da); - *dest = store8888 (s); + store8888 (dest, s); ++src; ++dest; @@ -1011,15 +1016,15 @@ mmx_combine_out_reverse_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 sa = expand_alpha (s); a = pix_multiply (a, sa); a = negate (a); d = pix_multiply (d, a); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -1040,9 +1045,9 @@ mmx_combine_atop_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); __m64 sa = expand_alpha (s); @@ -1050,7 +1055,7 @@ mmx_combine_atop_ca (pixman_implementation_t *imp, a = pix_multiply (a, sa); a = negate (a); d = pix_add_mul (d, a, s, da); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -1071,9 +1076,9 @@ mmx_combine_atop_reverse_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); __m64 sa = expand_alpha (s); @@ -1081,7 +1086,7 @@ mmx_combine_atop_reverse_ca (pixman_implementation_t *imp, a = pix_multiply (a, sa); da = negate (da); d = pix_add_mul (d, a, s, da); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -1102,9 +1107,9 @@ mmx_combine_xor_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); __m64 sa = expand_alpha (s); @@ -1113,7 +1118,7 @@ mmx_combine_xor_ca (pixman_implementation_t *imp, da = negate (da); a = negate (a); d = pix_add_mul (d, a, s, da); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -1134,13 +1139,13 @@ mmx_combine_add_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); s = pix_multiply (s, a); d = pix_add (s, d); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -1171,7 +1176,7 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -1184,7 +1189,7 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 7) { - *dst = store8888 (over (vsrc, vsrca, load8888 (*dst))); + store8888 (dst, over (vsrc, vsrca, load8888 (dst))); w--; dst++; @@ -1210,7 +1215,7 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp, if (w) { - *dst = store8888 (over (vsrc, vsrca, load8888 (*dst))); + store8888 (dst, over (vsrc, vsrca, load8888 (dst))); } } @@ -1237,7 +1242,7 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -1316,7 +1321,7 
@@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -1331,9 +1336,9 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, if (m) { - __m64 vdest = load8888 (*q); - vdest = in_over (vsrc, vsrca, load8888 (m), vdest); - *q = store8888 (vdest); + __m64 vdest = load8888 (q); + vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); + store8888 (q, vdest); } twidth--; @@ -1352,9 +1357,9 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, __m64 dest0, dest1; __m64 vdest = *(__m64 *)q; - dest0 = in_over (vsrc, vsrca, load8888 (m0), + dest0 = in_over (vsrc, vsrca, load8888 (&m0), expand8888 (vdest, 0)); - dest1 = in_over (vsrc, vsrca, load8888 (m1), + dest1 = in_over (vsrc, vsrca, load8888 (&m1), expand8888 (vdest, 1)); *(__m64 *)q = pack8888 (dest0, dest1); @@ -1371,9 +1376,9 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, if (m) { - __m64 vdest = load8888 (*q); - vdest = in_over (vsrc, vsrca, load8888 (m), vdest); - *q = store8888 (vdest); + __m64 vdest = load8888 (q); + vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); + store8888 (q, vdest); } twidth--; @@ -1408,7 +1413,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); mask &= 0xff000000; mask = mask | mask >> 8 | mask >> 16 | mask >> 24; - vmask = load8888 (mask); + vmask = load8888 (&mask); while (height--) { @@ -1420,10 +1425,10 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 7) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, expand_alpha (s), vmask, d)); + store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); w--; dst++; @@ -1448,10 +1453,10 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, if (w) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, expand_alpha (s), vmask, d)); + store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); } } @@ -1479,7 +1484,7 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, mask &= 0xff000000; mask = mask | mask >> 8 | mask >> 16 | mask >> 24; - vmask = load8888 (mask); + vmask = load8888 (&mask); srca = MC (4x00ff); while (height--) @@ -1492,10 +1497,11 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 7) { - __m64 s = load8888 (*src | 0xff000000); - __m64 d = load8888 (*dst); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, srca, vmask, d)); + store8888 (dst, in_over (s, srca, vmask, d)); w--; dst++; @@ -1570,10 +1576,11 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, while (w) { - __m64 s = load8888 (*src | 0xff000000); - __m64 d = load8888 (*dst); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, srca, vmask, d)); + store8888 (dst, in_over (s, srca, vmask, d)); w--; dst++; @@ -1621,9 +1628,9 @@ mmx_composite_over_8888_8888 (pixman_implementation_t *imp, else if (s) { __m64 ms, sa; - ms 
= load8888 (s); + ms = load8888 (&s); sa = expand_alpha (ms); - *dst = store8888 (over (ms, sa, load8888 (*dst))); + store8888 (dst, over (ms, sa, load8888 (dst))); } dst++; @@ -1664,7 +1671,7 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 7) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); @@ -1685,10 +1692,10 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, __m64 vsrc0, vsrc1, vsrc2, vsrc3; __m64 vdest; - vsrc0 = load8888 (*(src + 0)); - vsrc1 = load8888 (*(src + 1)); - vsrc2 = load8888 (*(src + 2)); - vsrc3 = load8888 (*(src + 3)); + vsrc0 = load8888 ((src + 0)); + vsrc1 = load8888 ((src + 1)); + vsrc2 = load8888 ((src + 2)); + vsrc3 = load8888 ((src + 3)); vdest = *(__m64 *)dst; @@ -1708,7 +1715,7 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, while (w) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); @@ -1751,7 +1758,7 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -1772,9 +1779,9 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, { __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)), - load8888 (*dst)); + load8888 (dst)); - *dst = store8888 (vdest); + store8888 (dst, vdest); } w--; @@ -1823,11 +1830,11 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, if (m) { - __m64 vdest = load8888 (*dst); + __m64 vdest = load8888 (dst); vdest = in_over ( vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest); - *dst = store8888 (vdest); + store8888 (dst, vdest); } } } @@ -2016,7 +2023,7 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); while (height--) { @@ -2036,7 +2043,7 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, { __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); - *dst = store8888 (vdest); + store8888 (dst, vdest); } else { @@ -2087,10 +2094,10 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, if (m) { - __m64 vdest = load8888 (*dst); + __m64 vdest = load8888 (dst); vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); - *dst = store8888 (vdest); + store8888 (dst, vdest); } else { @@ -2126,7 +2133,7 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0); @@ -2265,7 +2272,7 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 7) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); @@ -2298,10 +2305,10 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, if ((a0 & a1 & a2 & a3) == 0xFF) { __m64 vdest; - vdest = pack_565 (invert_colors (load8888 (s0)), 
_mm_setzero_si64 (), 0); - vdest = pack_565 (invert_colors (load8888 (s1)), vdest, 1); - vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2); - vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3); + vdest = pack_565 (invert_colors (load8888 (&s0)), _mm_setzero_si64 (), 0); + vdest = pack_565 (invert_colors (load8888 (&s1)), vdest, 1); + vdest = pack_565 (invert_colors (load8888 (&s2)), vdest, 2); + vdest = pack_565 (invert_colors (load8888 (&s3)), vdest, 3); *(__m64 *)dst = vdest; } @@ -2309,10 +2316,10 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, { __m64 vdest = *(__m64 *)dst; - vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (over_rev_non_pre (load8888 (s2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3); + vdest = pack_565 (over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0)), vdest, 0); + vdest = pack_565 (over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1)), vdest, 1); + vdest = pack_565 (over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2)), vdest, 2); + vdest = pack_565 (over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3)), vdest, 3); *(__m64 *)dst = vdest; } @@ -2326,7 +2333,7 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, while (w) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); @@ -2373,10 +2380,10 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 7) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (over_rev_non_pre (s, d)); + store8888 (dst, over_rev_non_pre (s, d)); w--; dst++; @@ -2385,7 +2392,7 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, while (w >= 2) { - uint64_t s0, s1; + uint32_t s0, s1; unsigned char a0, a1; __m64 d0, d1; @@ -2397,8 +2404,8 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, if ((a0 & a1) == 0xFF) { - d0 = invert_colors (load8888 (s0)); - d1 = invert_colors (load8888 (s1)); + d0 = invert_colors (load8888 (&s0)); + d1 = invert_colors (load8888 (&s1)); *(__m64 *)dst = pack8888 (d0, d1); } @@ -2406,8 +2413,8 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, { __m64 vdest = *(__m64 *)dst; - d0 = over_rev_non_pre (load8888 (s0), expand8888 (vdest, 0)); - d1 = over_rev_non_pre (load8888 (s1), expand8888 (vdest, 1)); + d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0)); + d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1)); *(__m64 *)dst = pack8888 (d0, d1); } @@ -2419,10 +2426,10 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, if (w) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (over_rev_non_pre (s, d)); + store8888 (dst, over_rev_non_pre (s, d)); } } @@ -2450,7 +2457,7 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -2467,7 +2474,7 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, { uint64_t d = 
*q; __m64 vdest = expand565 (to_m64 (d), 0); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); *q = to_uint64 (vdest); } @@ -2489,10 +2496,10 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, { __m64 vdest = *(__m64 *)q; - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0)), vdest, 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1)), vdest, 1); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2)), vdest, 2); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3)), vdest, 3); *(__m64 *)q = vdest; } @@ -2510,7 +2517,7 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, { uint64_t d = *q; __m64 vdest = expand565 (to_m64 (d), 0); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); *q = to_uint64 (vdest); } @@ -2546,7 +2553,7 @@ mmx_composite_in_n_8_8 (pixman_implementation_t *imp, sa = src >> 24; - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -2578,10 +2585,10 @@ mmx_composite_in_n_8_8 (pixman_implementation_t *imp, __m64 vmask; __m64 vdest; - vmask = load8888 (ldl_u((uint32_t *)mask)); - vdest = load8888 (*(uint32_t *)dst); + vmask = load8888u ((uint32_t *)mask); + vdest = load8888 ((uint32_t *)dst); - *(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest)); + store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest)); dst += 4; mask += 4; @@ -2648,7 +2655,7 @@ mmx_composite_in_8_8 (pixman_implementation_t *imp, uint32_t *s = (uint32_t *)src; uint32_t *d = (uint32_t *)dst; - *d = store8888 (in (load8888 (ldl_u((uint32_t *)s)), load8888 (*d))); + store8888 (d, in (load8888u (s), load8888 (d))); w -= 4; dst += 4; @@ -2696,7 +2703,7 @@ mmx_composite_add_n_8_8 (pixman_implementation_t *imp, if (src == 0) return; - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -2729,10 +2736,10 @@ mmx_composite_add_n_8_8 (pixman_implementation_t *imp, __m64 vmask; __m64 vdest; - vmask = load8888 (ldl_u((uint32_t *)mask)); - vdest = load8888 (*(uint32_t *)dst); + vmask = load8888u ((uint32_t *)mask); + vdest = load8888 ((uint32_t *)dst); - *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest)); + store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest)); dst += 4; mask += 4; @@ -3073,19 +3080,20 @@ mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, if (m) { - __m64 s = load8888 (*src | 0xff000000); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); if (m == 0xff) { - *dst = store8888 (s); + store8888 (dst, s); } else { __m64 sa = expand_alpha (s); __m64 vm = expand_alpha_rev (to_m64 (m)); - __m64 vdest = in_over (s, sa, vm, load8888 (*dst)); + __m64 vdest = in_over (s, sa, vm, load8888 (dst)); - *dst = store8888 (vdest); + store8888 (dst, vdest); } } diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h index 856038547..9d96a9312 100644 
--- a/pixman/pixman/pixman-private.h +++ b/pixman/pixman/pixman-private.h @@ -559,6 +559,11 @@ pixman_implementation_t * _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback); #endif +#ifdef USE_MIPS_DSPR2 +pixman_implementation_t * +_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback); +#endif + #ifdef USE_VMX pixman_implementation_t * _pixman_implementation_create_vmx (pixman_implementation_t *fallback); diff --git a/pixman/test/lowlevel-blt-bench.c b/pixman/test/lowlevel-blt-bench.c index ba7f30716..95513ba10 100644 --- a/pixman/test/lowlevel-blt-bench.c +++ b/pixman/test/lowlevel-blt-bench.c @@ -626,6 +626,7 @@ tests_tbl[] = { "over_n_0565", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_OVER, PIXMAN_null, 0, PIXMAN_r5g6b5 }, { "over_n_1555", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_OVER, PIXMAN_null, 0, PIXMAN_a1r5g5b5 }, { "over_8888_0565", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_null, 0, PIXMAN_r5g6b5 }, + { "over_8888_8888", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_null, 0, PIXMAN_a8r8g8b8 }, { "over_8888_x888", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_null, 0, PIXMAN_x8r8g8b8 }, { "over_x888_8_0565", PIXMAN_x8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_a8, 0, PIXMAN_r5g6b5 }, { "over_x888_8_8888", PIXMAN_x8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_a8, 0, PIXMAN_a8r8g8b8 }, @@ -649,6 +650,7 @@ tests_tbl[] = { "over_8888_n_x888", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_a8, 1, PIXMAN_x8r8g8b8 }, { "over_8888_n_0565", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_a8, 1, PIXMAN_r5g6b5 }, { "over_8888_n_1555", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_a8, 1, PIXMAN_a1r5g5b5 }, + { "over_x888_n_8888", PIXMAN_x8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_a8, 1, PIXMAN_a8r8g8b8 }, { "outrev_n_8_0565", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_OUT_REV, PIXMAN_a8, 0, PIXMAN_r5g6b5 }, { "outrev_n_8_1555", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_OUT_REV, PIXMAN_a8, 0, PIXMAN_a1r5g5b5 }, { "outrev_n_8_x888", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_OUT_REV, PIXMAN_a8, 0, PIXMAN_x8r8g8b8 },
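Editor's note: pixman_mips_fast_memcpy above splits every copy into an alignment head, prefetched 64-byte chunks, at most one 32-byte chunk, whole words, and a byte tail. A plain C model of that size decomposition follows (a sketch under stated assumptions: the model_ helper is hypothetical, and the real assembly additionally issues pref cache hints and uses lwl/lwr merges on the unaligned path):

#include <stdint.h>
#include <string.h>

/*
 * Model of the chunking used by pixman_mips_fast_memcpy.  Each memcpy
 * below stands in for a hand-scheduled lw/sw (or lwl/lwr) block in the
 * assembly; this only illustrates how the byte count is decomposed.
 */
static void *
model_fast_memcpy (void *dst, const void *src, uint32_t n)
{
    uint8_t *d = dst;
    const uint8_t *s = src;

    if (n >= 8)
    {
        /* Copy up to 3 bytes so that dst becomes word-aligned
         * (the "negu a3, a0; andi a3, a3, 0x3" prologue). */
        uint32_t head = (uint32_t) (-(uintptr_t) d & 3);
        memcpy (d, s, head);  d += head;  s += head;  n -= head;

        uint32_t chunks64 = n & ~63u;   /* whole 64-byte chunks ($loop16w) */
        memcpy (d, s, chunks64);  d += chunks64;  s += chunks64;  n &= 63;

        uint32_t chunk32 = n & ~31u;    /* at most one 32-byte chunk ($chk8w) */
        memcpy (d, s, chunk32);  d += chunk32;  s += chunk32;  n &= 31;

        uint32_t words = n & ~3u;       /* whole words ($wordCopy_loop) */
        memcpy (d, s, words);  d += words;  s += words;  n &= 3;
    }

    memcpy (d, s, n);                   /* final byte tail ($last8loop) */
    return dst;                         /* memcpy returns the dst pointer */
}

The "pref 30" (prepare-for-store) hints in the real routine are only issued while the destination pointer is at least 160 bytes from the end of the buffer, which is why the assembly precomputes the "last safe" address in t9 before entering the main loop.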