/*
 * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
 *             2005 Lars Knoll & Zack Rusin, Trolltech
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Keith Packard not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Keith Packard makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 */

#ifdef HAVE_DIX_CONFIG_H
#include <dix-config.h>
#endif

#include "fb.h"

#ifdef RENDER

#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>

#include "picturestr.h"
#include "mipict.h"
#include "fbpict.h"

#define mod(a,b)	((b) == 1 ? 0 : (a) >= 0 ? (a) % (b) : (b) - (-a) % (b))

#define SCANLINE_BUFFER_LENGTH 2048

typedef FASTCALL void (*fetchProc)(const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed);

/*
 * All of the fetch functions
 */

static FASTCALL void
fbFetch_a8r8g8b8 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    memcpy(buffer, (const CARD32 *)bits + x, width*sizeof(CARD32));
}

static FASTCALL void
fbFetch_x8r8g8b8 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD32 *pixel = (const CARD32 *)bits + x;
    const CARD32 *end = pixel + width;
    while (pixel < end) {
        *buffer++ = *pixel++ | 0xff000000;
    }
}

static FASTCALL void
fbFetch_a8b8g8r8 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD32 *pixel = (CARD32 *)bits + x;
    const CARD32 *end = pixel + width;
    while (pixel < end) {
        *buffer++ =  ((*pixel & 0xff00ff00) |
                      ((*pixel >> 16) & 0xff) |
                      ((*pixel & 0xff) << 16));
        ++pixel;
    }
}

static FASTCALL void
fbFetch_x8b8g8r8 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD32 *pixel = (CARD32 *)bits + x;
    const CARD32 *end = pixel + width;
    while (pixel < end) {
        *buffer++ =  0xff000000 |
                     ((*pixel & 0x0000ff00) |
                      ((*pixel >> 16) & 0xff) |
                      ((*pixel & 0xff) << 16));
        ++pixel;
    }
}

static FASTCALL void
fbFetch_r8g8b8 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD8 *pixel = (const CARD8 *)bits + 3*x;
    const CARD8 *end = pixel + 3*width;
    while (pixel < end) {
        CARD32 b = Fetch24(pixel) | 0xff000000;
        pixel += 3;
        *buffer++ = b;
    }
}

static FASTCALL void
fbFetch_b8g8r8 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD8 *pixel = (const CARD8 *)bits + 3*x;
    const CARD8 *end = pixel + 3*width;
    while (pixel < end) {
        CARD32 b = 0xff000000;
#if IMAGE_BYTE_ORDER == MSBFirst
        b |= (*pixel++);
        b |= (*pixel++ << 8);
        b |= (*pixel++ << 16);
#else
        b |= (*pixel++ << 16);
        b |= (*pixel++ << 8);
        b |= (*pixel++);
#endif
    }
}

static FASTCALL void
fbFetch_r5g6b5 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD16 *pixel = (const CARD16 *)bits + x;
    const CARD16 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        CARD32 r = (((p) << 3) & 0xf8) | 
                   (((p) << 5) & 0xfc00) |
                   (((p) << 8) & 0xf80000);
        r |= (r >> 5) & 0x70007;
        r |= (r >> 6) & 0x300;
        *buffer++ = 0xff000000 | r;
    }
}

static FASTCALL void
fbFetch_b5g6r5 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD16 *pixel = (const CARD16 *)bits + x;
    const CARD16 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        CARD32  r,g,b;

        b = ((p & 0xf800) | ((p & 0xe000) >> 5)) >> 8;
        g = ((p & 0x07e0) | ((p & 0x0600) >> 6)) << 5;
        r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
        *buffer++ = (0xff000000 | r | g | b);
    }
}

static FASTCALL void
fbFetch_a1r5g5b5 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD16 *pixel = (const CARD16 *)bits + x;
    const CARD16 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        CARD32  r,g,b, a;

        a = (CARD32) ((CARD8) (0 - ((p & 0x8000) >> 15))) << 24;
        r = ((p & 0x7c00) | ((p & 0x7000) >> 5)) << 9;
        g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
        b = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2;
        *buffer++ = (a | r | g | b);
    }
}

static FASTCALL void
fbFetch_x1r5g5b5 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD16 *pixel = (const CARD16 *)bits + x;
    const CARD16 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        CARD32  r,g,b;

        r = ((p & 0x7c00) | ((p & 0x7000) >> 5)) << 9;
        g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
        b = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2;
        *buffer++ = (0xff000000 | r | g | b);
    }
}

static FASTCALL void
fbFetch_a1b5g5r5 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD16 *pixel = (const CARD16 *)bits + x;
    const CARD16 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        CARD32  r,g,b, a;

        a = (CARD32) ((CARD8) (0 - ((p & 0x8000) >> 15))) << 24;
        b = ((p & 0x7c00) | ((p & 0x7000) >> 5)) >> 7;
        g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
        r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
        *buffer++ = (a | r | g | b);
    }
}

static FASTCALL void
fbFetch_x1b5g5r5 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD16 *pixel = (const CARD16 *)bits + x;
    const CARD16 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        CARD32  r,g,b;

        b = ((p & 0x7c00) | ((p & 0x7000) >> 5)) >> 7;
        g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
        r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
        *buffer++ = (0xff000000 | r | g | b);
    }
}

static FASTCALL void
fbFetch_a4r4g4b4 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD16 *pixel = (const CARD16 *)bits + x;
    const CARD16 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        CARD32  r,g,b, a;

        a = ((p & 0xf000) | ((p & 0xf000) >> 4)) << 16;
        r = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12;
        g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
        b = ((p & 0x000f) | ((p & 0x000f) << 4));
        *buffer++ = (a | r | g | b);
    }
}

static FASTCALL void
fbFetch_x4r4g4b4 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD16 *pixel = (const CARD16 *)bits + x;
    const CARD16 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        CARD32  r,g,b;

        r = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12;
        g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
        b = ((p & 0x000f) | ((p & 0x000f) << 4));
        *buffer++ = (0xff000000 | r | g | b);
    }
}

static FASTCALL void
fbFetch_a4b4g4r4 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD16 *pixel = (const CARD16 *)bits + x;
    const CARD16 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        CARD32  r,g,b, a;

        a = ((p & 0xf000) | ((p & 0xf000) >> 4)) << 16;
        b = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12;
        g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
        r = ((p & 0x000f) | ((p & 0x000f) << 4));
        *buffer++ = (a | r | g | b);
	}
}

static FASTCALL void
fbFetch_x4b4g4r4 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD16 *pixel = (const CARD16 *)bits + x;
    const CARD16 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        CARD32  r,g,b;

        b = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12;
        g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
        r = ((p & 0x000f) | ((p & 0x000f) << 4));
        *buffer++ = (0xff000000 | r | g | b);
	}
}

static FASTCALL void
fbFetch_a8 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD8 *pixel = (const CARD8 *)bits + x;
    const CARD8 *end = pixel + width;
    while (pixel < end) {
        *buffer++ = (*pixel++) << 24;
    }
}

static FASTCALL void
fbFetch_r3g3b2 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD8 *pixel = (const CARD8 *)bits + x;
    const CARD8 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        CARD32  r,g,b;

        r = ((p & 0xe0) | ((p & 0xe0) >> 3) | ((p & 0xc0) >> 6)) << 16;
        g = ((p & 0x1c) | ((p & 0x18) >> 3) | ((p & 0x1c) << 3)) << 8;
        b = (((p & 0x03)     ) |
             ((p & 0x03) << 2) |
             ((p & 0x03) << 4) |
             ((p & 0x03) << 6));
        *buffer++ = (0xff000000 | r | g | b);
    }
}

static FASTCALL void
fbFetch_b2g3r3 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD8 *pixel = (const CARD8 *)bits + x;
    const CARD8 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        CARD32  r,g,b;

        b = (((p & 0xc0)     ) |
             ((p & 0xc0) >> 2) |
             ((p & 0xc0) >> 4) |
             ((p & 0xc0) >> 6));
        g = ((p & 0x38) | ((p & 0x38) >> 3) | ((p & 0x30) << 2)) << 8;
        r = (((p & 0x07)     ) |
             ((p & 0x07) << 3) |
             ((p & 0x06) << 6)) << 16;
        *buffer++ = (0xff000000 | r | g | b);
	}
}

static FASTCALL void
fbFetch_a2r2g2b2 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD8 *pixel = (const CARD8 *)bits + x;
    const CARD8 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        CARD32   a,r,g,b;

        a = ((p & 0xc0) * 0x55) << 18;
        r = ((p & 0x30) * 0x55) << 12;
        g = ((p & 0x0c) * 0x55) << 6;
        b = ((p & 0x03) * 0x55);
        *buffer++ = a|r|g|b;
    }
}

static FASTCALL void
fbFetch_a2b2g2r2 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD8 *pixel = (const CARD8 *)bits + x;
    const CARD8 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        CARD32   a,r,g,b;

        a = ((p & 0xc0) * 0x55) << 18;
        b = ((p & 0x30) * 0x55) >> 6;
        g = ((p & 0x0c) * 0x55) << 6;
        r = ((p & 0x03) * 0x55) << 16;
        *buffer++ = a|r|g|b;
    }
}

static FASTCALL void
fbFetch_c8 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD8 *pixel = (const CARD8 *)bits + x;
    const CARD8 *end = pixel + width;
    while (pixel < end) {
        CARD32  p = *pixel++;
        *buffer++ = indexed->rgba[p];
    }
}

static FASTCALL void
fbFetch_x4a4 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    const CARD8 *pixel = (const CARD8 *)bits + x;
    const CARD8 *end = pixel + width;
    while (pixel < end) {
	CARD8 p = (*pixel++) & 0xf;
        *buffer++ = (p | (p << 4)) << 24;
    }
}

#define Fetch8(l,o)    (((CARD8 *) (l))[(o) >> 2])
#if IMAGE_BYTE_ORDER == MSBFirst
#define Fetch4(l,o)    ((o) & 2 ? Fetch8(l,o) & 0xf : Fetch8(l,o) >> 4)
#else
#define Fetch4(l,o)    ((o) & 2 ? Fetch8(l,o) >> 4 : Fetch8(l,o) & 0xf)
#endif

static FASTCALL void
fbFetch_a4 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  p = Fetch4(bits, i + x);

        p |= p << 4;
        *buffer++ = p << 24;
    }
}

static FASTCALL void
fbFetch_r1g2b1 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  p = Fetch4(bits, i + x);
        CARD32  r,g,b;

        r = ((p & 0x8) * 0xff) << 13;
        g = ((p & 0x6) * 0x55) << 7;
        b = ((p & 0x1) * 0xff);
        *buffer++ = 0xff000000|r|g|b;
    }
}

static FASTCALL void
fbFetch_b1g2r1 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  p = Fetch4(bits, i + x);
        CARD32  r,g,b;

        b = ((p & 0x8) * 0xff) >> 3;
        g = ((p & 0x6) * 0x55) << 7;
        r = ((p & 0x1) * 0xff) << 16;
        *buffer++ = 0xff000000|r|g|b;
    }
}

static FASTCALL void
fbFetch_a1r1g1b1 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  p = Fetch4(bits, i + x);
        CARD32  a,r,g,b;

        a = ((p & 0x8) * 0xff) << 21;
        r = ((p & 0x4) * 0xff) << 14;
        g = ((p & 0x2) * 0xff) << 7;
        b = ((p & 0x1) * 0xff);
        *buffer++ = a|r|g|b;
    }
}

static FASTCALL void
fbFetch_a1b1g1r1 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  p = Fetch4(bits, i + x);
        CARD32  a,r,g,b;

        a = ((p & 0x8) * 0xff) << 21;
        r = ((p & 0x4) * 0xff) >> 3;
        g = ((p & 0x2) * 0xff) << 7;
        b = ((p & 0x1) * 0xff) << 16;
        *buffer++ = a|r|g|b;
    }
}

static FASTCALL void
fbFetch_c4 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  p = Fetch4(bits, i + x);

        *buffer++ = indexed->rgba[p];
    }
}


static FASTCALL void
fbFetch_a1 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  p = ((CARD32 *)bits)[(i + x) >> 5];
        CARD32  a;
#if BITMAP_BIT_ORDER == MSBFirst
        a = p >> (0x1f - ((i+x) & 0x1f));
#else
        a = p >> ((i+x) & 0x1f);
#endif
        a = a & 1;
        a |= a << 1;
        a |= a << 2;
        a |= a << 4;
        *buffer++ = a << 24;
    }
}

static FASTCALL void
fbFetch_g1 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  p = ((CARD32 *)bits)[(i+x) >> 5];
        CARD32 a;
#if BITMAP_BIT_ORDER == MSBFirst
        a = p >> (0x1f - ((i+x) & 0x1f));
#else
        a = p >> ((i+x) & 0x1f);
#endif
        a = a & 1;
        *buffer++ = indexed->rgba[a];
    }
}

static fetchProc fetchProcForPicture (PicturePtr pict)
{
    switch(pict->format) {
    case PICT_a8r8g8b8: return fbFetch_a8r8g8b8;
    case PICT_x8r8g8b8: return fbFetch_x8r8g8b8;
    case PICT_a8b8g8r8: return fbFetch_a8b8g8r8;
    case PICT_x8b8g8r8: return fbFetch_x8b8g8r8;

        /* 24bpp formats */
    case PICT_r8g8b8: return fbFetch_r8g8b8;
    case PICT_b8g8r8: return fbFetch_b8g8r8;

        /* 16bpp formats */
    case PICT_r5g6b5: return fbFetch_r5g6b5;
    case PICT_b5g6r5: return fbFetch_b5g6r5;

    case PICT_a1r5g5b5: return fbFetch_a1r5g5b5;
    case PICT_x1r5g5b5: return fbFetch_x1r5g5b5;
    case PICT_a1b5g5r5: return fbFetch_a1b5g5r5;
    case PICT_x1b5g5r5: return fbFetch_x1b5g5r5;
    case PICT_a4r4g4b4: return fbFetch_a4r4g4b4;
    case PICT_x4r4g4b4: return fbFetch_x4r4g4b4;
    case PICT_a4b4g4r4: return fbFetch_a4b4g4r4;
    case PICT_x4b4g4r4: return fbFetch_x4b4g4r4;

        /* 8bpp formats */
    case PICT_a8: return  fbFetch_a8;
    case PICT_r3g3b2: return fbFetch_r3g3b2;
    case PICT_b2g3r3: return fbFetch_b2g3r3;
    case PICT_a2r2g2b2: return fbFetch_a2r2g2b2;
    case PICT_a2b2g2r2: return fbFetch_a2b2g2r2;
    case PICT_c8: return  fbFetch_c8;
    case PICT_g8: return  fbFetch_c8;
    case PICT_x4a4: return fbFetch_x4a4;

        /* 4bpp formats */
    case PICT_a4: return  fbFetch_a4;
    case PICT_r1g2b1: return fbFetch_r1g2b1;
    case PICT_b1g2r1: return fbFetch_b1g2r1;
    case PICT_a1r1g1b1: return fbFetch_a1r1g1b1;
    case PICT_a1b1g1r1: return fbFetch_a1b1g1r1;
    case PICT_c4: return  fbFetch_c4;
    case PICT_g4: return  fbFetch_c4;

        /* 1bpp formats */
    case PICT_a1: return  fbFetch_a1;
    case PICT_g1: return  fbFetch_g1;
    default:
        return NULL;
    }
}

/*
 * Pixel wise fetching
 */

typedef FASTCALL CARD32 (*fetchPixelProc)(const FbBits *bits, int offset, miIndexedPtr indexed);

static FASTCALL CARD32
fbFetchPixel_a8r8g8b8 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    return ((CARD32 *)bits)[offset];
}

static FASTCALL CARD32
fbFetchPixel_x8r8g8b8 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    return ((CARD32 *)bits)[offset] | 0xff000000;
}

static FASTCALL CARD32
fbFetchPixel_a8b8g8r8 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = ((CARD32 *)bits)[offset];

    return ((pixel & 0xff000000) |
	    ((pixel >> 16) & 0xff) |
	    (pixel & 0x0000ff00) |
	    ((pixel & 0xff) << 16));
}

static FASTCALL CARD32
fbFetchPixel_x8b8g8r8 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = ((CARD32 *)bits)[offset];

    return ((0xff000000) |
	    ((pixel >> 16) & 0xff) |
	    (pixel & 0x0000ff00) |
	    ((pixel & 0xff) << 16));
}

static FASTCALL CARD32
fbFetchPixel_r8g8b8 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD8   *pixel = ((CARD8 *) bits) + (offset*3);
#if IMAGE_BYTE_ORDER == MSBFirst
    return (0xff000000 |
	    (pixel[0] << 16) |
	    (pixel[1] << 8) |
	    (pixel[2]));
#else
    return (0xff000000 |
            (pixel[2] << 16) |
            (pixel[1] << 8) |
            (pixel[0]));
#endif
}

static FASTCALL CARD32
fbFetchPixel_b8g8r8 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD8   *pixel = ((CARD8 *) bits) + (offset*3);
#if IMAGE_BYTE_ORDER == MSBFirst
    return (0xff000000 |
	    (pixel[2] << 16) |
	    (pixel[1] << 8) |
	    (pixel[0]));
#else
    return (0xff000000 |
	    (pixel[0] << 16) |
	    (pixel[1] << 8) |
	    (pixel[2]));
#endif
}

static FASTCALL CARD32
fbFetchPixel_r5g6b5 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = ((CARD16 *) bits)[offset];
    CARD32  r,g,b;

    r = ((pixel & 0xf800) | ((pixel & 0xe000) >> 5)) << 8;
    g = ((pixel & 0x07e0) | ((pixel & 0x0600) >> 6)) << 5;
    b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2;
    return (0xff000000 | r | g | b);
}

static FASTCALL CARD32
fbFetchPixel_b5g6r5 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = ((CARD16 *) bits)[offset];
    CARD32  r,g,b;

    b = ((pixel & 0xf800) | ((pixel & 0xe000) >> 5)) >> 8;
    g = ((pixel & 0x07e0) | ((pixel & 0x0600) >> 6)) << 5;
    r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14;
    return (0xff000000 | r | g | b);
}

static FASTCALL CARD32
fbFetchPixel_a1r5g5b5 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = ((CARD16 *) bits)[offset];
    CARD32  a,r,g,b;

    a = (CARD32) ((CARD8) (0 - ((pixel & 0x8000) >> 15))) << 24;
    r = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) << 9;
    g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6;
    b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2;
    return (a | r | g | b);
}

static FASTCALL CARD32
fbFetchPixel_x1r5g5b5 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = ((CARD16 *) bits)[offset];
    CARD32  r,g,b;

    r = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) << 9;
    g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6;
    b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2;
    return (0xff000000 | r | g | b);
}

static FASTCALL CARD32
fbFetchPixel_a1b5g5r5 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = ((CARD16 *) bits)[offset];
    CARD32  a,r,g,b;

    a = (CARD32) ((CARD8) (0 - ((pixel & 0x8000) >> 15))) << 24;
    b = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) >> 7;
    g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6;
    r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14;
    return (a | r | g | b);
}

static FASTCALL CARD32
fbFetchPixel_x1b5g5r5 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = ((CARD16 *) bits)[offset];
    CARD32  r,g,b;

    b = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) >> 7;
    g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6;
    r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14;
    return (0xff000000 | r | g | b);
}

static FASTCALL CARD32
fbFetchPixel_a4r4g4b4 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = ((CARD16 *) bits)[offset];
    CARD32  a,r,g,b;

    a = ((pixel & 0xf000) | ((pixel & 0xf000) >> 4)) << 16;
    r = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) << 12;
    g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8;
    b = ((pixel & 0x000f) | ((pixel & 0x000f) << 4));
    return (a | r | g | b);
}

static FASTCALL CARD32
fbFetchPixel_x4r4g4b4 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = ((CARD16 *) bits)[offset];
    CARD32  r,g,b;

    r = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) << 12;
    g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8;
    b = ((pixel & 0x000f) | ((pixel & 0x000f) << 4));
    return (0xff000000 | r | g | b);
}

static FASTCALL CARD32
fbFetchPixel_a4b4g4r4 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = ((CARD16 *) bits)[offset];
    CARD32  a,r,g,b;

    a = ((pixel & 0xf000) | ((pixel & 0xf000) >> 4)) << 16;
    b = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) << 12;
    g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8;
    r = ((pixel & 0x000f) | ((pixel & 0x000f) << 4));
    return (a | r | g | b);
}

static FASTCALL CARD32
fbFetchPixel_x4b4g4r4 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = ((CARD16 *) bits)[offset];
    CARD32  r,g,b;

    b = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) << 12;
    g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8;
    r = ((pixel & 0x000f) | ((pixel & 0x000f) << 4));
    return (0xff000000 | r | g | b);
}

static FASTCALL CARD32
fbFetchPixel_a8 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32   pixel = ((CARD8 *) bits)[offset];

    return pixel << 24;
}

static FASTCALL CARD32
fbFetchPixel_r3g3b2 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32   pixel = ((CARD8 *) bits)[offset];
    CARD32  r,g,b;

    r = ((pixel & 0xe0) | ((pixel & 0xe0) >> 3) | ((pixel & 0xc0) >> 6)) << 16;
    g = ((pixel & 0x1c) | ((pixel & 0x18) >> 3) | ((pixel & 0x1c) << 3)) << 8;
    b = (((pixel & 0x03)     ) |
	 ((pixel & 0x03) << 2) |
	 ((pixel & 0x03) << 4) |
	 ((pixel & 0x03) << 6));
    return (0xff000000 | r | g | b);
}

static FASTCALL CARD32
fbFetchPixel_b2g3r3 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32   pixel = ((CARD8 *) bits)[offset];
    CARD32  r,g,b;

    b = (((pixel & 0xc0)     ) |
	 ((pixel & 0xc0) >> 2) |
	 ((pixel & 0xc0) >> 4) |
	 ((pixel & 0xc0) >> 6));
    g = ((pixel & 0x38) | ((pixel & 0x38) >> 3) | ((pixel & 0x30) << 2)) << 8;
    r = (((pixel & 0x07)     ) |
	 ((pixel & 0x07) << 3) |
	 ((pixel & 0x06) << 6)) << 16;
    return (0xff000000 | r | g | b);
}

static FASTCALL CARD32
fbFetchPixel_a2r2g2b2 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32   pixel = ((CARD8 *) bits)[offset];
    CARD32   a,r,g,b;

    a = ((pixel & 0xc0) * 0x55) << 18;
    r = ((pixel & 0x30) * 0x55) << 12;
    g = ((pixel & 0x0c) * 0x55) << 6;
    b = ((pixel & 0x03) * 0x55);
    return a|r|g|b;
}

static FASTCALL CARD32
fbFetchPixel_a2b2g2r2 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32   pixel = ((CARD8 *) bits)[offset];
    CARD32   a,r,g,b;

    a = ((pixel & 0xc0) * 0x55) << 18;
    b = ((pixel & 0x30) * 0x55) >> 6;
    g = ((pixel & 0x0c) * 0x55) << 6;
    r = ((pixel & 0x03) * 0x55) << 16;
    return a|r|g|b;
}

static FASTCALL CARD32
fbFetchPixel_c8 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32   pixel = ((CARD8 *) bits)[offset];
    return indexed->rgba[pixel];
}

#define Fetch8(l,o)    (((CARD8 *) (l))[(o) >> 2])
#if IMAGE_BYTE_ORDER == MSBFirst
#define Fetch4(l,o)    ((o) & 2 ? Fetch8(l,o) & 0xf : Fetch8(l,o) >> 4)
#else
#define Fetch4(l,o)    ((o) & 2 ? Fetch8(l,o) >> 4 : Fetch8(l,o) & 0xf)
#endif

static FASTCALL CARD32
fbFetchPixel_a4 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = Fetch4(bits, offset);

    pixel |= pixel << 4;
    return pixel << 24;
}

static FASTCALL CARD32
fbFetchPixel_r1g2b1 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = Fetch4(bits, offset);
    CARD32  r,g,b;

    r = ((pixel & 0x8) * 0xff) << 13;
    g = ((pixel & 0x6) * 0x55) << 7;
    b = ((pixel & 0x1) * 0xff);
    return 0xff000000|r|g|b;
}

static FASTCALL CARD32
fbFetchPixel_b1g2r1 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = Fetch4(bits, offset);
    CARD32  r,g,b;

    b = ((pixel & 0x8) * 0xff) >> 3;
    g = ((pixel & 0x6) * 0x55) << 7;
    r = ((pixel & 0x1) * 0xff) << 16;
    return 0xff000000|r|g|b;
}

static FASTCALL CARD32
fbFetchPixel_a1r1g1b1 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = Fetch4(bits, offset);
    CARD32  a,r,g,b;

    a = ((pixel & 0x8) * 0xff) << 21;
    r = ((pixel & 0x4) * 0xff) << 14;
    g = ((pixel & 0x2) * 0xff) << 7;
    b = ((pixel & 0x1) * 0xff);
    return a|r|g|b;
}

static FASTCALL CARD32
fbFetchPixel_a1b1g1r1 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = Fetch4(bits, offset);
    CARD32  a,r,g,b;

    a = ((pixel & 0x8) * 0xff) << 21;
    r = ((pixel & 0x4) * 0xff) >> 3;
    g = ((pixel & 0x2) * 0xff) << 7;
    b = ((pixel & 0x1) * 0xff) << 16;
    return a|r|g|b;
}

static FASTCALL CARD32
fbFetchPixel_c4 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = Fetch4(bits, offset);

    return indexed->rgba[pixel];
}


static FASTCALL CARD32
fbFetchPixel_a1 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32  pixel = ((CARD32 *)bits)[offset >> 5];
    CARD32  a;
#if BITMAP_BIT_ORDER == MSBFirst
    a = pixel >> (0x1f - (offset & 0x1f));
#else
    a = pixel >> (offset & 0x1f);
#endif
    a = a & 1;
    a |= a << 1;
    a |= a << 2;
    a |= a << 4;
    return a << 24;
}

static FASTCALL CARD32
fbFetchPixel_g1 (const FbBits *bits, int offset, miIndexedPtr indexed)
{
    CARD32 pixel = ((CARD32 *)bits)[offset >> 5];
    CARD32 a;
#if BITMAP_BIT_ORDER == MSBFirst
    a = pixel >> (0x1f - (offset & 0x1f));
#else
    a = pixel >> (offset & 0x1f);
#endif
    a = a & 1;
    return indexed->rgba[a];
}

static fetchPixelProc fetchPixelProcForPicture (PicturePtr pict)
{
    switch(pict->format) {
    case PICT_a8r8g8b8: return fbFetchPixel_a8r8g8b8;
    case PICT_x8r8g8b8: return fbFetchPixel_x8r8g8b8;
    case PICT_a8b8g8r8: return fbFetchPixel_a8b8g8r8;
    case PICT_x8b8g8r8: return fbFetchPixel_x8b8g8r8;

        /* 24bpp formats */
    case PICT_r8g8b8: return fbFetchPixel_r8g8b8;
    case PICT_b8g8r8: return fbFetchPixel_b8g8r8;

        /* 16bpp formats */
    case PICT_r5g6b5: return fbFetchPixel_r5g6b5;
    case PICT_b5g6r5: return fbFetchPixel_b5g6r5;

    case PICT_a1r5g5b5: return fbFetchPixel_a1r5g5b5;
    case PICT_x1r5g5b5: return fbFetchPixel_x1r5g5b5;
    case PICT_a1b5g5r5: return fbFetchPixel_a1b5g5r5;
    case PICT_x1b5g5r5: return fbFetchPixel_x1b5g5r5;
    case PICT_a4r4g4b4: return fbFetchPixel_a4r4g4b4;
    case PICT_x4r4g4b4: return fbFetchPixel_x4r4g4b4;
    case PICT_a4b4g4r4: return fbFetchPixel_a4b4g4r4;
    case PICT_x4b4g4r4: return fbFetchPixel_x4b4g4r4;

        /* 8bpp formats */
    case PICT_a8: return  fbFetchPixel_a8;
    case PICT_r3g3b2: return fbFetchPixel_r3g3b2;
    case PICT_b2g3r3: return fbFetchPixel_b2g3r3;
    case PICT_a2r2g2b2: return fbFetchPixel_a2r2g2b2;
    case PICT_a2b2g2r2: return fbFetchPixel_a2b2g2r2;
    case PICT_c8: return  fbFetchPixel_c8;
    case PICT_g8: return  fbFetchPixel_c8;

        /* 4bpp formats */
    case PICT_a4: return  fbFetchPixel_a4;
    case PICT_r1g2b1: return fbFetchPixel_r1g2b1;
    case PICT_b1g2r1: return fbFetchPixel_b1g2r1;
    case PICT_a1r1g1b1: return fbFetchPixel_a1r1g1b1;
    case PICT_a1b1g1r1: return fbFetchPixel_a1b1g1r1;
    case PICT_c4: return  fbFetchPixel_c4;
    case PICT_g4: return  fbFetchPixel_c4;

        /* 1bpp formats */
    case PICT_a1: return  fbFetchPixel_a1;
    case PICT_g1: return  fbFetchPixel_g1;
    default:
        return NULL;
    }
}



/*
 * All the store functions
 */

typedef FASTCALL void (*storeProc) (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed);

#define Splita(v)	CARD32	a = ((v) >> 24), r = ((v) >> 16) & 0xff, g = ((v) >> 8) & 0xff, b = (v) & 0xff
#define Split(v)	CARD32	r = ((v) >> 16) & 0xff, g = ((v) >> 8) & 0xff, b = (v) & 0xff

static FASTCALL void
fbStore_a8r8g8b8 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    memcpy(((CARD32 *)bits) + x, values, width*sizeof(CARD32));
}

static FASTCALL void
fbStore_x8r8g8b8 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD32 *pixel = (CARD32 *)bits + x;
    for (i = 0; i < width; ++i)
        *pixel++ = values[i] & 0xffffff;
}

static FASTCALL void
fbStore_a8b8g8r8 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD32 *pixel = (CARD32 *)bits + x;
    for (i = 0; i < width; ++i)
        *pixel++ = (values[i] & 0xff00ff00) | ((values[i] >> 16) & 0xff) | ((values[i] & 0xff) << 16);
}

static FASTCALL void
fbStore_x8b8g8r8 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD32 *pixel = (CARD32 *)bits + x;
    for (i = 0; i < width; ++i)
        *pixel++ = (values[i] & 0x0000ff00) | ((values[i] >> 16) & 0xff) | ((values[i] & 0xff) << 16);
}

static FASTCALL void
fbStore_r8g8b8 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD8 *pixel = ((CARD8 *) bits) + 3*x;
    for (i = 0; i < width; ++i) {
        Store24(pixel, values[i]);
        pixel += 3;
    }
}

static FASTCALL void
fbStore_b8g8r8 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD8 *pixel = ((CARD8 *) bits) + 3*x;
    for (i = 0; i < width; ++i) {
#if IMAGE_BYTE_ORDER == MSBFirst
        *pixel++ = Blue(values[i]);
        *pixel++ = Green(values[i]);
        *pixel++ = Red(values[i]);
#else
        *pixel++ = Red(values[i]);
        *pixel++ = Green(values[i]);
        *pixel++ = Blue(values[i]);
#endif
    }
}

static FASTCALL void
fbStore_r5g6b5 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD16 *pixel = ((CARD16 *) bits) + x;
    for (i = 0; i < width; ++i) {
        CARD32 s = values[i];
        *pixel++ = ((s >> 3) & 0x001f) |
                   ((s >> 5) & 0x07e0) |
                   ((s >> 8) & 0xf800);
    }
}

static FASTCALL void
fbStore_b5g6r5 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD16  *pixel = ((CARD16 *) bits) + x;
    for (i = 0; i < width; ++i) {
        Split(values[i]);
        *pixel++ = (((b << 8) & 0xf800) |
                    ((g << 3) & 0x07e0) |
                    ((r >> 3)         ));
    }
}

static FASTCALL void
fbStore_a1r5g5b5 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD16  *pixel = ((CARD16 *) bits) + x;
    for (i = 0; i < width; ++i) {
        Splita(values[i]);
        *pixel++ = (((a << 8) & 0x8000) |
                    ((r << 7) & 0x7c00) |
                    ((g << 2) & 0x03e0) |
                    ((b >> 3)         ));
    }
}

static FASTCALL void
fbStore_x1r5g5b5 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD16  *pixel = ((CARD16 *) bits) + x;
    for (i = 0; i < width; ++i) {
        Split(values[i]);
        *pixel++ = (((r << 7) & 0x7c00) |
                    ((g << 2) & 0x03e0) |
                    ((b >> 3)         ));
    }
}

static FASTCALL void
fbStore_a1b5g5r5 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD16  *pixel = ((CARD16 *) bits) + x;
    for (i = 0; i < width; ++i) {
        Splita(values[i]);
        *pixel++ = (((a << 8) & 0x8000) |
                    ((b << 7) & 0x7c00) |
                    ((g << 2) & 0x03e0) |
                    ((r >> 3)         ));
	}
}

static FASTCALL void
fbStore_x1b5g5r5 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD16  *pixel = ((CARD16 *) bits) + x;
    for (i = 0; i < width; ++i) {
        Split(values[i]);
        *pixel++ = (((b << 7) & 0x7c00) |
                    ((g << 2) & 0x03e0) |
                    ((r >> 3)         ));
    }
}

static FASTCALL void
fbStore_a4r4g4b4 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD16  *pixel = ((CARD16 *) bits) + x;
    for (i = 0; i < width; ++i) {
        Splita(values[i]);
        *pixel++ = (((a << 8) & 0xf000) |
                    ((r << 4) & 0x0f00) |
                    ((g     ) & 0x00f0) |
                    ((b >> 4)         ));
    }
}

static FASTCALL void
fbStore_x4r4g4b4 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD16  *pixel = ((CARD16 *) bits) + x;
    for (i = 0; i < width; ++i) {
        Split(values[i]);
        *pixel++ = (((r << 4) & 0x0f00) |
                    ((g     ) & 0x00f0) |
                    ((b >> 4)         ));
    }
}

static FASTCALL void
fbStore_a4b4g4r4 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD16  *pixel = ((CARD16 *) bits) + x;
    for (i = 0; i < width; ++i) {
        Splita(values[i]);
        *pixel++ = (((a << 8) & 0xf000) |
                    ((b << 4) & 0x0f00) |
                    ((g     ) & 0x00f0) |
                    ((r >> 4)         ));
    }
}

static FASTCALL void
fbStore_x4b4g4r4 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD16  *pixel = ((CARD16 *) bits) + x;
    for (i = 0; i < width; ++i) {
        Split(values[i]);
        *pixel++ = (((b << 4) & 0x0f00) |
                    ((g     ) & 0x00f0) |
                    ((r >> 4)         ));
    }
}

static FASTCALL void
fbStore_a8 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD8   *pixel = ((CARD8 *) bits) + x;
    for (i = 0; i < width; ++i) {
        *pixel++ = values[i] >> 24;
    }
}

static FASTCALL void
fbStore_r3g3b2 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD8   *pixel = ((CARD8 *) bits) + x;
    for (i = 0; i < width; ++i) {
        Split(values[i]);
        *pixel++ = (((r     ) & 0xe0) |
                    ((g >> 3) & 0x1c) |
                    ((b >> 6)       ));
    }
}

static FASTCALL void
fbStore_b2g3r3 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD8   *pixel = ((CARD8 *) bits) + x;
    for (i = 0; i < width; ++i) {
        Split(values[i]);
        *pixel++ = (((b     ) & 0xe0) |
                    ((g >> 3) & 0x1c) |
                    ((r >> 6)       ));
    }
}

static FASTCALL void
fbStore_a2r2g2b2 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD8   *pixel = ((CARD8 *) bits) + x;
    for (i = 0; i < width; ++i) {
        Splita(values[i]);
        *pixel++ = (((a     ) & 0xc0) |
                    ((r >> 2) & 0x30) |
                    ((g >> 4) & 0x0c) |
                    ((b >> 6)       ));
    }
}

static FASTCALL void
fbStore_c8 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD8   *pixel = ((CARD8 *) bits) + x;
    for (i = 0; i < width; ++i) {
        *pixel++ = miIndexToEnt24(indexed,values[i]);
    }
}

static FASTCALL void
fbStore_x4a4 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    CARD8   *pixel = ((CARD8 *) bits) + x;
    for (i = 0; i < width; ++i) {
        *pixel++ = values[i] >> 28;
    }
}

#define Store8(l,o,v)  (((CARD8 *) l)[(o) >> 3] = (v))
#if IMAGE_BYTE_ORDER == MSBFirst
#define Store4(l,o,v)  Store8(l,o,((o) & 4 ? \
				   (Fetch8(l,o) & 0xf0) | (v) : \
				   (Fetch8(l,o) & 0x0f) | ((v) << 4)))
#else
#define Store4(l,o,v)  Store8(l,o,((o) & 4 ? \
				   (Fetch8(l,o) & 0x0f) | ((v) << 4) : \
				   (Fetch8(l,o) & 0xf0) | (v)))
#endif

static FASTCALL void
fbStore_a4 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        Store4(bits, i + x, values[i]>>28);
    }
}

static FASTCALL void
fbStore_r1g2b1 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  pixel;

        Split(values[i]);
        pixel = (((r >> 4) & 0x8) |
                 ((g >> 5) & 0x6) |
                 ((b >> 7)      ));
        Store4(bits, i + x, pixel);
    }
}

static FASTCALL void
fbStore_b1g2r1 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  pixel;

        Split(values[i]);
        pixel = (((b >> 4) & 0x8) |
                 ((g >> 5) & 0x6) |
                 ((r >> 7)      ));
        Store4(bits, i + x, pixel);
    }
}

static FASTCALL void
fbStore_a1r1g1b1 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  pixel;
        Splita(values[i]);
        pixel = (((a >> 4) & 0x8) |
                 ((r >> 5) & 0x4) |
                 ((g >> 6) & 0x2) |
                 ((b >> 7)      ));
        Store4(bits, i + x, pixel);
    }
}

static FASTCALL void
fbStore_a1b1g1r1 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  pixel;
        Splita(values[i]);
        pixel = (((a >> 4) & 0x8) |
                 ((b >> 5) & 0x4) |
                 ((g >> 6) & 0x2) |
                 ((r >> 7)      ));
        Store4(bits, i + x, pixel);
    }
}

static FASTCALL void
fbStore_c4 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  pixel;

        pixel = miIndexToEnt24(indexed, values[i]);
        Store4(bits, i + x, pixel);
    }
}

static FASTCALL void
fbStore_a1 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  *pixel = ((CARD32 *) bits) + ((i+x) >> 5);
        CARD32  mask = FbStipMask((i+x) & 0x1f, 1);

        CARD32 v = values[i] & 0x80000000 ? mask : 0;
        *pixel = (*pixel & ~mask) | v;
    }
}

static FASTCALL void
fbStore_g1 (FbBits *bits, const CARD32 *values, int x, int width, miIndexedPtr indexed)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  *pixel = ((CARD32 *) bits) + ((i+x) >> 5);
        CARD32  mask = FbStipMask((i+x) & 0x1f, 1);

        CARD32 v = miIndexToEntY24(indexed,values[i]) ? mask : 0;
        *pixel = (*pixel & ~mask) | v;
    }
}


static storeProc storeProcForPicture (PicturePtr pict)
{
    switch(pict->format) {
    case PICT_a8r8g8b8: return fbStore_a8r8g8b8;
    case PICT_x8r8g8b8: return fbStore_x8r8g8b8;
    case PICT_a8b8g8r8: return fbStore_a8b8g8r8;
    case PICT_x8b8g8r8: return fbStore_x8b8g8r8;

        /* 24bpp formats */
    case PICT_r8g8b8: return fbStore_r8g8b8;
    case PICT_b8g8r8: return fbStore_b8g8r8;

        /* 16bpp formats */
    case PICT_r5g6b5: return fbStore_r5g6b5;
    case PICT_b5g6r5: return fbStore_b5g6r5;

    case PICT_a1r5g5b5: return fbStore_a1r5g5b5;
    case PICT_x1r5g5b5: return fbStore_x1r5g5b5;
    case PICT_a1b5g5r5: return fbStore_a1b5g5r5;
    case PICT_x1b5g5r5: return fbStore_x1b5g5r5;
    case PICT_a4r4g4b4: return fbStore_a4r4g4b4;
    case PICT_x4r4g4b4: return fbStore_x4r4g4b4;
    case PICT_a4b4g4r4: return fbStore_a4b4g4r4;
    case PICT_x4b4g4r4: return fbStore_x4b4g4r4;

        /* 8bpp formats */
    case PICT_a8: return  fbStore_a8;
    case PICT_r3g3b2: return fbStore_r3g3b2;
    case PICT_b2g3r3: return fbStore_b2g3r3;
    case PICT_a2r2g2b2: return fbStore_a2r2g2b2;
    case PICT_c8: return  fbStore_c8;
    case PICT_g8: return  fbStore_c8;
    case PICT_x4a4: return fbStore_x4a4;

        /* 4bpp formats */
    case PICT_a4: return  fbStore_a4;
    case PICT_r1g2b1: return fbStore_r1g2b1;
    case PICT_b1g2r1: return fbStore_b1g2r1;
    case PICT_a1r1g1b1: return fbStore_a1r1g1b1;
    case PICT_a1b1g1r1: return fbStore_a1b1g1r1;
    case PICT_c4: return  fbStore_c4;
    case PICT_g4: return  fbStore_c4;

        /* 1bpp formats */
    case PICT_a1: return  fbStore_a1;
    case PICT_g1: return  fbStore_g1;
    default:
        return NULL;
    }
}


/*
 * Combine src and mask
 */
static FASTCALL void
fbCombineMaskU (CARD32 *src, const CARD32 *mask, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 a = mask[i] >> 24;
        CARD32 s = src[i];
        FbByteMul(s, a);
        src[i] = s;
    }
}

/*
 * All of the composing functions
 */

static FASTCALL void
fbCombineClear (CARD32 *dest, const CARD32 *src, int width)
{
    memset(dest, 0, width*sizeof(CARD32));
}

static FASTCALL void
fbCombineSrcU (CARD32 *dest, const CARD32 *src, int width)
{
    memcpy(dest, src, width*sizeof(CARD32));
}


static FASTCALL void
fbCombineOverU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 s = src[i];
        CARD32 d = dest[i];
        CARD32 ia = Alpha(~s);

        FbByteMulAdd(d, ia, s);
        dest[i] = d;
    }
}

static FASTCALL void
fbCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 s = src[i];
        CARD32 d = dest[i];
        CARD32 ia = Alpha(~dest[i]);
        FbByteMulAdd(s, ia, d);
        dest[i] = s;
    }
}

static FASTCALL void
fbCombineInU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 s = src[i];
        CARD32 a = Alpha(dest[i]);
        FbByteMul(s, a);
        dest[i] = s;
    }
}

static FASTCALL void
fbCombineInReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 d = dest[i];
        CARD32 a = Alpha(src[i]);
        FbByteMul(d, a);
        dest[i] = d;
    }
}

static FASTCALL void
fbCombineOutU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 s = src[i];
        CARD32 a = Alpha(~dest[i]);
        FbByteMul(s, a);
        dest[i] = s;
    }
}

static FASTCALL void
fbCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 d = dest[i];
        CARD32 a = Alpha(~src[i]);
        FbByteMul(d, a);
        dest[i] = d;
    }
}

static FASTCALL void
fbCombineAtopU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 s = src[i];
        CARD32 d = dest[i];
        CARD32 dest_a = Alpha(d);
        CARD32 src_ia = Alpha(~s);

        FbByteAddMul(s, dest_a, d, src_ia);
        dest[i] = s;
    }
}

static FASTCALL void
fbCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 s = src[i];
        CARD32 d = dest[i];
        CARD32 src_a = Alpha(s);
        CARD32 dest_ia = Alpha(~d);

        FbByteAddMul(s, dest_ia, d, src_a);
        dest[i] = s;
    }
}

static FASTCALL void
fbCombineXorU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 s = src[i];
        CARD32 d = dest[i];
        CARD32 src_ia = Alpha(~s);
        CARD32 dest_ia = Alpha(~d);

        FbByteAddMul(s, dest_ia, d, src_ia);
        dest[i] = s;
    }
}

static FASTCALL void
fbCombineAddU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 s = src[i];
        CARD32 d = dest[i];
        FbByteAdd(d, s);
        dest[i] = d;
    }
}

static FASTCALL void
fbCombineSaturateU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  s = src[i];
        CARD32 d = dest[i];
        CARD16  sa, da;

        sa = s >> 24;
        da = ~d >> 24;
        if (sa > da)
        {
            sa = FbIntDiv(da, sa);
            FbByteMul(s, sa);
        }
        FbByteAdd(d, s);
        dest[i] = d;
    }
}

/*
 * All of the disjoint composing functions

 The four entries in the first column indicate what source contributions
 come from each of the four areas of the picture -- areas covered by neither
 A nor B, areas covered only by A, areas covered only by B and finally
 areas covered by both A and B.

		Disjoint			Conjoint
		Fa		Fb		Fa		Fb
(0,0,0,0)	0		0		0		0
(0,A,0,A)	1		0		1		0
(0,0,B,B)	0		1		0		1
(0,A,B,A)	1		min((1-a)/b,1)	1		max(1-a/b,0)
(0,A,B,B)	min((1-b)/a,1)	1		max(1-b/a,0)	1
(0,0,0,A)	max(1-(1-b)/a,0) 0		min(1,b/a)	0
(0,0,0,B)	0		max(1-(1-a)/b,0) 0		min(a/b,1)
(0,A,0,0)	min(1,(1-b)/a)	0		max(1-b/a,0)	0
(0,0,B,0)	0		min(1,(1-a)/b)	0		max(1-a/b,0)
(0,0,B,A)	max(1-(1-b)/a,0) min(1,(1-a)/b)	 min(1,b/a)	max(1-a/b,0)
(0,A,0,B)	min(1,(1-b)/a)	max(1-(1-a)/b,0) max(1-b/a,0)	min(1,a/b)
(0,A,B,0)	min(1,(1-b)/a)	min(1,(1-a)/b)	max(1-b/a,0)	max(1-a/b,0)

 */

#define CombineAOut 1
#define CombineAIn  2
#define CombineBOut 4
#define CombineBIn  8

#define CombineClear	0
#define CombineA	(CombineAOut|CombineAIn)
#define CombineB	(CombineBOut|CombineBIn)
#define CombineAOver	(CombineAOut|CombineBOut|CombineAIn)
#define CombineBOver	(CombineAOut|CombineBOut|CombineBIn)
#define CombineAAtop	(CombineBOut|CombineAIn)
#define CombineBAtop	(CombineAOut|CombineBIn)
#define CombineXor	(CombineAOut|CombineBOut)

/* portion covered by a but not b */
static INLINE CARD8
fbCombineDisjointOutPart (CARD8 a, CARD8 b)
{
    /* min (1, (1-b) / a) */

    b = ~b;		    /* 1 - b */
    if (b >= a)		    /* 1 - b >= a -> (1-b)/a >= 1 */
	return 0xff;	    /* 1 */
    return FbIntDiv(b,a);   /* (1-b) / a */
}

/* portion covered by both a and b */
static INLINE CARD8
fbCombineDisjointInPart (CARD8 a, CARD8 b)
{
    /* max (1-(1-b)/a,0) */
    /*  = - min ((1-b)/a - 1, 0) */
    /*  = 1 - min (1, (1-b)/a) */

    b = ~b;		    /* 1 - b */
    if (b >= a)		    /* 1 - b >= a -> (1-b)/a >= 1 */
	return 0;	    /* 1 - 1 */
    return ~FbIntDiv(b,a);  /* 1 - (1-b) / a */
}

static FASTCALL void
fbCombineDisjointGeneralU (CARD32 *dest, const CARD32 *src, int width, CARD8 combine)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 s = src[i];
        CARD32 d = dest[i];
        CARD32 m,n,o,p;
        CARD16 Fa, Fb, t, u, v;
        CARD8 sa = s >> 24;
        CARD8 da = d >> 24;

        switch (combine & CombineA) {
        default:
            Fa = 0;
            break;
        case CombineAOut:
            Fa = fbCombineDisjointOutPart (sa, da);
            break;
        case CombineAIn:
            Fa = fbCombineDisjointInPart (sa, da);
            break;
        case CombineA:
            Fa = 0xff;
            break;
        }

        switch (combine & CombineB) {
        default:
            Fb = 0;
            break;
        case CombineBOut:
            Fb = fbCombineDisjointOutPart (da, sa);
            break;
        case CombineBIn:
            Fb = fbCombineDisjointInPart (da, sa);
            break;
        case CombineB:
            Fb = 0xff;
            break;
        }
        m = FbGen (s,d,0,Fa,Fb,t, u, v);
        n = FbGen (s,d,8,Fa,Fb,t, u, v);
        o = FbGen (s,d,16,Fa,Fb,t, u, v);
        p = FbGen (s,d,24,Fa,Fb,t, u, v);
        s = m|n|o|p;
        dest[i] = s;
    }
}

static FASTCALL void
fbCombineDisjointOverU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  s = src[i];
        CARD16  a = s >> 24;

        if (a != 0x00)
        {
            if (a != 0xff)
            {
                CARD32 d = dest[i];
                a = fbCombineDisjointOutPart (d >> 24, a);
                FbByteMulAdd(d, a, s);
                s = d;
            }
            dest[i] = s;
        }
    }
}

static FASTCALL void
fbCombineDisjointInU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineDisjointGeneralU (dest, src, width, CombineAIn);
}

static FASTCALL void
fbCombineDisjointInReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineDisjointGeneralU (dest, src, width, CombineBIn);
}

static FASTCALL void
fbCombineDisjointOutU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineDisjointGeneralU (dest, src, width, CombineAOut);
}

static FASTCALL void
fbCombineDisjointOutReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineDisjointGeneralU (dest, src, width, CombineBOut);
}

static FASTCALL void
fbCombineDisjointAtopU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineDisjointGeneralU (dest, src, width, CombineAAtop);
}

static FASTCALL void
fbCombineDisjointAtopReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineDisjointGeneralU (dest, src, width, CombineBAtop);
}

static FASTCALL void
fbCombineDisjointXorU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineDisjointGeneralU (dest, src, width, CombineXor);
}

/* portion covered by a but not b */
static INLINE CARD8
fbCombineConjointOutPart (CARD8 a, CARD8 b)
{
    /* max (1-b/a,0) */
    /* = 1-min(b/a,1) */

    /* min (1, (1-b) / a) */

    if (b >= a)		    /* b >= a -> b/a >= 1 */
	return 0x00;	    /* 0 */
    return ~FbIntDiv(b,a);   /* 1 - b/a */
}

/* portion covered by both a and b */
static INLINE CARD8
fbCombineConjointInPart (CARD8 a, CARD8 b)
{
    /* min (1,b/a) */

    if (b >= a)		    /* b >= a -> b/a >= 1 */
	return 0xff;	    /* 1 */
    return FbIntDiv(b,a);   /* b/a */
}

static FASTCALL void
fbCombineConjointGeneralU (CARD32 *dest, const CARD32 *src, int width, CARD8 combine)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32  s = src[i];
        CARD32 d = dest[i];
        CARD32  m,n,o,p;
        CARD16  Fa, Fb, t, u, v;
        CARD8 sa = s >> 24;
        CARD8 da = d >> 24;

        switch (combine & CombineA) {
        default:
            Fa = 0;
            break;
        case CombineAOut:
            Fa = fbCombineConjointOutPart (sa, da);
            break;
        case CombineAIn:
            Fa = fbCombineConjointInPart (sa, da);
            break;
        case CombineA:
            Fa = 0xff;
            break;
        }

        switch (combine & CombineB) {
        default:
            Fb = 0;
            break;
        case CombineBOut:
            Fb = fbCombineConjointOutPart (da, sa);
            break;
        case CombineBIn:
            Fb = fbCombineConjointInPart (da, sa);
            break;
        case CombineB:
            Fb = 0xff;
            break;
        }
        m = FbGen (s,d,0,Fa,Fb,t, u, v);
        n = FbGen (s,d,8,Fa,Fb,t, u, v);
        o = FbGen (s,d,16,Fa,Fb,t, u, v);
        p = FbGen (s,d,24,Fa,Fb,t, u, v);
        s = m|n|o|p;
        dest[i] = s;
    }
}

static FASTCALL void
fbCombineConjointOverU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineConjointGeneralU (dest, src, width, CombineAOver);
}


static FASTCALL void
fbCombineConjointOverReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineConjointGeneralU (dest, src, width, CombineBOver);
}


static FASTCALL void
fbCombineConjointInU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineConjointGeneralU (dest, src, width, CombineAIn);
}


static FASTCALL void
fbCombineConjointInReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineConjointGeneralU (dest, src, width, CombineBIn);
}

static FASTCALL void
fbCombineConjointOutU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineConjointGeneralU (dest, src, width, CombineAOut);
}

static FASTCALL void
fbCombineConjointOutReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineConjointGeneralU (dest, src, width, CombineBOut);
}

static FASTCALL void
fbCombineConjointAtopU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineConjointGeneralU (dest, src, width, CombineAAtop);
}

static FASTCALL void
fbCombineConjointAtopReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineConjointGeneralU (dest, src, width, CombineBAtop);
}

static FASTCALL void
fbCombineConjointXorU (CARD32 *dest, const CARD32 *src, int width)
{
    fbCombineConjointGeneralU (dest, src, width, CombineXor);
}

static CombineFuncU fbCombineFuncU[] = {
    fbCombineClear,
    fbCombineSrcU,
    NULL, /* CombineDst */
    fbCombineOverU,
    fbCombineOverReverseU,
    fbCombineInU,
    fbCombineInReverseU,
    fbCombineOutU,
    fbCombineOutReverseU,
    fbCombineAtopU,
    fbCombineAtopReverseU,
    fbCombineXorU,
    fbCombineAddU,
    fbCombineSaturateU,
    NULL,
    NULL,
    fbCombineClear,
    fbCombineSrcU,
    NULL, /* CombineDst */
    fbCombineDisjointOverU,
    fbCombineSaturateU, /* DisjointOverReverse */
    fbCombineDisjointInU,
    fbCombineDisjointInReverseU,
    fbCombineDisjointOutU,
    fbCombineDisjointOutReverseU,
    fbCombineDisjointAtopU,
    fbCombineDisjointAtopReverseU,
    fbCombineDisjointXorU,
    NULL,
    NULL,
    NULL,
    NULL,
    fbCombineClear,
    fbCombineSrcU,
    NULL, /* CombineDst */
    fbCombineConjointOverU,
    fbCombineConjointOverReverseU,
    fbCombineConjointInU,
    fbCombineConjointInReverseU,
    fbCombineConjointOutU,
    fbCombineConjointOutReverseU,
    fbCombineConjointAtopU,
    fbCombineConjointAtopReverseU,
    fbCombineConjointXorU,
};

static FASTCALL void
fbCombineMaskC (CARD32 *src, CARD32 *mask, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 a = mask[i];

        CARD32	x;
        CARD16	xa;

        if (!a)
        {
            src[i] = 0;
            continue;
        }

        x = src[i];
        if (a == 0xffffffff)
        {
            x = x >> 24;
            x |= x << 8;
            x |= x << 16;
            mask[i] = x;
            continue;
        }

        xa = x >> 24;
        FbByteMulC(x, a);
        src[i] = x;
        FbByteMul(a, xa);
        mask[i] = a;
    }
}

static FASTCALL void
fbCombineMaskValueC (CARD32 *src, const CARD32 *mask, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 a = mask[i];
        CARD32	x;

        if (!a)
        {
            src[i] = 0;
            continue;
        }

        if (a == 0xffffffff)
            continue;

        x = src[i];
        FbByteMulC(x, a);
        src[i] = x;
    }
}


static FASTCALL void
fbCombineMaskAlphaC (const CARD32 *src, CARD32 *mask, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 a = mask[i];
        CARD32	x;

        if (!a)
            continue;

        x = src[i] >> 24;
        if (x == 0xff)
            continue;
        if (a == 0xffffffff)
        {
            x = x >> 24;
            x |= x << 8;
            x |= x << 16;
            mask[i] = x;
            continue;
        }

        FbByteMul(a, x);
        mask[i] = a;
    }
}

static FASTCALL void
fbCombineClearC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    memset(dest, 0, width*sizeof(CARD32));
}

static FASTCALL void
fbCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineMaskValueC(src, mask, width);
    memcpy(dest, src, width*sizeof(CARD32));
}

static FASTCALL void
fbCombineOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    int i;
    fbCombineMaskC(src, mask, width);
    for (i = 0; i < width; ++i) {
        CARD32  s = src[i];
        CARD32  a = ~mask[i];

        if (a != 0xffffffff)
        {
            if (a)
            {
                CARD32 d = dest[i];
                FbByteMulAddC(d, a, s);
                s = d;
            }
            dest[i] = s;
        }
    }
}

static FASTCALL void
fbCombineOverReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    int i;
    fbCombineMaskValueC(src, mask, width);
    for (i = 0; i < width; ++i) {
        CARD32 d = dest[i];
        CARD32 a = ~d >> 24;

        if (a)
        {
            CARD32 s = src[i];
            if (a != 0xff)
            {
                FbByteMulAdd(s, a, d);
            }
            dest[i] = s;
        }
    }
}

static FASTCALL void
fbCombineInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    int i;
    fbCombineMaskValueC(src, mask, width);
    for (i = 0; i < width; ++i) {
        CARD32 d = dest[i];
        CARD16 a = d >> 24;
        CARD32 s = 0;
        if (a)
        {
            s = src[i];
            if (a != 0xff)
            {
                FbByteMul(s, a);
            }
        }
        dest[i] = s;
    }
}

static FASTCALL void
fbCombineInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    int i;
    fbCombineMaskAlphaC(src, mask, width);
    for (i = 0; i < width; ++i) {
        CARD32 a = mask[i];

        if (a != 0xffffffff)
        {
            CARD32 d = 0;
            if (a)
            {
                d = dest[i];
                FbByteMulC(d, a);
            }
            dest[i] = d;
        }
    }
}

static FASTCALL void
fbCombineOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    int i;
    fbCombineMaskValueC(src, mask, width);
    for (i = 0; i < width; ++i) {
        CARD32 d = dest[i];
        CARD16 a = ~d >> 24;
        CARD32 s = 0;
        if (a)
        {
            s = src[i];
            if (a != 0xff)
            {
                FbByteMul(s, a);
            }
        }
        dest[i] = s;
    }
}

static FASTCALL void
fbCombineOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    int i;
    fbCombineMaskAlphaC(src, mask, width);
    for (i = 0; i < width; ++i) {
        CARD32 a = ~mask[i];

        if (a != 0xffffffff)
        {
            CARD32 d = 0;
            if (a)
            {
                d = dest[i];
                FbByteMulC(d, a);
            }
            dest[i] = d;
        }
    }
}

static FASTCALL void
fbCombineAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    int i;
    fbCombineMaskC(src, mask, width);
    for (i = 0; i < width; ++i) {
        CARD32 d = dest[i];
        CARD32 s = src[i];
        CARD32 ad = ~mask[i];
        CARD16 as = d >> 24;
        FbByteAddMulC(d, ad, s, as);
        dest[i] = d;
    }
}

static FASTCALL void
fbCombineAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    int i;
    fbCombineMaskC(src, mask, width);
    for (i = 0; i < width; ++i) {

        CARD32 d = dest[i];
        CARD32 s = src[i];
        CARD32 ad = mask[i];
        CARD16 as = ~d >> 24;
        FbByteAddMulC(d, ad, s, as);
        dest[i] = d;
    }
}

static FASTCALL void
fbCombineXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    int i;
    fbCombineMaskC(src, mask, width);
    for (i = 0; i < width; ++i) {
        CARD32 d = dest[i];
        CARD32 s = src[i];
        CARD32 ad = ~mask[i];
        CARD16 as = ~d >> 24;
        FbByteAddMulC(d, ad, s, as);
        dest[i] = d;
    }
}

static FASTCALL void
fbCombineAddC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    int i;
    fbCombineMaskValueC(src, mask, width);
    for (i = 0; i < width; ++i) {
        CARD32 s = src[i];
        CARD32 d = dest[i];
        FbByteAdd(d, s);
        dest[i] = d;
    }
}

static FASTCALL void
fbCombineSaturateC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    int i;
    fbCombineMaskC(src, mask, width);
    for (i = 0; i < width; ++i) {
        CARD32  s, d;
        CARD16  sa, sr, sg, sb, da;
        CARD16  t, u, v;
        CARD32  m,n,o,p;

        d = dest[i];
        s = src[i];
        sa = (mask[i] >> 24);
        sr = (mask[i] >> 16) & 0xff;
        sg = (mask[i] >>  8) & 0xff;
        sb = (mask[i]      ) & 0xff;
        da = ~d >> 24;

        if (sb <= da)
            m = FbAdd(s,d,0,t);
        else
            m = FbGen (s, d, 0, (da << 8) / sb, 0xff, t, u, v);

        if (sg <= da)
            n = FbAdd(s,d,8,t);
        else
            n = FbGen (s, d, 8, (da << 8) / sg, 0xff, t, u, v);

        if (sr <= da)
            o = FbAdd(s,d,16,t);
        else
            o = FbGen (s, d, 16, (da << 8) / sr, 0xff, t, u, v);

        if (sa <= da)
            p = FbAdd(s,d,24,t);
        else
            p = FbGen (s, d, 24, (da << 8) / sa, 0xff, t, u, v);

        dest[i] = m|n|o|p;
    }
}

static FASTCALL void
fbCombineDisjointGeneralC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width, CARD8 combine)
{
    int i;
    fbCombineMaskC(src, mask, width);
    for (i = 0; i < width; ++i) {
        CARD32  s, d;
        CARD32  m,n,o,p;
        CARD32  Fa, Fb;
        CARD16  t, u, v;
        CARD32  sa;
        CARD8   da;

        s = src[i];
        sa = mask[i];
        d = dest[i];
        da = d >> 24;

        switch (combine & CombineA) {
        default:
            Fa = 0;
            break;
        case CombineAOut:
            m = fbCombineDisjointOutPart ((CARD8) (sa >> 0), da);
            n = fbCombineDisjointOutPart ((CARD8) (sa >> 8), da) << 8;
            o = fbCombineDisjointOutPart ((CARD8) (sa >> 16), da) << 16;
            p = fbCombineDisjointOutPart ((CARD8) (sa >> 24), da) << 24;
            Fa = m|n|o|p;
            break;
        case CombineAIn:
            m = fbCombineDisjointInPart ((CARD8) (sa >> 0), da);
            n = fbCombineDisjointInPart ((CARD8) (sa >> 8), da) << 8;
            o = fbCombineDisjointInPart ((CARD8) (sa >> 16), da) << 16;
            p = fbCombineDisjointInPart ((CARD8) (sa >> 24), da) << 24;
            Fa = m|n|o|p;
            break;
        case CombineA:
            Fa = 0xffffffff;
            break;
        }

        switch (combine & CombineB) {
        default:
            Fb = 0;
            break;
        case CombineBOut:
            m = fbCombineDisjointOutPart (da, (CARD8) (sa >> 0));
            n = fbCombineDisjointOutPart (da, (CARD8) (sa >> 8)) << 8;
            o = fbCombineDisjointOutPart (da, (CARD8) (sa >> 16)) << 16;
            p = fbCombineDisjointOutPart (da, (CARD8) (sa >> 24)) << 24;
            Fb = m|n|o|p;
            break;
        case CombineBIn:
            m = fbCombineDisjointInPart (da, (CARD8) (sa >> 0));
            n = fbCombineDisjointInPart (da, (CARD8) (sa >> 8)) << 8;
            o = fbCombineDisjointInPart (da, (CARD8) (sa >> 16)) << 16;
            p = fbCombineDisjointInPart (da, (CARD8) (sa >> 24)) << 24;
            Fb = m|n|o|p;
            break;
        case CombineB:
            Fb = 0xffffffff;
            break;
        }
        m = FbGen (s,d,0,FbGet8(Fa,0),FbGet8(Fb,0),t, u, v);
        n = FbGen (s,d,8,FbGet8(Fa,8),FbGet8(Fb,8),t, u, v);
        o = FbGen (s,d,16,FbGet8(Fa,16),FbGet8(Fb,16),t, u, v);
        p = FbGen (s,d,24,FbGet8(Fa,24),FbGet8(Fb,24),t, u, v);
        s = m|n|o|p;
        dest[i] = s;
    }
}

static FASTCALL void
fbCombineDisjointOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineDisjointGeneralC (dest, src, mask, width, CombineAOver);
}

static FASTCALL void
fbCombineDisjointInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineDisjointGeneralC (dest, src, mask, width, CombineAIn);
}

static FASTCALL void
fbCombineDisjointInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineDisjointGeneralC (dest, src, mask, width, CombineBIn);
}

static FASTCALL void
fbCombineDisjointOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineDisjointGeneralC (dest, src, mask, width, CombineAOut);
}

static FASTCALL void
fbCombineDisjointOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineDisjointGeneralC (dest, src, mask, width, CombineBOut);
}

static FASTCALL void
fbCombineDisjointAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineDisjointGeneralC (dest, src, mask, width, CombineAAtop);
}

static FASTCALL void
fbCombineDisjointAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineDisjointGeneralC (dest, src, mask, width, CombineBAtop);
}

static FASTCALL void
fbCombineDisjointXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineDisjointGeneralC (dest, src, mask, width, CombineXor);
}

static FASTCALL void
fbCombineConjointGeneralC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width, CARD8 combine)
{
    int i;
    fbCombineMaskC(src, mask, width);
    for (i = 0; i < width; ++i) {
        CARD32  s, d;
        CARD32  m,n,o,p;
        CARD32  Fa, Fb;
        CARD16  t, u, v;
        CARD32  sa;
        CARD8   da;

        s = src[i];
        sa = mask[i];
        d = dest[i];
        da = d >> 24;

        switch (combine & CombineA) {
        default:
            Fa = 0;
            break;
        case CombineAOut:
            m = fbCombineConjointOutPart ((CARD8) (sa >> 0), da);
            n = fbCombineConjointOutPart ((CARD8) (sa >> 8), da) << 8;
            o = fbCombineConjointOutPart ((CARD8) (sa >> 16), da) << 16;
            p = fbCombineConjointOutPart ((CARD8) (sa >> 24), da) << 24;
            Fa = m|n|o|p;
            break;
        case CombineAIn:
            m = fbCombineConjointInPart ((CARD8) (sa >> 0), da);
            n = fbCombineConjointInPart ((CARD8) (sa >> 8), da) << 8;
            o = fbCombineConjointInPart ((CARD8) (sa >> 16), da) << 16;
            p = fbCombineConjointInPart ((CARD8) (sa >> 24), da) << 24;
            Fa = m|n|o|p;
            break;
        case CombineA:
            Fa = 0xffffffff;
            break;
        }

        switch (combine & CombineB) {
        default:
            Fb = 0;
            break;
        case CombineBOut:
            m = fbCombineConjointOutPart (da, (CARD8) (sa >> 0));
            n = fbCombineConjointOutPart (da, (CARD8) (sa >> 8)) << 8;
            o = fbCombineConjointOutPart (da, (CARD8) (sa >> 16)) << 16;
            p = fbCombineConjointOutPart (da, (CARD8) (sa >> 24)) << 24;
            Fb = m|n|o|p;
            break;
        case CombineBIn:
            m = fbCombineConjointInPart (da, (CARD8) (sa >> 0));
            n = fbCombineConjointInPart (da, (CARD8) (sa >> 8)) << 8;
            o = fbCombineConjointInPart (da, (CARD8) (sa >> 16)) << 16;
            p = fbCombineConjointInPart (da, (CARD8) (sa >> 24)) << 24;
            Fb = m|n|o|p;
            break;
        case CombineB:
            Fb = 0xffffffff;
            break;
        }
        m = FbGen (s,d,0,FbGet8(Fa,0),FbGet8(Fb,0),t, u, v);
        n = FbGen (s,d,8,FbGet8(Fa,8),FbGet8(Fb,8),t, u, v);
        o = FbGen (s,d,16,FbGet8(Fa,16),FbGet8(Fb,16),t, u, v);
        p = FbGen (s,d,24,FbGet8(Fa,24),FbGet8(Fb,24),t, u, v);
        s = m|n|o|p;
        dest[i] = s;
    }
}

static FASTCALL void
fbCombineConjointOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineConjointGeneralC (dest, src, mask, width, CombineAOver);
}

static FASTCALL void
fbCombineConjointOverReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineConjointGeneralC (dest, src, mask, width, CombineBOver);
}

static FASTCALL void
fbCombineConjointInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineConjointGeneralC (dest, src, mask, width, CombineAIn);
}

static FASTCALL void
fbCombineConjointInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineConjointGeneralC (dest, src, mask, width, CombineBIn);
}

static FASTCALL void
fbCombineConjointOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineConjointGeneralC (dest, src, mask, width, CombineAOut);
}

static FASTCALL void
fbCombineConjointOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineConjointGeneralC (dest, src, mask, width, CombineBOut);
}

static FASTCALL void
fbCombineConjointAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineConjointGeneralC (dest, src, mask, width, CombineAAtop);
}

static FASTCALL void
fbCombineConjointAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineConjointGeneralC (dest, src, mask, width, CombineBAtop);
}

static FASTCALL void
fbCombineConjointXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    fbCombineConjointGeneralC (dest, src, mask, width, CombineXor);
}

static CombineFuncC fbCombineFuncC[] = {
    fbCombineClearC,
    fbCombineSrcC,
    NULL, /* Dest */
    fbCombineOverC,
    fbCombineOverReverseC,
    fbCombineInC,
    fbCombineInReverseC,
    fbCombineOutC,
    fbCombineOutReverseC,
    fbCombineAtopC,
    fbCombineAtopReverseC,
    fbCombineXorC,
    fbCombineAddC,
    fbCombineSaturateC,
    NULL,
    NULL,
    fbCombineClearC,	    /* 0x10 */
    fbCombineSrcC,
    NULL, /* Dest */
    fbCombineDisjointOverC,
    fbCombineSaturateC, /* DisjointOverReverse */
    fbCombineDisjointInC,
    fbCombineDisjointInReverseC,
    fbCombineDisjointOutC,
    fbCombineDisjointOutReverseC,
    fbCombineDisjointAtopC,
    fbCombineDisjointAtopReverseC,
    fbCombineDisjointXorC,  /* 0x1b */
    NULL,
    NULL,
    NULL,
    NULL,
    fbCombineClearC,
    fbCombineSrcC,
    NULL, /* Dest */
    fbCombineConjointOverC,
    fbCombineConjointOverReverseC,
    fbCombineConjointInC,
    fbCombineConjointInReverseC,
    fbCombineConjointOutC,
    fbCombineConjointOutReverseC,
    fbCombineConjointAtopC,
    fbCombineConjointAtopReverseC,
    fbCombineConjointXorC,
};


FbComposeFunctions composeFunctions = {
    fbCombineFuncU,
    fbCombineFuncC,
    fbCombineMaskU
};


static void fbFetchSolid(PicturePtr pict, int x, int y, int width, CARD32 *buffer)
{
    FbBits *bits;
    FbStride stride;
    int bpp;
    int xoff, yoff;
    CARD32 color;
    CARD32 *end;
    fetchPixelProc fetch = fetchPixelProcForPicture(pict);
    miIndexedPtr indexed = (miIndexedPtr) pict->pFormat->index.devPrivate;

    fbGetDrawable (pict->pDrawable, bits, stride, bpp, xoff, yoff);
    bits += yoff*stride + (xoff*bpp >> FB_SHIFT);

    color = fetch(bits, 0, indexed);

    end = buffer + width;
    while (buffer < end)
        *buffer++ = color;
}

static void fbFetch(PicturePtr pict, int x, int y, int width, CARD32 *buffer)
{
    FbBits *bits;
    FbStride stride;
    int bpp;
    int xoff, yoff;
    fetchProc fetch = fetchProcForPicture(pict);
    miIndexedPtr indexed = (miIndexedPtr) pict->pFormat->index.devPrivate;

    fbGetDrawable (pict->pDrawable, bits, stride, bpp, xoff, yoff);
    x += xoff;
    y += yoff;

    bits += y*stride;

    fetch(bits, x, width, buffer, indexed);
}

#define MOD(a,b) ((a) < 0 ? ((b) - ((-(a) - 1) % (b))) - 1 : (a) % (b))
#define DIV(a,b) ((((a) < 0) == ((b) < 0)) ? (a) / (b) :\
        ((a) - (b) + 1 - (((b) < 0) << 1)) / (b))


static CARD32 gradientPixel(const SourcePictPtr pGradient, xFixed_48_16 pos, unsigned int spread)
{
    int ipos = (pos * PICT_GRADIENT_STOPTABLE_SIZE - 1) >> 16;

    /* calculate the actual offset. */
    if (ipos < 0 || ipos >= PICT_GRADIENT_STOPTABLE_SIZE) {
        if (pGradient->type == SourcePictTypeConical || spread == RepeatNormal) {
            ipos = ipos % PICT_GRADIENT_STOPTABLE_SIZE;
            ipos = ipos < 0 ? PICT_GRADIENT_STOPTABLE_SIZE + ipos : ipos;

        } else if (spread == RepeatReflect) {
            const int limit = PICT_GRADIENT_STOPTABLE_SIZE * 2 - 1;
            ipos = ipos % limit;
            ipos = ipos < 0 ? limit + ipos : ipos;
            ipos = ipos >= PICT_GRADIENT_STOPTABLE_SIZE ? limit - ipos : ipos;

        } else if (spread == RepeatPad) {
            if (ipos < 0)
                ipos = 0;
            else if (ipos >= PICT_GRADIENT_STOPTABLE_SIZE)
                ipos = PICT_GRADIENT_STOPTABLE_SIZE-1;
        } else { /* RepeatNone */
            return 0;
        }
    }

    assert(ipos >= 0);
    assert(ipos < PICT_GRADIENT_STOPTABLE_SIZE);

    return pGradient->linear.colorTable[ipos];
}

static void fbFetchSourcePict(PicturePtr pict, int x, int y, int width, CARD32 *buffer)
{
    SourcePictPtr pGradient = pict->pSourcePict;
    CARD32 *end = buffer + width;

    if (pGradient->type == SourcePictTypeSolidFill) {
        register CARD32 color = pGradient->solidFill.color;
        while (buffer < end) {
            *buffer++ = color;
        }
    } else if (pGradient->type == SourcePictTypeLinear) {
        PictVector v, unit;
        xFixed_32_32 l;
        xFixed_48_16 dx, dy, a, b, off;

        v.vector[0] = IntToxFixed(x);
        v.vector[1] = IntToxFixed(y);
        v.vector[2] = xFixed1;
        if (pict->transform) {
            if (!PictureTransformPoint3d (pict->transform, &v))
                return;
            unit.vector[0] = pict->transform->matrix[0][0];
            unit.vector[1] = pict->transform->matrix[1][0];
            unit.vector[2] = pict->transform->matrix[2][0];
        } else {
            unit.vector[0] = xFixed1;
            unit.vector[1] = 0;
            unit.vector[2] = 0;
        }

        dx = pGradient->linear.p2.x - pGradient->linear.p1.x;
        dy = pGradient->linear.p2.y - pGradient->linear.p1.y;
        l = dx*dx + dy*dy;
        if (l != 0) {
            a = (dx << 32) / l;
            b = (dy << 32) / l;
            off = (-a*pGradient->linear.p1.x - b*pGradient->linear.p1.y)>>16;
        }
        if (l == 0  || (unit.vector[2] == 0 && v.vector[2] == xFixed1)) {
            xFixed_48_16 inc, t;
            /* affine transformation only */
            if (l == 0) {
                t = 0;
                inc = 0;
            } else {
                t = ((a*v.vector[0] + b*v.vector[1]) >> 16) + off;
                inc = (a * unit.vector[0] + b * unit.vector[1]) >> 16;
            }
            while (buffer < end) {
                *buffer++ = gradientPixel(pGradient, t, pict->repeatType);
                t += inc;
            }
        } else {
            /* projective transformation */
            while (buffer < end) {
                xFixed_48_16 t;
                if (v.vector[2] == 0) {
                    t = 0;
                } else {
                    xFixed_48_16 x, y;
                    x = ((xFixed_48_16)v.vector[0] << 16) / v.vector[2];
                    y = ((xFixed_48_16)v.vector[1] << 16) / v.vector[2];
                    t = ((a*x + b*y) >> 16) + off;
                }
                *buffer++ = gradientPixel(pGradient, t, pict->repeatType);
                v.vector[0] += unit.vector[0];
                v.vector[1] += unit.vector[1];
                v.vector[2] += unit.vector[2];
            }
        }
    } else {
        /* radial or conical */
        Bool affine = TRUE;
        double cx = 1.;
        double cy = 0.;
        double cz = 0.;
        double rx = x;
        double ry = y;
        double rz = 1.;

        if (pict->transform) {
            PictVector v;
            v.vector[0] = IntToxFixed(x);
            v.vector[1] = IntToxFixed(y);
            v.vector[2] = xFixed1;
            if (!PictureTransformPoint3d (pict->transform, &v))
                return;

            cx = pict->transform->matrix[0][0]/65536.;
            cy = pict->transform->matrix[1][0]/65536.;
            cz = pict->transform->matrix[2][0]/65536.;
            rx = v.vector[0]/65536.;
            ry = v.vector[1]/65536.;
            rz = v.vector[2]/65536.;
            affine = pict->transform->matrix[2][0] == 0 && v.vector[2] == xFixed1;
        }

        if (pGradient->type == SourcePictTypeRadial) {
            if (affine) {
                rx -= pGradient->radial.fx;
                ry -= pGradient->radial.fy;

                while (buffer < end) {
                    double b = 2*(rx*pGradient->radial.dx + ry*pGradient->radial.dy);
                    double c = -(rx*rx + ry*ry);
                    double det = (b * b) - (4 * pGradient->radial.a * c);
                    double s = (-b + sqrt(det))/(2. * pGradient->radial.a);
                    *buffer = gradientPixel(pGradient,
                                            (xFixed_48_16)((s*pGradient->radial.m + pGradient->radial.b)*65536),
                                            pict->repeatType);
                    ++buffer;
                    rx += cx;
                    ry += cy;
                }
            } else {
                while (buffer < end) {
                    double x, y;
                    double b, c, det, s;
                    if (rz != 0) {
                        x = rx/rz;
                        y = ry/rz;
                    } else {
                        x = y = 0.;
                    }
                    x -= pGradient->radial.fx;
                    y -= pGradient->radial.fy;
                    b = 2*(x*pGradient->radial.dx + y*pGradient->radial.dy);
                    c = -(x*x + y*y);
                    det = (b * b) - (4 * pGradient->radial.a * c);
                    s = (-b + sqrt(det))/(2. * pGradient->radial.a);
                    *buffer = gradientPixel(pGradient,
                                            (xFixed_48_16)((s*pGradient->radial.m + pGradient->radial.b)*65536),
                                            pict->repeatType);
                    ++buffer;
                    rx += cx;
                    ry += cy;
                    rz += cz;
                }
            }
        } else /* SourcePictTypeConical */ {
            double a = pGradient->conical.angle/(180.*65536);
            if (affine) {
                rx -= pGradient->conical.center.x/65536.;
                ry -= pGradient->conical.center.y/65536.;

                while (buffer < end) {
                    double angle = atan2(ry, rx) + a;
                    *buffer = gradientPixel(pGradient, (xFixed_48_16) (angle * (65536. / (2*M_PI))),
                                            pict->repeatType);
                    ++buffer;
                    rx += cx;
                    ry += cy;
                }
            } else {

                while (buffer < end) {
                    double x, y, angle;
                    if (rz != 0) {
                        x = rx/rz;
                        y = ry/rz;
                    } else {
                        x = y = 0.;
                    }
                    x -= pGradient->conical.center.x/65536.;
                    y -= pGradient->conical.center.y/65536.;
                    angle = atan2(y, x) + a;
                    *buffer = gradientPixel(pGradient, (xFixed_48_16) (angle * (65536. / (2*M_PI))),
                                            pict->repeatType);
                    ++buffer;
                    rx += cx;
                    ry += cy;
                    rz += cz;
                }
            }
        }
    }
}



static void fbFetchTransformed(PicturePtr pict, int x, int y, int width, CARD32 *buffer)
{
    FbBits     *bits;
    FbStride    stride;
    int         bpp;
    int         xoff, yoff, dx, dy;
    fetchPixelProc   fetch;
    PictVector	v;
    PictVector  unit;
    int         i;
    BoxRec	box;
    miIndexedPtr indexed = (miIndexedPtr) pict->pFormat->index.devPrivate;
    Bool affine = TRUE;

    fetch = fetchPixelProcForPicture(pict);

    fbGetDrawable(pict->pDrawable, bits, stride, bpp, xoff, yoff);
    x += xoff;
    y += yoff;

    dx = pict->pDrawable->x;
    dy = pict->pDrawable->y;

    v.vector[0] = IntToxFixed(x - dx);
    v.vector[1] = IntToxFixed(y - dy);
    v.vector[2] = xFixed1;

    /* when using convolution filters one might get here without a transform */
    if (pict->transform) {
        if (!PictureTransformPoint3d (pict->transform, &v))
            return;
        unit.vector[0] = pict->transform->matrix[0][0];
        unit.vector[1] = pict->transform->matrix[1][0];
        unit.vector[2] = pict->transform->matrix[2][0];
        affine = v.vector[2] == xFixed1 && unit.vector[2] == 0;
    } else {
        unit.vector[0] = xFixed1;
        unit.vector[1] = 0;
        unit.vector[2] = 0;
    }

    if (pict->filter == PictFilterNearest)
    {
        if (pict->repeatType == RepeatNormal) {
            if (RegionNumRects(pict->pCompositeClip) == 1) {
                for (i = 0; i < width; ++i) {
                    if (!v.vector[2]) {
                        buffer[i] = 0;
                    } else {
                        if (!affine) {
                            y = MOD(DIV(v.vector[1],v.vector[2]), pict->pDrawable->height);
                            x = MOD(DIV(v.vector[0],v.vector[2]), pict->pDrawable->width);
                        } else {
                            y = MOD(v.vector[1]>>16, pict->pDrawable->height);
                            x = MOD(v.vector[0]>>16, pict->pDrawable->width);
                        }
                        buffer[i] = fetch(bits + (y + dy)*stride, x + dx, indexed);
                    }
                    v.vector[0] += unit.vector[0];
                    v.vector[1] += unit.vector[1];
                    v.vector[2] += unit.vector[2];
                }
            } else {
                for (i = 0; i < width; ++i) {
                    if (!v.vector[2]) {
                        buffer[i] = 0;
                    } else {
                        if (!affine) {
                            y = MOD(DIV(v.vector[1],v.vector[2]), pict->pDrawable->height);
                            x = MOD(DIV(v.vector[0],v.vector[2]), pict->pDrawable->width);
                        } else {
                            y = MOD(v.vector[1]>>16, pict->pDrawable->height);
                            x = MOD(v.vector[0]>>16, pict->pDrawable->width);
                        }
                        if (RegionContainsPoint(pict->pCompositeClip, x + dx, y + dy, &box))
                            buffer[i] = fetch(bits + (y + dy)*stride, x + dx, indexed);
                        else
                            buffer[i] = 0;
                    }
                    v.vector[0] += unit.vector[0];
                    v.vector[1] += unit.vector[1];
                    v.vector[2] += unit.vector[2];
                }
            }
        } else {
            if (RegionNumRects(pict->pCompositeClip) == 1) {
                box = pict->pCompositeClip->extents;
                for (i = 0; i < width; ++i) {
                    if (!v.vector[2]) {
                        buffer[i] = 0;
                    } else {
                        if (!affine) {
                            y = DIV(v.vector[1],v.vector[2]);
                            x = DIV(v.vector[0],v.vector[2]);
                        } else {
                            y = v.vector[1]>>16;
                            x = v.vector[0]>>16;
                        }
                        buffer[i] = ((x < box.x1-dx) | (x >= box.x2-dx) | (y < box.y1-dy) | (y >= box.y2-dy)) ?
                                    0 : fetch(bits + (y + dy)*stride, x + dx, indexed);
                    }
                    v.vector[0] += unit.vector[0];
                    v.vector[1] += unit.vector[1];
                    v.vector[2] += unit.vector[2];
                }
            } else {
                for (i = 0; i < width; ++i) {
                    if (!v.vector[2]) {
                        buffer[i] = 0;
                    } else {
                        if (!affine) {
                            y = DIV(v.vector[1],v.vector[2]);
                            x = DIV(v.vector[0],v.vector[2]);
                        } else {
                            y = v.vector[1]>>16;
                            x = v.vector[0]>>16;
                        }
                        if (RegionContainsPoint(pict->pCompositeClip, x + dx, y + dy, &box))
                            buffer[i] = fetch(bits + (y + dy)*stride, x + dx, indexed);
                        else
                            buffer[i] = 0;
                    }
                    v.vector[0] += unit.vector[0];
                    v.vector[1] += unit.vector[1];
                    v.vector[2] += unit.vector[2];
                }
            }
        }
    } else if (pict->filter == PictFilterBilinear) {
        if (pict->repeatType == RepeatNormal) {
            if (RegionNumRects(pict->pCompositeClip) == 1) {
                for (i = 0; i < width; ++i) {
                    if (!v.vector[2]) {
                        buffer[i] = 0;
                    } else {
                        int x1, x2, y1, y2, distx, idistx, disty, idisty;
                        FbBits *b;
                        CARD32 tl, tr, bl, br, r;
                        CARD32 ft, fb;

                        if (!affine) {
                            xFixed_48_16 div;
                            div = ((xFixed_48_16)v.vector[0] << 16)/v.vector[2];
                            x1 = div >> 16;
                            distx = ((xFixed)div >> 8) & 0xff;
                            div = ((xFixed_48_16)v.vector[1] << 16)/v.vector[2];
                            y1 = div >> 16;
                            disty = ((xFixed)div >> 8) & 0xff;
                        } else {
                            x1 = v.vector[0] >> 16;
                            distx = (v.vector[0] >> 8) & 0xff;
                            y1 = v.vector[1] >> 16;
                            disty = (v.vector[1] >> 8) & 0xff;
                        }
                        x2 = x1 + 1;
                        y2 = y1 + 1;

                        idistx = 256 - distx;
                        idisty = 256 - disty;

                        x1 = MOD (x1, pict->pDrawable->width);
                        x2 = MOD (x2, pict->pDrawable->width);
                        y1 = MOD (y1, pict->pDrawable->height);
                        y2 = MOD (y2, pict->pDrawable->height);

                        b = bits + (y1 + dy)*stride;

                        tl = fetch(b, x1 + dx, indexed);
                        tr = fetch(b, x2 + dx, indexed);
                        b = bits + (y2 + dy)*stride;
                        bl = fetch(b, x1 + dx, indexed);
                        br = fetch(b, x2 + dx, indexed);

                        ft = FbGet8(tl,0) * idistx + FbGet8(tr,0) * distx;
                        fb = FbGet8(bl,0) * idistx + FbGet8(br,0) * distx;
                        r = (((ft * idisty + fb * disty) >> 16) & 0xff);
                        ft = FbGet8(tl,8) * idistx + FbGet8(tr,8) * distx;
                        fb = FbGet8(bl,8) * idistx + FbGet8(br,8) * distx;
                        r |= (((ft * idisty + fb * disty) >> 8) & 0xff00);
                        ft = FbGet8(tl,16) * idistx + FbGet8(tr,16) * distx;
                        fb = FbGet8(bl,16) * idistx + FbGet8(br,16) * distx;
                        r |= (((ft * idisty + fb * disty)) & 0xff0000);
                        ft = FbGet8(tl,24) * idistx + FbGet8(tr,24) * distx;
                        fb = FbGet8(bl,24) * idistx + FbGet8(br,24) * distx;
                        r |= (((ft * idisty + fb * disty) << 8) & 0xff000000);
                        buffer[i] = r;
                    }
                    v.vector[0] += unit.vector[0];
                    v.vector[1] += unit.vector[1];
                    v.vector[2] += unit.vector[2];
                }
            } else {
                for (i = 0; i < width; ++i) {
                    if (!v.vector[2]) {
                        buffer[i] = 0;
                    } else {
                        int x1, x2, y1, y2, distx, idistx, disty, idisty;
                        FbBits *b;
                        CARD32 tl, tr, bl, br, r;
                        CARD32 ft, fb;

                        if (!affine) {
                            xFixed_48_16 div;
                            div = ((xFixed_48_16)v.vector[0] << 16)/v.vector[2];
                            x1 = div >> 16;
                            distx = ((xFixed)div >> 8) & 0xff;
                            div = ((xFixed_48_16)v.vector[1] << 16)/v.vector[2];
                            y1 = div >> 16;
                            disty = ((xFixed)div >> 8) & 0xff;
                        } else {
                            x1 = v.vector[0] >> 16;
                            distx = (v.vector[0] >> 8) & 0xff;
                            y1 = v.vector[1] >> 16;
                            disty = (v.vector[1] >> 8) & 0xff;
                        }
                        x2 = x1 + 1;
                        y2 = y1 + 1;

                        idistx = 256 - distx;
                        idisty = 256 - disty;

                        x1 = MOD (x1, pict->pDrawable->width);
                        x2 = MOD (x2, pict->pDrawable->width);
                        y1 = MOD (y1, pict->pDrawable->height);
                        y2 = MOD (y2, pict->pDrawable->height);

                        b = bits + (y1 + dy)*stride;

                        tl = RegionContainsPoint(pict->pCompositeClip, x1 + dx, y1 + dy, &box)
                             ? fetch(b, x1 + dx, indexed) : 0;
                        tr = RegionContainsPoint(pict->pCompositeClip, x2 + dx, y1 + dy, &box)
                             ? fetch(b, x2 + dx, indexed) : 0;
                        b = bits + (y2 + dy)*stride;
                        bl = RegionContainsPoint(pict->pCompositeClip, x1 + dx, y2 + dy, &box)
                             ? fetch(b, x1 + dx, indexed) : 0;
                        br = RegionContainsPoint(pict->pCompositeClip, x2 + dx, y2 + dy, &box)
                             ? fetch(b, x2 + dx, indexed) : 0;

                        ft = FbGet8(tl,0) * idistx + FbGet8(tr,0) * distx;
                        fb = FbGet8(bl,0) * idistx + FbGet8(br,0) * distx;
                        r = (((ft * idisty + fb * disty) >> 16) & 0xff);
                        ft = FbGet8(tl,8) * idistx + FbGet8(tr,8) * distx;
                        fb = FbGet8(bl,8) * idistx + FbGet8(br,8) * distx;
                        r |= (((ft * idisty + fb * disty) >> 8) & 0xff00);
                        ft = FbGet8(tl,16) * idistx + FbGet8(tr,16) * distx;
                        fb = FbGet8(bl,16) * idistx + FbGet8(br,16) * distx;
                        r |= (((ft * idisty + fb * disty)) & 0xff0000);
                        ft = FbGet8(tl,24) * idistx + FbGet8(tr,24) * distx;
                        fb = FbGet8(bl,24) * idistx + FbGet8(br,24) * distx;
                        r |= (((ft * idisty + fb * disty) << 8) & 0xff000000);
                        buffer[i] = r;
                    }
                    v.vector[0] += unit.vector[0];
                    v.vector[1] += unit.vector[1];
                    v.vector[2] += unit.vector[2];
                }
            }
        } else {
            if (RegionNumRects(pict->pCompositeClip) == 1) {
                box = pict->pCompositeClip->extents;
                for (i = 0; i < width; ++i) {
                    if (!v.vector[2]) {
                        buffer[i] = 0;
                    } else {
                        int x1, x2, y1, y2, distx, idistx, disty, idisty, x_off;
                        FbBits *b;
                        CARD32 tl, tr, bl, br, r;
                        Bool x1_out, x2_out, y1_out, y2_out;
                        CARD32 ft, fb;

                        if (!affine) {
                            xFixed_48_16 div;
                            div = ((xFixed_48_16)v.vector[0] << 16)/v.vector[2];
                            x1 = div >> 16;
                            distx = ((xFixed)div >> 8) & 0xff;
                            div = ((xFixed_48_16)v.vector[1] << 16)/v.vector[2];
                            y1 = div >> 16;
                            disty = ((xFixed)div >> 8) & 0xff;
                        } else {
                            x1 = v.vector[0] >> 16;
                            distx = (v.vector[0] >> 8) & 0xff;
                            y1 = v.vector[1] >> 16;
                            disty = (v.vector[1] >> 8) & 0xff;
                        }
                        x2 = x1 + 1;
                        y2 = y1 + 1;

                        idistx = 256 - distx;
                        idisty = 256 - disty;

                        b = bits + (y1 + dy)*stride;
                        x_off = x1 + dx;

                        x1_out = (x1 < box.x1-dx) | (x1 >= box.x2-dx);
                        x2_out = (x2 < box.x1-dx) | (x2 >= box.x2-dx);
                        y1_out = (y1 < box.y1-dy) | (y1 >= box.y2-dy);
                        y2_out = (y2 < box.y1-dy) | (y2 >= box.y2-dy);

                        tl = x1_out|y1_out ? 0 : fetch(b, x_off, indexed);
                        tr = x2_out|y1_out ? 0 : fetch(b, x_off + 1, indexed);
                        b += stride;
                        bl = x1_out|y2_out ? 0 : fetch(b, x_off, indexed);
                        br = x2_out|y2_out ? 0 : fetch(b, x_off + 1, indexed);

                        ft = FbGet8(tl,0) * idistx + FbGet8(tr,0) * distx;
                        fb = FbGet8(bl,0) * idistx + FbGet8(br,0) * distx;
                        r = (((ft * idisty + fb * disty) >> 16) & 0xff);
                        ft = FbGet8(tl,8) * idistx + FbGet8(tr,8) * distx;
                        fb = FbGet8(bl,8) * idistx + FbGet8(br,8) * distx;
                        r |= (((ft * idisty + fb * disty) >> 8) & 0xff00);
                        ft = FbGet8(tl,16) * idistx + FbGet8(tr,16) * distx;
                        fb = FbGet8(bl,16) * idistx + FbGet8(br,16) * distx;
                        r |= (((ft * idisty + fb * disty)) & 0xff0000);
                        ft = FbGet8(tl,24) * idistx + FbGet8(tr,24) * distx;
                        fb = FbGet8(bl,24) * idistx + FbGet8(br,24) * distx;
                        r |= (((ft * idisty + fb * disty) << 8) & 0xff000000);
                        buffer[i] = r;
                    }
                    v.vector[0] += unit.vector[0];
                    v.vector[1] += unit.vector[1];
                    v.vector[2] += unit.vector[2];
                }
            } else {
                for (i = 0; i < width; ++i) {
                    if (!v.vector[2]) {
                        buffer[i] = 0;
                    } else {
                        int x1, x2, y1, y2, distx, idistx, disty, idisty, x_off;
                        FbBits *b;
                        CARD32 tl, tr, bl, br, r;
                        CARD32 ft, fb;

                        if (!affine) {
                            xFixed_48_16 div;
                            div = ((xFixed_48_16)v.vector[0] << 16)/v.vector[2];
                            x1 = div >> 16;
                            distx = ((xFixed)div >> 8) & 0xff;
                            div = ((xFixed_48_16)v.vector[1] << 16)/v.vector[2];
                            y1 = div >> 16;
                            disty = ((xFixed)div >> 8) & 0xff;
                        } else {
                            x1 = v.vector[0] >> 16;
                            distx = (v.vector[0] >> 8) & 0xff;
                            y1 = v.vector[1] >> 16;
                            disty = (v.vector[1] >> 8) & 0xff;
                        }
                        x2 = x1 + 1;
                        y2 = y1 + 1;

                        idistx = 256 - distx;
                        idisty = 256 - disty;

                        b = bits + (y1 + dy)*stride;
                        x_off = x1 + dx;

                        tl = RegionContainsPoint(pict->pCompositeClip, x1 + dx, y1 + dy, &box)
                             ? fetch(b, x_off, indexed) : 0;
                        tr = RegionContainsPoint(pict->pCompositeClip, x2 + dx, y1 + dy, &box)
                             ? fetch(b, x_off + 1, indexed) : 0;
                        b += stride;
                        bl = RegionContainsPoint(pict->pCompositeClip, x1 + dx, y2 + dy, &box)
                             ? fetch(b, x_off, indexed) : 0;
                        br = RegionContainsPoint(pict->pCompositeClip, x2 + dx, y2 + dy, &box)
                             ? fetch(b, x_off + 1, indexed) : 0;

                        ft = FbGet8(tl,0) * idistx + FbGet8(tr,0) * distx;
                        fb = FbGet8(bl,0) * idistx + FbGet8(br,0) * distx;
                        r = (((ft * idisty + fb * disty) >> 16) & 0xff);
                        ft = FbGet8(tl,8) * idistx + FbGet8(tr,8) * distx;
                        fb = FbGet8(bl,8) * idistx + FbGet8(br,8) * distx;
                        r |= (((ft * idisty + fb * disty) >> 8) & 0xff00);
                        ft = FbGet8(tl,16) * idistx + FbGet8(tr,16) * distx;
                        fb = FbGet8(bl,16) * idistx + FbGet8(br,16) * distx;
                        r |= (((ft * idisty + fb * disty)) & 0xff0000);
                        ft = FbGet8(tl,24) * idistx + FbGet8(tr,24) * distx;
                        fb = FbGet8(bl,24) * idistx + FbGet8(br,24) * distx;
                        r |= (((ft * idisty + fb * disty) << 8) & 0xff000000);
                        buffer[i] = r;
                    }
                    v.vector[0] += unit.vector[0];
                    v.vector[1] += unit.vector[1];
                    v.vector[2] += unit.vector[2];
                }
            }
        }
    } else if (pict->filter == PictFilterConvolution) {
        xFixed *params = pict->filter_params;
        INT32 cwidth = xFixedToInt(params[0]);
        INT32 cheight = xFixedToInt(params[1]);
        int xoff = params[0] >> 1;
        int yoff = params[1] >> 1;
        params += 2;
        for (i = 0; i < width; ++i) {
            if (!v.vector[2]) {
                buffer[i] = 0;
            } else {
                int x1, x2, y1, y2, x, y;
                INT32 srtot, sgtot, sbtot, satot;
                xFixed *p = params;

                if (!affine) {
                    xFixed_48_16 tmp;
                    tmp = ((xFixed_48_16)v.vector[0] << 16)/v.vector[2] - xoff;
                    x1 = xFixedToInt(tmp);
                    tmp = ((xFixed_48_16)v.vector[1] << 16)/v.vector[2] - yoff;
                    y1 = xFixedToInt(tmp);
                } else {
                    x1 = xFixedToInt(v.vector[0] - xoff);
                    y1 = xFixedToInt(v.vector[1] - yoff);
                }
                x2 = x1 + cwidth;
                y2 = y1 + cheight;

                srtot = sgtot = sbtot = satot = 0;

                for (y = y1; y < y2; y++) {
                    int ty = (pict->repeatType == RepeatNormal) ? MOD (y, pict->pDrawable->height) : y;
                    for (x = x1; x < x2; x++) {
                        if (*p) {
                            int tx = (pict->repeatType == RepeatNormal) ? MOD (x, pict->pDrawable->width) : x;
                            if (RegionContainsPoint(pict->pCompositeClip, tx + dx, ty + dy, &box)) {
                                FbBits *b = bits + (ty + dy)*stride;
                                CARD32 c = fetch(b, tx + dx, indexed);

                                srtot += Red(c) * *p;
                                sgtot += Green(c) * *p;
                                sbtot += Blue(c) * *p;
                                satot += Alpha(c) * *p;
                            }
                        }
                        p++;
                    }
                }

                if (satot < 0) satot = 0; else if (satot > 0xff) satot = 0xff;
                if (srtot < 0) srtot = 0; else if (srtot > 0xff) srtot = 0xff;
                if (sgtot < 0) sgtot = 0; else if (sgtot > 0xff) sgtot = 0xff;
                if (sbtot < 0) sbtot = 0; else if (sbtot > 0xff) sbtot = 0xff;

                buffer[i] = ((satot << 24) |
                             (srtot << 16) |
                             (sgtot <<  8) |
                             (sbtot       ));
            }
            v.vector[0] += unit.vector[0];
            v.vector[1] += unit.vector[1];
            v.vector[2] += unit.vector[2];
        }
    }
}


static void fbFetchExternalAlpha(PicturePtr pict, int x, int y, int width, CARD32 *buffer)
{
    int i;
    CARD32 _alpha_buffer[SCANLINE_BUFFER_LENGTH];
    CARD32 *alpha_buffer = _alpha_buffer;

    if (!pict->alphaMap) {
        fbFetchTransformed(pict, x, y, width, buffer);
	return;
    }
    if (width > SCANLINE_BUFFER_LENGTH)
        alpha_buffer = (CARD32 *) malloc(width*sizeof(CARD32));

    fbFetchTransformed(pict, x, y, width, buffer);
    fbFetchTransformed(pict->alphaMap, x - pict->alphaOrigin.x, y - pict->alphaOrigin.y, width, alpha_buffer);
    for (i = 0; i < width; ++i) {
        int a = alpha_buffer[i]>>24;
        buffer[i] = (a << 24)
                 | (div_255(Red(buffer[i]) * a) << 16)
                 | (div_255(Green(buffer[i]) * a) << 8)
                 | (div_255(Blue(buffer[i]) * a));
    }

    if (alpha_buffer != _alpha_buffer)
        free(alpha_buffer);
}

static void fbStore(PicturePtr pict, int x, int y, int width, CARD32 *buffer)
{
    FbBits *bits;
    FbStride stride;
    int bpp;
    int xoff, yoff;
    storeProc store = storeProcForPicture(pict);
    miIndexedPtr indexed = (miIndexedPtr) pict->pFormat->index.devPrivate;

    fbGetDrawable (pict->pDrawable, bits, stride, bpp, xoff, yoff);
    x += xoff;
    y += yoff;

    bits += y*stride;
    store(bits, buffer, x, width, indexed);
}

static void fbStoreExternalAlpha(PicturePtr pict, int x, int y, int width, CARD32 *buffer)
{
    FbBits *bits, *alpha_bits;
    FbStride stride, astride;
    int bpp, abpp;
    int xoff, yoff;
    int ax, ay;
    storeProc store;
    storeProc astore;
    miIndexedPtr indexed = (miIndexedPtr) pict->pFormat->index.devPrivate;
    miIndexedPtr aindexed;

    if (!pict->alphaMap) {
        fbStore(pict, x, y, width, buffer);
	return;
    }

    store = storeProcForPicture(pict);
    astore = storeProcForPicture(pict->alphaMap);
    aindexed = (miIndexedPtr) pict->alphaMap->pFormat->index.devPrivate;

    ax = x;
    ay = y;

    fbGetDrawable (pict->pDrawable, bits, stride, bpp, xoff, yoff);
    x += xoff;
    y += yoff;
    fbGetDrawable (pict->alphaMap->pDrawable, alpha_bits, astride, abpp, xoff, yoff);
    ax += xoff;
    ay += yoff;

    bits       += y*stride;
    alpha_bits += (ay - pict->alphaOrigin.y)*astride;


    store(bits, buffer, x, width, indexed);
    astore(alpha_bits, buffer, ax - pict->alphaOrigin.x, width, aindexed);
}

typedef void (*scanStoreProc)(PicturePtr , int , int , int , CARD32 *);
typedef void (*scanFetchProc)(PicturePtr , int , int , int , CARD32 *);

static void
fbCompositeRect (const FbComposeData *data, CARD32 *scanline_buffer)
{
    CARD32 *src_buffer = scanline_buffer;
    CARD32 *dest_buffer = src_buffer + data->width;
    int i;
    scanStoreProc store;
    scanFetchProc fetchSrc = NULL, fetchMask = NULL, fetchDest = NULL;

    if (data->op == PictOpClear)
        fetchSrc = NULL;
    else if (!data->src->pDrawable) {
        if (data->src->pSourcePict)
            fetchSrc = fbFetchSourcePict;
    } else if (data->src->alphaMap)
        fetchSrc = fbFetchExternalAlpha;
    else if (data->src->repeatType == RepeatNormal &&
             data->src->pDrawable->width == 1 && data->src->pDrawable->height == 1)
        fetchSrc = fbFetchSolid;
    else if (!data->src->transform && data->src->filter != PictFilterConvolution)
        fetchSrc = fbFetch;
    else
        fetchSrc = fbFetchTransformed;

    if (data->mask && data->op != PictOpClear) {
        if (!data->mask->pDrawable) {
            if (data->mask->pSourcePict)
                fetchMask = fbFetchSourcePict;
        } else if (data->mask->alphaMap)
            fetchMask = fbFetchExternalAlpha;
        else if (data->mask->repeatType == RepeatNormal
                 && data->mask->pDrawable->width == 1 && data->mask->pDrawable->height == 1)
            fetchMask = fbFetchSolid;
        else if (!data->mask->transform && data->mask->filter != PictFilterConvolution)
            fetchMask = fbFetch;
        else
            fetchMask = fbFetchTransformed;
    } else {
        fetchMask = NULL;
    }

    if (data->dest->alphaMap) {
        fetchDest = fbFetchExternalAlpha;
        store = fbStoreExternalAlpha;
    } else {
        fetchDest = fbFetch;
        store = fbStore;
    }
    if (data->op == PictOpClear || data->op == PictOpSrc)
        fetchDest = NULL;

    if (fetchSrc && fetchMask && data->mask && data->mask->componentAlpha && PICT_FORMAT_RGB(data->mask->format)) {
        CARD32 *mask_buffer = dest_buffer + data->width;
        CombineFuncC compose = composeFunctions.combineC[data->op];
        if (!compose)
            return;

        for (i = 0; i < data->height; ++i)
        {
            /* fill first half of scanline with source */
            fetchSrc(data->src, data->xSrc, data->ySrc + i, data->width, src_buffer);
            fetchMask(data->mask, data->xMask, data->yMask + i, data->width, mask_buffer);

            /* fill dest into second half of scanline */
            if (fetchDest)
                fetchDest(data->dest, data->xDest, data->yDest + i, data->width, dest_buffer);

            /* blend */
            compose(dest_buffer, src_buffer, mask_buffer, data->width);

            /* write back */
            store(data->dest, data->xDest, data->yDest + i, data->width, dest_buffer);
        }
    } else {

        CombineFuncU compose = composeFunctions.combineU[data->op];
        if (!compose)
            return;

        if (fetchSrc == fbFetchSolid && (!fetchMask || fetchMask == fbFetchSolid)) {
            fetchSrc(data->src, data->xSrc, data->ySrc, data->width, src_buffer);
            if (fetchMask) {
                fetchMask(data->mask, data->xMask, data->yMask, data->width, dest_buffer);
                composeFunctions.combineMaskU(src_buffer, dest_buffer, data->width);
            }
            fetchSrc = NULL;
            fetchMask = NULL;
        }

        for (i = 0; i < data->height; ++i)
        {
            /* fill first half of scanline with source */
            if (fetchSrc) {
                fetchSrc(data->src, data->xSrc, data->ySrc + i, data->width, src_buffer);

                /* add in mask */
                if (fetchMask) {
                    fetchMask(data->mask, data->xMask, data->yMask + i, data->width, dest_buffer);
                    composeFunctions.combineMaskU(src_buffer, dest_buffer, data->width);
                }
            }

            /* fill dest into second half of scanline */
            if (fetchDest)
                fetchDest(data->dest, data->xDest, data->yDest + i, data->width, dest_buffer);

            /* blend */
            compose(dest_buffer, src_buffer, data->width);

            /* write back */
            store(data->dest, data->xDest, data->yDest + i, data->width, dest_buffer);
        }
    }
}

void
fbCompositeGeneral (CARD8	op,
		    PicturePtr	pSrc,
		    PicturePtr	pMask,
		    PicturePtr	pDst,
		    INT16	xSrc,
		    INT16	ySrc,
		    INT16	xMask,
		    INT16	yMask,
		    INT16	xDst,
		    INT16	yDst,
		    CARD16	width,
		    CARD16	height)
{
    RegionRec	    region;
    int		    n;
    BoxPtr	    pbox;
    Bool	    srcRepeat = FALSE;
    Bool	    maskRepeat = FALSE;
    int		    w, h;
    CARD32 _scanline_buffer[SCANLINE_BUFFER_LENGTH*3];
    CARD32 *scanline_buffer = _scanline_buffer;
    FbComposeData compose_data;
    
    if (pSrc->pDrawable)
        srcRepeat = pSrc->repeatType == RepeatNormal && !pSrc->transform
                    && (pSrc->pDrawable->width != 1 || pSrc->pDrawable->height != 1);

    if (pMask && pMask->pDrawable)
	maskRepeat = pMask->repeatType == RepeatNormal && !pMask->transform
                     && (pMask->pDrawable->width != 1 || pMask->pDrawable->height != 1);

    if (op == PictOpOver && !pMask && !pSrc->transform && !PICT_FORMAT_A(pSrc->format) && !pSrc->alphaMap)
        op = PictOpSrc;

    if (!miComputeCompositeRegion (&region,
				   pSrc,
				   pMask,
				   pDst,
				   xSrc,
				   ySrc,
				   xMask,
				   yMask,
				   xDst,
				   yDst,
				   width,
				   height))
	    return;

    compose_data.op = op;
    compose_data.src = pSrc;
    compose_data.mask = pMask;
    compose_data.dest = pDst;
    if (width > SCANLINE_BUFFER_LENGTH)
        scanline_buffer = (CARD32 *) malloc(width * 3 * sizeof(CARD32));

    n = RegionNumRects (&region);
    pbox = RegionRects (&region);
    while (n--)
    {
	h = pbox->y2 - pbox->y1;
	compose_data.ySrc = pbox->y1 - yDst + ySrc;
	compose_data.yMask = pbox->y1 - yDst + yMask;
	compose_data.yDest = pbox->y1;
	while (h)
	{
	    compose_data.height = h;
	    w = pbox->x2 - pbox->x1;
	    compose_data.xSrc = pbox->x1 - xDst + xSrc;
	    compose_data.xMask = pbox->x1 - xDst + xMask;
	    compose_data.xDest = pbox->x1;
	    if (maskRepeat)
	    {
		compose_data.yMask = mod (compose_data.yMask, pMask->pDrawable->height);
		if (compose_data.height > pMask->pDrawable->height - compose_data.yMask)
		    compose_data.height = pMask->pDrawable->height - compose_data.yMask;
	    }
	    if (srcRepeat)
	    {
		compose_data.ySrc = mod (compose_data.ySrc, pSrc->pDrawable->height);
		if (compose_data.height > pSrc->pDrawable->height - compose_data.ySrc)
		    compose_data.height = pSrc->pDrawable->height - compose_data.ySrc;
	    }
	    while (w)
	    {
		compose_data.width = w;
		if (maskRepeat)
		{
		    compose_data.xMask = mod (compose_data.xMask, pMask->pDrawable->width);
		    if (compose_data.width > pMask->pDrawable->width - compose_data.xMask)
			compose_data.width = pMask->pDrawable->width - compose_data.xMask;
		}
		if (srcRepeat)
		{
		    compose_data.xSrc = mod (compose_data.xSrc, pSrc->pDrawable->width);
		    if (compose_data.width > pSrc->pDrawable->width - compose_data.xSrc)
			compose_data.width = pSrc->pDrawable->width - compose_data.xSrc;
		}
		fbCompositeRect(&compose_data, scanline_buffer);
		w -= compose_data.width;
		compose_data.xSrc += compose_data.width;
		compose_data.xMask += compose_data.width;
		compose_data.xDest += compose_data.width;
	    }
	    h -= compose_data.height;
	    compose_data.ySrc += compose_data.height;
	    compose_data.yMask += compose_data.height;
	    compose_data.yDest += compose_data.height;
    }
	pbox++;
    }
    RegionUninit(&region);

    if (scanline_buffer != _scanline_buffer)
        free(scanline_buffer);
}

#endif