From f4092abdf94af6a99aff944d6264bc1284e8bdd4 Mon Sep 17 00:00:00 2001
From: Reinhard Tartler
Date: Mon, 10 Oct 2011 17:43:39 +0200
Subject: Imported nx-X11-3.1.0-1.tar.gz

Summary: Imported nx-X11-3.1.0-1.tar.gz
Keywords: Imported nx-X11-3.1.0-1.tar.gz into Git repository
---
 .../Xserver/hw/xfree86/os-support/misc/BUSmemcpy.c | 413 +++++++++++++++++++++
 1 file changed, 413 insertions(+)
 create mode 100644 nx-X11/programs/Xserver/hw/xfree86/os-support/misc/BUSmemcpy.c

diff --git a/nx-X11/programs/Xserver/hw/xfree86/os-support/misc/BUSmemcpy.c b/nx-X11/programs/Xserver/hw/xfree86/os-support/misc/BUSmemcpy.c
new file mode 100644
index 000000000..9ff14396c
--- /dev/null
+++ b/nx-X11/programs/Xserver/hw/xfree86/os-support/misc/BUSmemcpy.c
@@ -0,0 +1,413 @@
+
+/****************************************************************************
+
+ For Alpha Linux, BusToMem() and MemToBus() can be simply memcpy(), BUT:
+ we need to prevent unaligned operations when accessing DENSE space on the BUS,
+ as the video memory is mmap'd that way. The below code does this.
+
+NOTE: we could simply use the "memcpy()" from LIBC here, but that, currently, is
+ not as fast.
+
+Thanks to Linus Torvalds for contributing this code.
+
+****************************************************************************/
+
+/* $XFree86: xc/programs/Xserver/hw/xfree86/os-support/misc/BUSmemcpy.c,v 1.3 1999/12/03 19:17:44 eich Exp $ */
+
+#ifdef HAVE_XORG_CONFIG_H
+#include <xorg-config.h>
+#endif
+
+#include <X11/X.h>
+#include "xf86.h"
+#include "xf86Priv.h"
+#include "xf86_OSlib.h"
+
+#ifdef __alpha__
+
+#include "compiler.h"
+
+/*
+ * The Jensen lacks dense memory, thus we have to address the bus via
+ * the sparse addressing scheme. These routines are only used in s3im.c
+ * Non time critical code uses SlowBCopy_{from/to} bus.
+ *
+ * Martin Ostermann (ost@comnets.rwth-aachen.de) - Apr.-Sep. 1996
+ */
+
+#ifdef TEST_JENSEN_CODE /* define to test the Sparse addressing on a non-Jensen */
+#define LWORD_CODING (0x18)
+#define SPARSE (5)
+#else
+#define LWORD_CODING (0x60)
+#define SPARSE (7)
+#endif
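+
+/*
+ * Editor's sketch -- not part of the imported file.  Under the encoding
+ * above, the 4-byte bus word that holds byte offset `off' is reached
+ * through the host address (Base + (off << SPARSE) + LWORD_CODING),
+ * rounded down to a bus-word boundary; the functions below open-code
+ * that pattern.  A hypothetical helper spelling it out:
+ */
+#if 0	/* illustration only, never compiled */
+static volatile unsigned int *
+sparse_lword(char *Base, long off)
+{
+    /* shift the byte offset into sparse space, select longword coding,
+       and round down to the containing 4-byte bus word */
+    return (volatile unsigned int *)
+	((long)(Base + (off << SPARSE) + LWORD_CODING) & ~(3L << SPARSE));
+}
+#endif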
+
+void
+xf86JensenMemToBus(char *Base, long dst, long src, int count)
+{
+    if( ((long)src^((long)dst)) & 3) {
+	/* src & dst are NOT aligned to each other */
+	unsigned long addr;
+	unsigned long low_word, high_word,last_read;
+	long rm,loop;
+	unsigned long tmp,org,org2,mask,src_org,count_org;
+
+	src_org=src;
+	count_org=count;
+
+	/* add EISA longword coding and round off*/
+	addr = (long)(Base+(dst<<SPARSE) + LWORD_CODING) & ~(3<<SPARSE);
+	rm = (long)dst & 3;
+	count += rm;
+	src -= rm;
+
+	org = *(volatile unsigned int *)addr;
+	__asm__("ldq_u %0,%1"
+		:"=r" (low_word):"m" (*(unsigned long *)(src)));
+
+	if( count > 4 ) {
+	    last_read = src_org+count_org - 1;
+	    __asm__("ldq_u %0,%1"
+		    :"=r" (high_word):"m" (*(unsigned long *)(src+4)));
+	    __asm__("extll %1,%2,%0"
+		    :"=r" (low_word)
+		    :"r" (low_word), "r" ((unsigned long)(src)));
+	    __asm__("extlh %1,%2,%0"
+		    :"=r" (tmp)
+		    :"r" (high_word), "r" ((unsigned long)(src)));
+	    tmp |= low_word;
+	    src += 4;
+	    __asm__("mskqh %1,%2,%0"
+		    :"=r" (tmp)
+		    :"r" (tmp), "r" (rm));
+	    __asm__("mskql %1,%2,%0"
+		    :"=r" (org2)
+		    :"r" (org), "r" (rm));
+	    tmp |= org2;
+
+	    loop = (count-4) >> 2;	/* loop eqv. count>=4 ; count -= 4 */
+	    while (loop) {
+		/* tmp to be stored completly -- need to read next word*/
+		low_word = high_word;
+		*(volatile unsigned int *) (addr) = tmp;
+		__asm__("ldq_u %0,%1"
+			:"=r" (high_word):"m" (*(unsigned long*)(src+4)));
+		loop --;
+		__asm__("extll %1,%2,%0"
+			:"=r" (low_word)
+			:"r" (low_word), "r" ((unsigned long)src));
+		__asm__("extlh %1,%2,%0"
+			:"=r" (tmp)
+			:"r" (high_word), "r" ((unsigned long)src));
+		src += 4;
+		tmp |= low_word;
+		addr += 4<<SPARSE;
+	    }
+
+	    /* store the last complete word, then merge any leftover
+	       bytes into the following bus word */
+	    *(volatile unsigned int *) (addr) = tmp;
+	    count = (count-4) & 3;
+	    if( count ) {
+		addr += 4<<SPARSE;
+		org = *(volatile unsigned int *)addr;
+		low_word = high_word;
+		if( (unsigned long)(src+4) <= last_read )
+		    __asm__("ldq_u %0,%1"
+			    :"=r" (high_word):"m" (*(unsigned long *)(src+4)));
+		__asm__("extll %1,%2,%0"
+			:"=r" (low_word)
+			:"r" (low_word), "r" ((unsigned long)(src)));
+		__asm__("extlh %1,%2,%0"
+			:"=r" (tmp)
+			:"r" (high_word), "r" ((unsigned long)(src)));
+		tmp |= low_word;
+		mask = -1;
+		__asm__("mskql %1,%2,%0"
+			:"=r" (mask)
+			:"r" (mask), "r" (count));
+		tmp = (tmp & mask) | (org & ~mask);
+		*(volatile unsigned int *) (addr) = tmp;
+	    }
+	    return;
+	} else { /* count not > 4 */
+	    __asm__("ldq_u %0,%1"
+		    :"=r" (high_word):"m" (*(unsigned long *)(src+4)));
+	    __asm__("extll %1,%2,%0"
+		    :"=r" (low_word)
+		    :"r" (low_word), "r" ((unsigned long)(src)));
+	    __asm__("extlh %1,%2,%0"
+		    :"=r" (tmp)
+		    :"r" (high_word), "r" ((unsigned long)(src)));
+	    tmp |= low_word;
+	    if( count < 4 ) {
+
+		mask = -1;
+		__asm__("mskqh %1,%2,%0"
+			:"=r" (mask)
+			:"r" (mask), "r" (rm));
+		__asm__("mskql %1,%2,%0"
+			:"=r" (mask)
+			:"r" (mask), "r" (count));
+		tmp = (tmp & mask) | (org & ~mask);
+		*(volatile unsigned int *) (addr) = tmp;
+		return;
+	    } else {
+		__asm__("mskqh %1,%2,%0"
+			:"=r" (tmp)
+			:"r" (tmp), "r" (rm));
+		__asm__("mskql %1,%2,%0"
+			:"=r" (org2)
+			:"r" (org), "r" (rm));
+
+		tmp |= org2;
+		*(volatile unsigned int *) (addr) = tmp;
+		return;
+	    }
+	}
+    } else { /* src & dst are aligned to each other */
+	unsigned long addr;
+	unsigned int tmp,org,rm;
+	unsigned int *src_r;
+
+	/* add EISA longword coding and round off*/
+	addr = (long)(Base+(dst<<SPARSE) + LWORD_CODING) & ~(3<<SPARSE);
+	rm = (long)dst & 3;
+
+	if( rm ) {
+	    /* merge the partial first word with what is on the bus */
+	    src_r = (unsigned int *)(src - rm);
+	    tmp = *src_r;
+	    org = *(volatile unsigned int *)addr;
+	    __asm__("mskqh %1,%2,%0"
+		    :"=r" (tmp)
+		    :"r" (tmp), "r" (rm));
+	    __asm__("mskql %1,%2,%0"
+		    :"=r" (org)
+		    :"r" (org), "r" (rm));
+	    tmp |= org;
+	    count += rm;
+	} else {
+	    src_r = (unsigned int *)src;
+	    tmp = *src_r;
+	}
+
+	while( count > 4) {
+	    *(volatile unsigned int *) addr = tmp;
+	    addr += 4<<SPARSE;
+	    src_r++;
+	    tmp = *src_r;
+	    count -= 4;
+	}
+
+	/* merge the trailing bytes into the last bus word */
+	org = *(volatile unsigned int *)addr;
+	__asm__("mskql %1,%2,%0"
+		:"=r" (tmp)
+		:"r" (tmp), "r" (count));
+	__asm__("mskqh %1,%2,%0"
+		:"=r" (org)
+		:"r" (org), "r" (count));
+	tmp |= org;
+	*(volatile unsigned int *) addr = tmp;
+    }
+}
+
+void
+xf86JensenBusToMem(char *Base, char *dst, unsigned long src, int count)
+{
+    unsigned long addr;
+    long result;
+
+    addr = (long)(Base+(src<<SPARSE));
+    while( addr & (3<<SPARSE) ){
+	if(count <= 0) return;
+	result = *(volatile int *) addr;
+	result >>= ((addr>>SPARSE) & 3) * 8;
+	*dst++ = (char) result;
+	addr += 1<<SPARSE;
+	count--;
+    }
+    count -= 4;
+    while(count >= 0){
+	int i;
+
+	result = *(volatile int *) (addr+LWORD_CODING);
+	for(i=4;i--;) {
+	    *dst++ = (char) result;
+	    result >>= 8;
+	}
+	addr += 4<<SPARSE;
+	count -= 4;
+    }
+    count += 4;
+    while(count){
+	result = *(volatile int *) addr;
+	result >>= ((addr>>SPARSE) & 3) * 8;
+	*dst++ = (char) result;
+	addr += 1<<SPARSE;
+	count--;
+    }
+}
+
+/*
+ * linux/arch/alpha/lib/memcpy.c
+ *
+ * Copyright (C) 1995  Linus Torvalds
+ */
+
+/*
+ * This is a reasonably optimized memcpy() routine.
+ */
+
+/*
+ * Note that the C code is written to be optimized into good assembly. However,
+ * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in a
+ * explicit compare against 0 (instead of just using the proper "blt reg, xx" or
+ * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually..
+ */
+
+/*
+ * This should be done in one go with ldq_u*2/mask/stq_u. Do it
+ * with a macro so that we can fix it up later..
+ */
+#define ALIGN_DEST_TO8(d,s,n) \
+	while (d & 7) { \
+		if (n <= 0) return; \
+		n--; \
+		*(char *) d = *(char *) s; \
+		d++; s++; \
+	}
+
+/*
+ * This should similarly be done with ldq_u*2/mask/stq. The destination
+ * is aligned, but we don't fill in a full quad-word
+ */
+#define DO_REST(d,s,n) \
+	while (n > 0) { \
+		n--; \
+		*(char *) d = *(char *) s; \
+		d++; s++; \
+	}
+
+/*
+ * This should be done with ldq/mask/stq. The source and destination are
+ * aligned, but we don't fill in a full quad-word
+ */
+#define DO_REST_ALIGNED(d,s,n) DO_REST(d,s,n)
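+
+/*
+ * Editor's sketch -- not part of the imported file.  The three macros
+ * above expand to plain byte copies inside the enclosing function (the
+ * `return' in ALIGN_DEST_TO8 leaves that function once n is exhausted),
+ * so a purely byte-wise copy built from them would look like this
+ * hypothetical function:
+ */
+#if 0	/* illustration only, never compiled */
+static void byte_copy(unsigned long d, unsigned long s, long n)
+{
+	ALIGN_DEST_TO8(d,s,n);	/* byte-copy until d is 8-byte aligned */
+	DO_REST(d,s,n);		/* byte-copy the remaining n bytes */
+}
+#endif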
+
+/*
+ * This does unaligned memory copies. We want to avoid storing to
+ * an unaligned address, as that would do a read-modify-write cycle.
+ * We also want to avoid double-reading the unaligned reads.
+ *
+ * Note the ordering to try to avoid load (and address generation) latencies.
+ */
+static __inline__ void __memcpy_unaligned(unsigned long d, unsigned long s, long n)
+{
+	ALIGN_DEST_TO8(d,s,n);
+	n -= 8;			/* to avoid compare against 8 in the loop */
+	if (n >= 0) {
+		unsigned long low_word, high_word;
+		__asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s));
+		do {
+			unsigned long tmp;
+			__asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8)));
+			n -= 8;
+			__asm__("extql %1,%2,%0"
+				:"=r" (low_word)
+				:"r" (low_word), "r" (s));
+			__asm__("extqh %1,%2,%0"
+				:"=r" (tmp)
+				:"r" (high_word), "r" (s));
+			s += 8;
+			*(unsigned long *) d = low_word | tmp;
+			d += 8;
+			low_word = high_word;
+		} while (n >= 0);
+	}
+	n += 8;
+	DO_REST(d,s,n);
+}
+
+/*
+ * Hmm.. Strange. The __asm__ here is there to make gcc use a integer register
+ * for the load-store. I don't know why, but it would seem that using a floating
+ * point register for the move seems to slow things down (very small difference,
+ * though).
+ *
+ * Note the ordering to try to avoid load (and address generation) latencies.
+ */
+static __inline__ void __memcpy_aligned(unsigned long d, unsigned long s, long n)
+{
+	ALIGN_DEST_TO8(d,s,n);
+	n -= 8;
+	while (n >= 0) {
+		unsigned long tmp;
+		__asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
+		n -= 8;
+		s += 8;
+		*(unsigned long *) d = tmp;
+		d += 8;
+	}
+	n += 8;
+	DO_REST_ALIGNED(d,s,n);
+}
+
+static unsigned long __memcpy(unsigned long dest, unsigned long src, int n)
+{
+	if (!((dest ^ src) & 7)) {
+		__memcpy_aligned(dest, src, n);
+		return dest;
+	}
+	__memcpy_unaligned(dest, src, n);
+	return dest;
+}
+
+#else /* __alpha__ */
+
+void
+xf86BusToMem(unsigned char *dst, unsigned char *src, int len)
+{
+	memcpy(dst, src, len);
+}
+void
+xf86MemToBus(unsigned char *dst, unsigned char *src, int len)
+{
+	memcpy(dst, src, len);
+}
+
+#endif /* __alpha__ */
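+
+/*
+ * Editor's usage sketch -- not part of the imported file.  The exported
+ * entry points copy `len' bytes between ordinary memory and a bus
+ * mapping; the `fb' and `shadow' buffers below are hypothetical.
+ */
+#if 0	/* illustration only, never compiled */
+static void sync_scanline(unsigned char *fb, unsigned char *shadow, int len)
+{
+	xf86MemToBus(fb, shadow, len);	/* system memory -> bus (video) */
+	xf86BusToMem(shadow, fb, len);	/* bus (video) -> system memory */
+}
+#endif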