Hi,

On Tue, 12 Dec 2006, Michael Schmitz wrote:

> > > vger refused the atafb patch again, that's why I spammed Geert by CC: on
> > > all this, sigh.
> >
> > BTW I pretty much rewrote the bit plane stuff, so it doesn't constantly
> > corrupt the output.
> 
> ??? When did you get to see output corruption? Can you send me your
> rewrite for testing? (Though Aranym should suffice for testing, really)

I was testing with vga256 and pretty much any line edit operation which 
calls copyarea or fillrect causes a corrupt output (i.e. anything besides 
special cases like full lines). Am I the only one seeing that?
I attached the 8 plane version, it still needs some more cleanups. The 2
and 4 plane versions will look similar and I'm probably going to merge them
at some point to use one template to produce the 3 versions.
I killed the movep stuff, I'm not sure it still is really a win (maybe at 
68000 times it was, but on a 68060 it certainly isn't), some operations 
should be even considerably faster now (e.g. odd<->even copy).

bye, Roman
/*
 *  linux/drivers/video/iplan2p8.c -- Low level frame buffer operations for
 *				      interleaved bitplanes à la Atari (8
 *				      planes, 2 bytes interleave)
 *
 *	Created 5 Apr 1997 by Geert Uytterhoeven
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of this archive for
 *  more details.
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/fb.h>

#include <asm/setup.h>

#include "console/fbcon.h"
#include "atafb_iplan2p8.h"

#define BPL	8
#include "atafb_utils.h"


/* Copies a 8 plane column from 's', height 'h', to 'd'. */

/* This expands a 8 bit color into two longs for two movepl (8 plane)
 * operations.
 */

void atafb_iplan2p8_copyarea(struct fb_info *info, u_long next_line,
                             int sy, int sx, int dy, int dx,
			     int height, int width)
{
	/*  bmove() has to distinguish two major cases: If both, source and
	 *  destination, start at even addresses or both are at odd
	 *  addresses, just the first odd and last even column (if present)
	 *  require special treatment (memmove_col()). The rest between
	 *  then can be copied by normal operations, because all adjacent
	 *  bytes are affected and are to be stored in the same order.
	 *    The pathological case is when the move should go from an odd
	 *  address to an even or vice versa. Since the bytes in the plane
	 *  words must be assembled in new order, it seems wisest to make
	 *  all movements by memmove_col().
	 */

	u8 *src, *dst;
	u32 *s, *d;
	int w, l , i, j;
	u_int colsize;
	u_int upwards = (dy < sy) || (dy == sy && dx < sx);

	colsize = height;
	if (!((sx ^ dx) & 15)) {
		/* odd->odd or even->even */

		if (upwards) {
			src = (u8 *)info->screen_base + sy * next_line + (sx & ~15);
			dst = (u8 *)info->screen_base + dy * next_line + (dx & ~15);
			if (sx & 15) {
				memmove32_col(dst, src, 0xff00ff, height, next_line - 16);
				src += 16;
				dst += 16;
				width -= 8;
			}
			w = width >> 4;
			if (w) {
				s = (u32 *)src;
				d = (u32 *)dst;
				w *= 4;
				l = next_line - w * 4;
				for (j = height; j > 0; j--) {
					for (i = w; i > 0; i--)
						*d++ = *s++;
					s = (u32 *)((u8 *)s + l);
					d = (u32 *)((u8 *)d + l);
				}
			}
			if (width & 15)
				memmove32_col(dst + width, src + width,
					      0xff00ff00, height, next_line - 16);
		} else {
			src = (u8 *)info->screen_base + (sy - 1) * next_line + ((sx + width + 8) & ~15);
			dst = (u8 *)info->screen_base + (dy - 1) * next_line + ((dx + width + 8) & ~15);

			if ((sx + width) & 15) {
				src -= 16;
				dst -= 16;
				memmove32_col(dst, src, 0xff00ff00, colsize, -next_line - 16);
				width -= 8;
			}
			w = width >> 4;
			if (w) {
				s = (u32 *)src;
				d = (u32 *)dst;
				w *= 4;
				l = next_line - w * 4;
				for (j = height; j > 0; j--) {
					for (i = w; i > 0; i--)
						*--d = *--s;
					s = (u32 *)((u8 *)s - l);
					d = (u32 *)((u8 *)d - l);
				}
			}
			if (sx & 15)
				memmove32_col(dst - width - 16, src - width - 16,
					      0xff00ff, colsize, -next_line - 16);
		}
	} else {
		/* odd->even or even->odd */
		if (upwards) {
			u32 *src32, *dst32;
			u32 pval[4], v, v1, mask;
			int i, j, w, f;

			src = (u8 *)info->screen_base + sy * next_line + (sx & ~15);
			dst = (u8 *)info->screen_base + dy * next_line + (dx & ~15);

			mask = 0xff00ff00;
			f = 0;
			w = width;
			if (sx & 15) {
				f = 1;
				w += 8;
			}
			if ((sx + width) & 15)
				f |= 2;
			w >>= 4;
			for (i = height; i; i--) {
				src32 = (u32 *)src;
				dst32 = (u32 *)dst;

				if (f & 1) {
					pval[0] = (*src32++ << 8) & mask;
					pval[1] = (*src32++ << 8) & mask;
					pval[2] = (*src32++ << 8) & mask;
					pval[3] = (*src32++ << 8) & mask;
				} else {
					pval[0] = dst32[0] & mask;
					pval[1] = dst32[1] & mask;
					pval[2] = dst32[2] & mask;
					pval[3] = dst32[3] & mask;
				}

				for (j = w; j > 0; j--) {
					v = *src32++;
					v1 = v & mask;
					*dst32++ = pval[0] | (v1 >> 8);
					pval[0] = (v ^ v1) << 8;
					v = *src32++;
					v1 = v & mask;
					*dst32++ = pval[1] | (v1 >> 8);
					pval[1] = (v ^ v1) << 8;
					v = *src32++;
					v1 = v & mask;
					*dst32++ = pval[2] | (v1 >> 8);
					pval[2] = (v ^ v1) << 8;
					v = *src32++;
					v1 = v & mask;
					*dst32++ = pval[3] | (v1 >> 8);
					pval[3] = (v ^ v1) << 8;
				}

				if (f & 2) {
					dst32[0] = (dst32[0] & mask) | pval[0];
					dst32[1] = (dst32[1] & mask) | pval[1];
					dst32[2] = (dst32[2] & mask) | pval[2];
					dst32[3] = (dst32[3] & mask) | pval[3];
				}

				src += next_line;
				dst += next_line;
			}
		} else {
			u32 *src32, *dst32;
			u32 pval[4], v, v1, mask;
			int i, j, w, f;

			src = (u8 *)(info->screen_base + (sy - 1) * next_line + ((sx + width + 8) & ~15));
			dst = (u8 *)(info->screen_base + (dy - 1) * next_line + ((dx + width + 8) & ~15));

			mask = 0xff00ff;
			f = 0;
			w = width;
			if ((dx + width) & 15)
				f = 1;
			if (sx & 15) {
				f |= 2;
				w += 8;
			}
			w >>= 4;
			for (i = height; i; i--) {
				src32 = (u32 *)src;
				dst32 = (u32 *)dst;

				if (f & 1) {
					pval[0] = dst32[-1] & mask;
					pval[1] = dst32[-2] & mask;
					pval[2] = dst32[-3] & mask;
					pval[3] = dst32[-4] & mask;
				} else {
					pval[0] = (*--src32 >> 8) & mask;
					pval[1] = (*--src32 >> 8) & mask;
					pval[2] = (*--src32 >> 8) & mask;
					pval[3] = (*--src32 >> 8) & mask;
				}

				for (j = w; j > 0; j--) {
					v = *--src32;
					v1 = v & mask;
					*--dst32 = pval[0] | (v1 << 8);
					pval[0] = (v ^ v1) >> 8;
					v = *--src32;
					v1 = v & mask;
					*--dst32 = pval[1] | (v1 << 8);
					pval[1] = (v ^ v1) >> 8;
					v = *--src32;
					v1 = v & mask;
					*--dst32 = pval[2] | (v1 << 8);
					pval[2] = (v ^ v1) >> 8;
					v = *--src32;
					v1 = v & mask;
					*--dst32 = pval[3] | (v1 << 8);
					pval[3] = (v ^ v1) >> 8;
				}

				if (!(f & 2)) {
					dst32[-1] = (dst32[-1] & mask) | pval[0];
					dst32[-2] = (dst32[-2] & mask) | pval[1];
					dst32[-3] = (dst32[-3] & mask) | pval[2];
					dst32[-4] = (dst32[-4] & mask) | pval[3];
				}

				src -= next_line;
				dst -= next_line;
			}
		}
	}
}

void atafb_iplan2p8_fillrect(struct fb_info *info, u_long next_line, u32 color,
                             int sy, int sx, int height, int width)
{
	u32 *dest;
	int rows, i, j;
	u32 cval[4], tmp;

	dest = (u32 *)(info->screen_base + sy * next_line + (sx & ~15) / (8 / BPL));
	if (sx & 15) {
		u8 *dest8 = (u8 *)dest + 1;

		expand8_col2mask(color, cval);

		for (i = height; i; i--) {
			fill8_col(dest8, cval);
			dest8 += next_line;
		}
		dest += BPL / 2;
		width -= 8;
	}

	expand16_col2mask(color, cval);
	rows = width >> 4;
	if (rows) {
		u32 *d = dest;
		u32 off = next_line - rows * BPL * 2;
		for (i = height; i; i--) {
			d = fill16_col(d, rows, cval);
			d = (u32 *)((long)d + off);
		}
		dest += rows << 2;
		width &= 15;
	}

	if (width) {
		u8 *dest8 = (u8 *)dest;

		expand8_col2mask(color, cval);

		for (i = height; i; i--) {
			fill8_col(dest8, cval);
			dest8 += next_line;
		}
	}
}

void atafb_iplan2p8_linefill(struct fb_info *info, u_long next_line,
			     int dy, int dx, u32 width,
			     const u8 *data, u32 bgcolor, u32 fgcolor)
{
	u32 *dest;
	const u16 *data16;
	int rows;
	u32 m, tmp;
	u32 fgm[4], bgm[4];

	dest = (u32 *)(info->screen_base + dy * next_line + (dx & ~15) / (8 / BPL));
	if (dx & 15) {
		fill8_2col((u8 *)dest + 1, fgcolor, bgcolor, *data++);
		dest += BPL / 2;
		width -= 8;
	}

	if (width >= 16) {
		data16 = (const u16 *)data;
		expand16_2col2mask(fgcolor, bgcolor, fgm, bgm);

		for (rows = width / 16; rows; rows--) {
			u16 d = *data16++;
			m = d | ((u32)d << 16);
			*dest++ = (m & fgm[0]) ^ bgm[0];
			*dest++ = (m & fgm[1]) ^ bgm[1];
			*dest++ = (m & fgm[2]) ^ bgm[2];
			*dest++ = (m & fgm[3]) ^ bgm[3];
		}

		data = (const u8 *)data16;
		width &= 15;
	}

	if (width)
		fill8_2col((u8 *)dest, fgcolor, bgcolor, *data);
}

#ifdef MODULE
MODULE_LICENSE("GPL");

int init_module(void)
{
	return 0;
}

void cleanup_module(void)
{}
#endif /* MODULE */


    /*
     *  Visible symbols for modules
     */

EXPORT_SYMBOL(atafb_iplan2p8_copyarea);
EXPORT_SYMBOL(atafb_iplan2p8_fillrect);
EXPORT_SYMBOL(atafb_iplan2p8_linefill);
#ifndef _VIDEO_ATAFB_UTILS_H
#define _VIDEO_ATAFB_UTILS_H

/* ================================================================= */
/*                      Utility Assembler Functions                  */
/* ================================================================= */

/* ====================================================================== */

/* Those of a delicate disposition might like to skip the next couple of
 * pages.
 *
 * These functions are drop in replacements for memmove and
 * memset(_, 0, _). However their five instances add at least a kilobyte
 * to the object file. You have been warned.
 *
 * Not a great fan of assembler for the sake of it, but I think
 * that these routines are at least 10 times faster than their C
 * equivalents for large blits, and that's important to the lowest level of
 * a graphics driver. Question is whether some scheme with the blitter
 * would be faster. I suspect not for simple text system - not much
 * asynchrony.
 *
 * Code is very simple, just gruesome expansion. Basic strategy is to
 * increase data moved/cleared at each step to 16 bytes to reduce
 * instruction per data move overhead. movem might be faster still
 * For more than 15 bytes, we try to align the write direction on a
 * longword boundary to get maximum speed. This is even more gruesome.
 * Unaligned read/write used requires 68020+ - think this is a problem?
 *
 * Sorry!
 */


/* ++roman: I've optimized Robert's original versions in some minor
 * aspects, e.g. moveq instead of movel, let gcc choose the registers,
 * use movem in some places...
 * For other modes than 1 plane, lots of more such assembler functions
 * were needed (e.g. the ones using movep or expanding color values).
 */

/* ++andreas: more optimizations:
   subl #65536,d0 replaced by clrw d0; subql #1,d0 for dbcc
   addal is faster than addaw
   movep is rather expensive compared to ordinary move's
   some functions rewritten in C for clarity, no speed loss */

static __inline__ void *fb_memclear_small(void *s, size_t count)
{
   if (!count)
      return(0);

   __asm__ __volatile__(
         "lsrl   #1,%1 ; jcc 1f ; moveb %2,[EMAIL PROTECTED]"
      "1: lsrl   #1,%1 ; jcc 1f ; movew %2,[EMAIL PROTECTED]"
      "1: lsrl   #1,%1 ; jcc 1f ; movel %2,[EMAIL PROTECTED]"
      "1: lsrl   #1,%1 ; jcc 1f ; movel %2,[EMAIL PROTECTED] ; movel %2,[EMAIL PROTECTED]"
      "1:"
         : "=a" (s), "=d" (count)
         : "d" (0), "0" ((char *)s+count), "1" (count)
   );
   __asm__ __volatile__(
         "subql  #1,%1 ; jcs 3f\n\t"
	 "movel %2,%%d4; movel %2,%%d5; movel %2,%%d6\n\t"
      "2: moveml %2/%%d4/%%d5/%%d6,[EMAIL PROTECTED]"
         "dbra %1,2b\n\t"
      "3:"
         : "=a" (s), "=d" (count)
         : "d" (0), "0" (s), "1" (count)
	 : "d4", "d5", "d6"
  );

   return(0);
}


static __inline__ void *fb_memclear(void *s, size_t count)
{
   if (!count)
      return(0);

   if (count < 16) {
      __asm__ __volatile__(
            "lsrl   #1,%1 ; jcc 1f ; clrb [EMAIL PROTECTED]"
         "1: lsrl   #1,%1 ; jcc 1f ; clrw [EMAIL PROTECTED]"
         "1: lsrl   #1,%1 ; jcc 1f ; clrl [EMAIL PROTECTED]"
         "1: lsrl   #1,%1 ; jcc 1f ; clrl [EMAIL PROTECTED] ; clrl [EMAIL PROTECTED]"
         "1:"
            : "=a" (s), "=d" (count)
            : "0" (s), "1" (count)
     );
   } else {
      long tmp;
      __asm__ __volatile__(
            "movel %1,%2\n\t"
            "lsrl   #1,%2 ; jcc 1f ; clrb [EMAIL PROTECTED] ; subqw #1,%1\n\t"
            "lsrl   #1,%2 ; jcs 2f\n\t"  /* %0 increased=>bit 2 switched*/
            "clrw   [EMAIL PROTECTED]  ; subqw  #2,%1 ; jra 2f\n\t"
         "1: lsrl   #1,%2 ; jcc 2f\n\t"
            "clrw   [EMAIL PROTECTED]  ; subqw  #2,%1\n\t"
         "2: movew %1,%2; lsrl #2,%1 ; jeq 6f\n\t"
            "lsrl   #1,%1 ; jcc 3f ; clrl [EMAIL PROTECTED]"
         "3: lsrl   #1,%1 ; jcc 4f ; clrl [EMAIL PROTECTED] ; clrl [EMAIL PROTECTED]"
         "4: subql  #1,%1 ; jcs 6f\n\t"
         "5: clrl [EMAIL PROTECTED]; clrl [EMAIL PROTECTED] ; clrl [EMAIL PROTECTED] ; clrl [EMAIL PROTECTED]"
            "dbra %1,5b   ; clrw %1; subql #1,%1; jcc 5b\n\t"
         "6: movew %2,%1; btst #1,%1 ; jeq 7f ; clrw [EMAIL PROTECTED]"
         "7:            ; btst #0,%1 ; jeq 8f ; clrb [EMAIL PROTECTED]"
         "8:"
            : "=a" (s), "=d" (count), "=d" (tmp)
            : "0" (s), "1" (count)
     );
   }

   return(0);
}


static __inline__ void *fb_memset255(void *s, size_t count)
{
   if (!count)
      return(0);

   __asm__ __volatile__(
         "lsrl   #1,%1 ; jcc 1f ; moveb %2,[EMAIL PROTECTED]"
      "1: lsrl   #1,%1 ; jcc 1f ; movew %2,[EMAIL PROTECTED]"
      "1: lsrl   #1,%1 ; jcc 1f ; movel %2,[EMAIL PROTECTED]"
      "1: lsrl   #1,%1 ; jcc 1f ; movel %2,[EMAIL PROTECTED] ; movel %2,[EMAIL PROTECTED]"
      "1:"
         : "=a" (s), "=d" (count)
         : "d" (-1), "0" ((char *)s+count), "1" (count)
   );
   __asm__ __volatile__(
         "subql  #1,%1 ; jcs 3f\n\t"
	 "movel %2,%%d4; movel %2,%%d5; movel %2,%%d6\n\t"
      "2: moveml %2/%%d4/%%d5/%%d6,[EMAIL PROTECTED]"
         "dbra %1,2b\n\t"
      "3:"
         : "=a" (s), "=d" (count)
         : "d" (-1), "0" (s), "1" (count)
	 : "d4", "d5", "d6"
  );

   return(0);
}


static __inline__ void *fb_memmove(void *d, const void *s, size_t count)
{
   if (d < s) {
      if (count < 16) {
         __asm__ __volatile__(
               "lsrl   #1,%2 ; jcc 1f ; moveb [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "1: lsrl   #1,%2 ; jcc 1f ; movew [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "1: lsrl   #1,%2 ; jcc 1f ; movel [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "1: lsrl   #1,%2 ; jcc 1f ; movel [EMAIL PROTECTED],[EMAIL PROTECTED] ; movel [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "1:"
               : "=a" (d), "=a" (s), "=d" (count)
               : "0" (d), "1" (s), "2" (count)
        );
      } else {
         long tmp;
         __asm__ __volatile__(
               "movel  %0,%3\n\t"
               "lsrl   #1,%3 ; jcc 1f ; moveb [EMAIL PROTECTED],[EMAIL PROTECTED] ; subqw #1,%2\n\t"
               "lsrl   #1,%3 ; jcs 2f\n\t"  /* %0 increased=>bit 2 switched*/
               "movew  [EMAIL PROTECTED],[EMAIL PROTECTED]  ; subqw  #2,%2 ; jra 2f\n\t"
            "1: lsrl   #1,%3 ; jcc 2f\n\t"
               "movew  [EMAIL PROTECTED],[EMAIL PROTECTED]  ; subqw  #2,%2\n\t"
            "2: movew  %2,%-; lsrl #2,%2 ; jeq 6f\n\t"
               "lsrl   #1,%2 ; jcc 3f ; movel [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "3: lsrl   #1,%2 ; jcc 4f ; movel [EMAIL PROTECTED],[EMAIL PROTECTED] ; movel [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "4: subql  #1,%2 ; jcs 6f\n\t"
            "5: movel  [EMAIL PROTECTED],[EMAIL PROTECTED];movel [EMAIL PROTECTED],[EMAIL PROTECTED]"
               "movel  [EMAIL PROTECTED],[EMAIL PROTECTED];movel [EMAIL PROTECTED],[EMAIL PROTECTED]"
               "dbra   %2,5b ; clrw %2; subql #1,%2; jcc 5b\n\t"
            "6: movew  %+,%2; btst #1,%2 ; jeq 7f ; movew [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "7:              ; btst #0,%2 ; jeq 8f ; moveb [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "8:"
               : "=a" (d), "=a" (s), "=d" (count), "=d" (tmp)
               : "0" (d), "1" (s), "2" (count)
        );
      }
   } else {
      if (count < 16) {
         __asm__ __volatile__(
               "lsrl   #1,%2 ; jcc 1f ; moveb [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "1: lsrl   #1,%2 ; jcc 1f ; movew [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "1: lsrl   #1,%2 ; jcc 1f ; movel [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "1: lsrl   #1,%2 ; jcc 1f ; movel [EMAIL PROTECTED],[EMAIL PROTECTED] ; movel [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "1:"
               : "=a" (d), "=a" (s), "=d" (count)
               : "0" ((char *) d + count), "1" ((char *) s + count), "2" (count)
        );
      } else {
         long tmp;
         __asm__ __volatile__(
               "movel %0,%3\n\t"
               "lsrl   #1,%3 ; jcc 1f ; moveb [EMAIL PROTECTED],[EMAIL PROTECTED] ; subqw #1,%2\n\t"
               "lsrl   #1,%3 ; jcs 2f\n\t"  /* %0 increased=>bit 2 switched*/
               "movew  [EMAIL PROTECTED],[EMAIL PROTECTED]  ; subqw  #2,%2 ; jra 2f\n\t"
            "1: lsrl   #1,%3 ; jcc 2f\n\t"
               "movew  [EMAIL PROTECTED],[EMAIL PROTECTED]  ; subqw  #2,%2\n\t"
            "2: movew %2,%-; lsrl #2,%2 ; jeq 6f\n\t"
               "lsrl   #1,%2 ; jcc 3f ; movel [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "3: lsrl   #1,%2 ; jcc 4f ; movel [EMAIL PROTECTED],[EMAIL PROTECTED] ; movel [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "4: subql  #1,%2 ; jcs 6f\n\t"
            "5: movel [EMAIL PROTECTED],[EMAIL PROTECTED];movel [EMAIL PROTECTED],[EMAIL PROTECTED]"
               "movel [EMAIL PROTECTED],[EMAIL PROTECTED];movel [EMAIL PROTECTED],[EMAIL PROTECTED]"
               "dbra %2,5b ; clrw %2; subql #1,%2; jcc 5b\n\t"
            "6: movew %+,%2; btst #1,%2 ; jeq 7f ; movew [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "7:              ; btst #0,%2 ; jeq 8f ; moveb [EMAIL PROTECTED],[EMAIL PROTECTED]"
            "8:"
               : "=a" (d), "=a" (s), "=d" (count), "=d" (tmp)
               : "0" ((char *) d + count), "1" ((char *) s + count), "2" (count)
        );
      }
   }

   return(0);
}


/* ++andreas: Simple and fast version of memmove, assumes size is
   divisible by 16, suitable for moving the whole screen bitplane */
static __inline__ void fast_memmove(char *dst, const char *src, size_t size)
{
  if (!size)
    return;
  if (dst < src)
    __asm__ __volatile__
      ("1:"
       "  moveml [EMAIL PROTECTED],%/d0/%/d1/%/a0/%/a1\n"
       "  moveml %/d0/%/d1/%/a0/%/a1,[EMAIL PROTECTED]"
       "  addql #8,%1; addql #8,%1\n"
       "  dbra %2,1b\n"
       "  clrw %2; subql #1,%2\n"
       "  jcc 1b"
       : "=a" (src), "=a" (dst), "=d" (size)
       : "0" (src), "1" (dst), "2" (size / 16 - 1)
       : "d0", "d1", "a0", "a1", "memory");
  else
    __asm__ __volatile__
      ("1:"
       "  subql #8,%0; subql #8,%0\n"
       "  moveml %0@,%/d0/%/d1/%/a0/%/a1\n"
       "  moveml %/d0/%/d1/%/a0/%/a1,[EMAIL PROTECTED]"
       "  dbra %2,1b\n"
       "  clrw %2; subql #1,%2\n"
       "  jcc 1b"
       : "=a" (src), "=a" (dst), "=d" (size)
       : "0" (src + size), "1" (dst + size), "2" (size / 16 - 1)
       : "d0", "d1", "a0", "a1", "memory");
}

#ifdef BPL

/*
 * This expands a up to 8 bit color into two longs
 * for movel operations.
 */
static const u32 four2long[] =
{
	0x00000000, 0x000000ff, 0x0000ff00, 0x0000ffff,
	0x00ff0000, 0x00ff00ff, 0x00ffff00, 0x00ffffff,
	0xff000000, 0xff0000ff, 0xff00ff00, 0xff00ffff,
	0xffff0000, 0xffff00ff, 0xffffff00, 0xffffffff,
};

static inline void expand8_col2mask(u8 c, u32 m[])
{
	m[0] = four2long[c & 15];
#if BPL > 4
	m[1] = four2long[c >> 4];
#endif
}

static inline void expand8_2col2mask(u8 fg, u8 bg, u32 fgm[], u32 bgm[])
{
	fgm[0] = four2long[fg & 15] ^ (bgm[0] = four2long[bg & 15]);
#if BPL > 4
	fgm[1] = four2long[fg >> 4] ^ (bgm[1] = four2long[bg >> 4]);
#endif
}

/*
 * set an 8bit value to a color
 */
static inline void fill8_col(u8 *dst, u32 m[])
{
	u32 tmp = m[0];
	dst[0] = tmp;
	dst[2] = (tmp >>= 8);
#if BPL > 2
	dst[4] = (tmp >>= 8);
	dst[6] = tmp >> 8;
#endif
#if BPL > 4
	tmp = m[1];
	dst[8] = tmp;
	dst[10] = (tmp >>= 8);
	dst[12] = (tmp >>= 8);
	dst[14] = tmp >> 8;
#endif
}

/*
 * set an 8bit value according to foreground/background color
 */
static inline void fill8_2col(u8 *dst, u8 fg, u8 bg, u32 mask)
{
	u32 fgm[2], bgm[2], tmp;

	expand8_2col2mask(fg, bg, fgm, bgm);

	mask |= mask << 8;
#if BPL > 2
	mask |= mask << 16;
#endif
	tmp = (mask & fgm[0]) ^ bgm[0];
	dst[0] = tmp;
	dst[2] = (tmp >>= 8);
#if BPL > 2
	dst[4] = (tmp >>= 8);
	dst[6] = tmp >> 8;
#endif
#if BPL > 4
	tmp = (mask & fgm[1]) ^ bgm[1];
	dst[8] = tmp;
	dst[10] = (tmp >>= 8);
	dst[12] = (tmp >>= 8);
	dst[14] = tmp >> 8;
#endif
}

static const u32 two2word[] =
{
	0x00000000, 0xffff0000, 0x0000ffff, 0xffffffff
};

static inline void expand16_col2mask(u8 c, u32 m[])
{
	m[0] = two2word[c & 3];
#if BPL > 2
	m[1] = two2word[(c >> 2) & 3];
#endif
#if BPL > 4
	m[2] = two2word[(c >> 4) & 3];
	m[3] = two2word[c >> 6];
#endif
}

static inline void expand16_2col2mask(u8 fg, u8 bg, u32 fgm[], u32 bgm[])
{
	bgm[0] = two2word[bg & 3];
	fgm[0] = two2word[fg & 3] ^ bgm[0];
#if BPL > 2
	bgm[1] = two2word[(bg >> 2) & 3];
	fgm[1] = two2word[(fg >> 2) & 3] ^ bgm[1];
#endif
#if BPL > 4
	bgm[2] = two2word[(bg >> 4) & 3];
	fgm[2] = two2word[(fg >> 4) & 3] ^ bgm[2];
	bgm[3] = two2word[bg >> 6];
	fgm[3] = two2word[fg >> 6] ^ bgm[3];
#endif
}

static inline u32 *fill16_col(u32 *dst, int rows, u32 m[])
{
	while (rows) {
		*dst++ = m[0];
#if BPL > 2
		*dst++ = m[1];
#endif
#if BPL > 4
		*dst++ = m[2];
		*dst++ = m[3];
#endif
		rows--;
	}
	return dst;
}

static inline void memmove32_col(void *dst, void *src, u32 mask, u32 h, u32 bytes)
{
	u32 *s, *d, v;

        s = src;
        d = dst;
        do {
                v = (*s++ & mask) | (*d  & ~mask);
                *d++ = v;
#if BPL > 2
                v = (*s++ & mask) | (*d  & ~mask);
                *d++ = v;
#endif
#if BPL > 4
                v = (*s++ & mask) | (*d  & ~mask);
                *d++ = v;
                v = (*s++ & mask) | (*d  & ~mask);
                *d++ = v;
#endif
                d = (u32 *)((u8 *)d + bytes);
                s = (u32 *)((u8 *)s + bytes);
        } while (--h);
}

#endif

#endif /* _VIDEO_ATAFB_UTILS_H */

Reply via email to