cscg22-gearboy

CSCG 2022 Challenge 'Gearboy'
git clone https://git.sinitax.com/sinitax/cscg22-gearboy
Log | Files | Refs | sfeed.txt

SDL_blit_A.c (43167B)


      1/*
      2  Simple DirectMedia Layer
      3  Copyright (C) 1997-2014 Sam Lantinga <slouken@libsdl.org>
      4
      5  This software is provided 'as-is', without any express or implied
      6  warranty.  In no event will the authors be held liable for any damages
      7  arising from the use of this software.
      8
      9  Permission is granted to anyone to use this software for any purpose,
     10  including commercial applications, and to alter it and redistribute it
     11  freely, subject to the following restrictions:
     12
     13  1. The origin of this software must not be misrepresented; you must not
     14     claim that you wrote the original software. If you use this software
     15     in a product, an acknowledgment in the product documentation would be
     16     appreciated but is not required.
     17  2. Altered source versions must be plainly marked as such, and must not be
     18     misrepresented as being the original software.
     19  3. This notice may not be removed or altered from any source distribution.
     20*/
     21#include "../SDL_internal.h"
     22
     23#include "SDL_video.h"
     24#include "SDL_blit.h"
     25
     26/* Functions to perform alpha blended blitting */
     27
     28/* N->1 blending with per-surface alpha */
     29static void
     30BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
     31{
     32    int width = info->dst_w;
     33    int height = info->dst_h;
     34    Uint8 *src = info->src;
     35    int srcskip = info->src_skip;
     36    Uint8 *dst = info->dst;
     37    int dstskip = info->dst_skip;
     38    Uint8 *palmap = info->table;
     39    SDL_PixelFormat *srcfmt = info->src_fmt;
     40    SDL_PixelFormat *dstfmt = info->dst_fmt;
     41    int srcbpp = srcfmt->BytesPerPixel;
     42    Uint32 Pixel;
     43    unsigned sR, sG, sB;
     44    unsigned dR, dG, dB;
     45    const unsigned A = info->a;
     46
     47    while (height--) {
     48	    /* *INDENT-OFF* */
     49	    DUFFS_LOOP4(
     50	    {
     51		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
     52		dR = dstfmt->palette->colors[*dst].r;
     53		dG = dstfmt->palette->colors[*dst].g;
     54		dB = dstfmt->palette->colors[*dst].b;
     55		ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
     56		dR &= 0xff;
     57		dG &= 0xff;
     58		dB &= 0xff;
     59		/* Pack RGB into 8bit pixel */
     60		if ( palmap == NULL ) {
     61		    *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
     62		} else {
     63		    *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
     64		}
     65		dst++;
     66		src += srcbpp;
     67	    },
     68	    width);
     69	    /* *INDENT-ON* */
     70        src += srcskip;
     71        dst += dstskip;
     72    }
     73}
     74
     75/* N->1 blending with pixel alpha */
     76static void
     77BlitNto1PixelAlpha(SDL_BlitInfo * info)
     78{
     79    int width = info->dst_w;
     80    int height = info->dst_h;
     81    Uint8 *src = info->src;
     82    int srcskip = info->src_skip;
     83    Uint8 *dst = info->dst;
     84    int dstskip = info->dst_skip;
     85    Uint8 *palmap = info->table;
     86    SDL_PixelFormat *srcfmt = info->src_fmt;
     87    SDL_PixelFormat *dstfmt = info->dst_fmt;
     88    int srcbpp = srcfmt->BytesPerPixel;
     89    Uint32 Pixel;
     90    unsigned sR, sG, sB, sA;
     91    unsigned dR, dG, dB;
     92
     93    while (height--) {
     94	    /* *INDENT-OFF* */
     95	    DUFFS_LOOP4(
     96	    {
     97		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
     98		dR = dstfmt->palette->colors[*dst].r;
     99		dG = dstfmt->palette->colors[*dst].g;
    100		dB = dstfmt->palette->colors[*dst].b;
    101		ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
    102		dR &= 0xff;
    103		dG &= 0xff;
    104		dB &= 0xff;
    105		/* Pack RGB into 8bit pixel */
    106		if ( palmap == NULL ) {
    107		    *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
    108		} else {
    109		    *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
    110		}
    111		dst++;
    112		src += srcbpp;
    113	    },
    114	    width);
    115	    /* *INDENT-ON* */
    116        src += srcskip;
    117        dst += dstskip;
    118    }
    119}
    120
    121/* colorkeyed N->1 blending with per-surface alpha */
    122static void
    123BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
    124{
    125    int width = info->dst_w;
    126    int height = info->dst_h;
    127    Uint8 *src = info->src;
    128    int srcskip = info->src_skip;
    129    Uint8 *dst = info->dst;
    130    int dstskip = info->dst_skip;
    131    Uint8 *palmap = info->table;
    132    SDL_PixelFormat *srcfmt = info->src_fmt;
    133    SDL_PixelFormat *dstfmt = info->dst_fmt;
    134    int srcbpp = srcfmt->BytesPerPixel;
    135    Uint32 ckey = info->colorkey;
    136    Uint32 Pixel;
    137    unsigned sR, sG, sB;
    138    unsigned dR, dG, dB;
    139    const unsigned A = info->a;
    140
    141    while (height--) {
    142	    /* *INDENT-OFF* */
    143	    DUFFS_LOOP(
    144	    {
    145		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    146		if ( Pixel != ckey ) {
    147		    dR = dstfmt->palette->colors[*dst].r;
    148		    dG = dstfmt->palette->colors[*dst].g;
    149		    dB = dstfmt->palette->colors[*dst].b;
    150		    ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
    151		    dR &= 0xff;
    152		    dG &= 0xff;
    153		    dB &= 0xff;
    154		    /* Pack RGB into 8bit pixel */
    155		    if ( palmap == NULL ) {
    156                *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
    157		    } else {
    158                *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
    159		    }
    160		}
    161		dst++;
    162		src += srcbpp;
    163	    },
    164	    width);
    165	    /* *INDENT-ON* */
    166        src += srcskip;
    167        dst += dstskip;
    168    }
    169}
    170
    171#ifdef __MMX__
    172
    173/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
    174static void
    175BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
    176{
    177    int width = info->dst_w;
    178    int height = info->dst_h;
    179    Uint32 *srcp = (Uint32 *) info->src;
    180    int srcskip = info->src_skip >> 2;
    181    Uint32 *dstp = (Uint32 *) info->dst;
    182    int dstskip = info->dst_skip >> 2;
    183    Uint32 dalpha = info->dst_fmt->Amask;
    184
    185    __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
    186
    187    hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
    188    lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
    189    dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
    190
    191    while (height--) {
    192        int n = width;
    193        if (n & 1) {
    194            Uint32 s = *srcp++;
    195            Uint32 d = *dstp;
    196            *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
    197                       + (s & d & 0x00010101)) | dalpha;
    198            n--;
    199        }
    200
    201        for (n >>= 1; n > 0; --n) {
    202            dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
    203            dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
    204
    205            src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
    206            src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
    207
    208            dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
    209            src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
    210            src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
    211            src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
    212
    213            dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
    214            dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
    215            dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
    216            dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
    217
    218            *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
    219            dstp += 2;
    220            srcp += 2;
    221        }
    222
    223        srcp += srcskip;
    224        dstp += dstskip;
    225    }
    226    _mm_empty();
    227}
    228
    229/* fast RGB888->(A)RGB888 blending with surface alpha */
    230static void
    231BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
    232{
    233    SDL_PixelFormat *df = info->dst_fmt;
    234    Uint32 chanmask;
    235    unsigned alpha = info->a;
    236
    237    if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
    238        /* only call a128 version when R,G,B occupy lower bits */
    239        BlitRGBtoRGBSurfaceAlpha128MMX(info);
    240    } else {
    241        int width = info->dst_w;
    242        int height = info->dst_h;
    243        Uint32 *srcp = (Uint32 *) info->src;
    244        int srcskip = info->src_skip >> 2;
    245        Uint32 *dstp = (Uint32 *) info->dst;
    246        int dstskip = info->dst_skip >> 2;
    247        Uint32 dalpha = df->Amask;
    248        Uint32 amult;
    249
    250        __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
    251
    252        mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
    253        /* form the alpha mult */
    254        amult = alpha | (alpha << 8);
    255        amult = amult | (amult << 16);
    256        chanmask =
    257            (0xff << df->Rshift) | (0xff << df->
    258                                    Gshift) | (0xff << df->Bshift);
    259        mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
    260        mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
    261        /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
    262        dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
    263
    264        while (height--) {
    265            int n = width;
    266            if (n & 1) {
    267                /* One Pixel Blend */
    268                src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
    269                src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
    270
    271                dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
    272                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
    273
    274                src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
    275                src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
    276                src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
    277                dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
    278
    279                dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
    280                dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
    281                *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
    282
    283                ++srcp;
    284                ++dstp;
    285
    286                n--;
    287            }
    288
    289            for (n >>= 1; n > 0; --n) {
    290                /* Two Pixels Blend */
    291                src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
    292                src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
    293                src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
    294                src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
    295
    296                dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
    297                dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
    298                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
    299                dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
    300
    301                src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
    302                src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
    303                src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
    304                dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
    305
    306                src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
    307                src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
    308                src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
    309                dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
    310
    311                dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
    312                dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
    313
    314                *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
    315
    316                srcp += 2;
    317                dstp += 2;
    318            }
    319            srcp += srcskip;
    320            dstp += dstskip;
    321        }
    322        _mm_empty();
    323    }
    324}
    325
    326/* fast ARGB888->(A)RGB888 blending with pixel alpha */
    327static void
    328BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
    329{
    330    int width = info->dst_w;
    331    int height = info->dst_h;
    332    Uint32 *srcp = (Uint32 *) info->src;
    333    int srcskip = info->src_skip >> 2;
    334    Uint32 *dstp = (Uint32 *) info->dst;
    335    int dstskip = info->dst_skip >> 2;
    336    SDL_PixelFormat *sf = info->src_fmt;
    337    Uint32 amask = sf->Amask;
    338    Uint32 ashift = sf->Ashift;
    339    Uint64 multmask, multmask2;
    340
    341    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
    342
    343    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
    344    multmask = 0x00FF;
    345	multmask <<= (ashift * 2);
    346	multmask2 = 0x00FF00FF00FF00FFULL;
    347
    348    while (height--) {
    349		/* *INDENT-OFF* */
    350		DUFFS_LOOP4({
    351		Uint32 alpha = *srcp & amask;
    352		if (alpha == 0) {
    353			/* do nothing */
    354		} else if (alpha == amask) {
    355			*dstp = *srcp;
    356		} else {
    357			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
    358			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
    359
    360			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
    361			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
    362
    363			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
    364			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
    365			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
    366			mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
    367			mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);	/* 0F0A0A0A -> mm_alpha */
    368			mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);	/* 255 - mm_alpha -> mm_alpha */
    369
    370			/* blend */		    
    371			src1 = _mm_mullo_pi16(src1, mm_alpha);
    372			src1 = _mm_srli_pi16(src1, 8);
    373			dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
    374			dst1 = _mm_srli_pi16(dst1, 8);
    375			dst1 = _mm_add_pi16(src1, dst1);
    376			dst1 = _mm_packs_pu16(dst1, mm_zero);
    377			
    378			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
    379		}
    380		++srcp;
    381		++dstp;
    382	    }, width);
    383		/* *INDENT-ON* */
    384        srcp += srcskip;
    385        dstp += dstskip;
    386    }
    387    _mm_empty();
    388}
    389
    390#endif /* __MMX__ */
    391
    392/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
    393static void
    394BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
    395{
    396    int width = info->dst_w;
    397    int height = info->dst_h;
    398    Uint32 *srcp = (Uint32 *) info->src;
    399    int srcskip = info->src_skip >> 2;
    400    Uint32 *dstp = (Uint32 *) info->dst;
    401    int dstskip = info->dst_skip >> 2;
    402
    403    while (height--) {
    404	    /* *INDENT-OFF* */
    405	    DUFFS_LOOP4({
    406		    Uint32 s = *srcp++;
    407		    Uint32 d = *dstp;
    408		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
    409			       + (s & d & 0x00010101)) | 0xff000000;
    410	    }, width);
    411	    /* *INDENT-ON* */
    412        srcp += srcskip;
    413        dstp += dstskip;
    414    }
    415}
    416
    417/* fast RGB888->(A)RGB888 blending with surface alpha */
    418static void
    419BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
    420{
    421    unsigned alpha = info->a;
    422    if (alpha == 128) {
    423        BlitRGBtoRGBSurfaceAlpha128(info);
    424    } else {
    425        int width = info->dst_w;
    426        int height = info->dst_h;
    427        Uint32 *srcp = (Uint32 *) info->src;
    428        int srcskip = info->src_skip >> 2;
    429        Uint32 *dstp = (Uint32 *) info->dst;
    430        int dstskip = info->dst_skip >> 2;
    431        Uint32 s;
    432        Uint32 d;
    433        Uint32 s1;
    434        Uint32 d1;
    435
    436        while (height--) {
    437			/* *INDENT-OFF* */
    438			DUFFS_LOOP4({
    439				s = *srcp;
    440				d = *dstp;
    441				s1 = s & 0xff00ff;
    442				d1 = d & 0xff00ff;
    443				d1 = (d1 + ((s1 - d1) * alpha >> 8))
    444				     & 0xff00ff;
    445				s &= 0xff00;
    446				d &= 0xff00;
    447				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
    448				*dstp = d1 | d | 0xff000000;
    449				++srcp;
    450				++dstp;
    451			}, width);
    452			/* *INDENT-ON* */
    453            srcp += srcskip;
    454            dstp += dstskip;
    455        }
    456    }
    457}
    458
    459/* fast ARGB888->(A)RGB888 blending with pixel alpha */
    460static void
    461BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
    462{
    463    int width = info->dst_w;
    464    int height = info->dst_h;
    465    Uint32 *srcp = (Uint32 *) info->src;
    466    int srcskip = info->src_skip >> 2;
    467    Uint32 *dstp = (Uint32 *) info->dst;
    468    int dstskip = info->dst_skip >> 2;
    469
    470    while (height--) {
    471	    /* *INDENT-OFF* */
    472	    DUFFS_LOOP4({
    473		Uint32 dalpha;
    474		Uint32 d;
    475		Uint32 s1;
    476		Uint32 d1;
    477		Uint32 s = *srcp;
    478		Uint32 alpha = s >> 24;
    479		/* FIXME: Here we special-case opaque alpha since the
    480		   compositioning used (>>8 instead of /255) doesn't handle
    481		   it correctly. Also special-case alpha=0 for speed?
    482		   Benchmark this! */
    483		if (alpha) {
    484		  if (alpha == SDL_ALPHA_OPAQUE) {
    485			  *dstp = *srcp;
    486		  } else {
    487		    /*
    488		     * take out the middle component (green), and process
    489		     * the other two in parallel. One multiply less.
    490		     */
    491		    d = *dstp;
    492			dalpha = d >> 24;
    493		    s1 = s & 0xff00ff;
    494		    d1 = d & 0xff00ff;
    495		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
    496		    s &= 0xff00;
    497		    d &= 0xff00;
    498		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
    499			dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
    500		    *dstp = d1 | d | (dalpha << 24);
    501		  }
    502		}
    503		++srcp;
    504		++dstp;
    505	    }, width);
    506	    /* *INDENT-ON* */
    507        srcp += srcskip;
    508        dstp += dstskip;
    509    }
    510}
    511
    512#ifdef __3dNOW__
    513/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
    514static void
    515BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
    516{
    517    int width = info->dst_w;
    518    int height = info->dst_h;
    519    Uint32 *srcp = (Uint32 *) info->src;
    520    int srcskip = info->src_skip >> 2;
    521    Uint32 *dstp = (Uint32 *) info->dst;
    522    int dstskip = info->dst_skip >> 2;
    523    SDL_PixelFormat *sf = info->src_fmt;
    524    Uint32 amask = sf->Amask;
    525    Uint32 ashift = sf->Ashift;
    526    Uint64 multmask, multmask2;
    527
    528    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
    529
    530    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
    531    multmask = 0x00FF;
    532    multmask <<= (ashift * 2);
    533    multmask2 = 0x00FF00FF00FF00FFULL;
    534
    535    while (height--) {
    536	    /* *INDENT-OFF* */
    537	    DUFFS_LOOP4({
    538		Uint32 alpha;
    539
    540		_m_prefetch(srcp + 16);
    541		_m_prefetch(dstp + 16);
    542
    543		alpha = *srcp & amask;
    544		if (alpha == 0) {
    545			/* do nothing */
    546		} else if (alpha == amask) {
    547			*dstp = *srcp;
    548		} else {
    549			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
    550			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
    551
    552			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
    553			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
    554
    555			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
    556			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
    557			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
    558			mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
    559			mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);	/* 0F0A0A0A -> mm_alpha */
    560			mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);	/* 255 - mm_alpha -> mm_alpha */
    561
    562
    563			/* blend */		    
    564			src1 = _mm_mullo_pi16(src1, mm_alpha);
    565			src1 = _mm_srli_pi16(src1, 8);
    566			dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
    567			dst1 = _mm_srli_pi16(dst1, 8);
    568			dst1 = _mm_add_pi16(src1, dst1);
    569			dst1 = _mm_packs_pu16(dst1, mm_zero);
    570			
    571			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
    572		}
    573		++srcp;
    574		++dstp;
    575	    }, width);
    576	    /* *INDENT-ON* */
    577        srcp += srcskip;
    578        dstp += dstskip;
    579    }
    580    _mm_empty();
    581}
    582
    583#endif /* __3dNOW__ */
    584
    585/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
    586
    587/* blend a single 16 bit pixel at 50% */
    588#define BLEND16_50(d, s, mask)						\
    589	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
    590
    591/* blend two 16 bit pixels at 50% */
    592#define BLEND2x16_50(d, s, mask)					     \
    593	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
    594	 + (s & d & (~(mask | mask << 16))))
    595
    596static void
    597Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
    598{
    599    int width = info->dst_w;
    600    int height = info->dst_h;
    601    Uint16 *srcp = (Uint16 *) info->src;
    602    int srcskip = info->src_skip >> 1;
    603    Uint16 *dstp = (Uint16 *) info->dst;
    604    int dstskip = info->dst_skip >> 1;
    605
    606    while (height--) {
    607        if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
    608            /*
    609             * Source and destination not aligned, pipeline it.
    610             * This is mostly a win for big blits but no loss for
    611             * small ones
    612             */
    613            Uint32 prev_sw;
    614            int w = width;
    615
    616            /* handle odd destination */
    617            if ((uintptr_t) dstp & 2) {
    618                Uint16 d = *dstp, s = *srcp;
    619                *dstp = BLEND16_50(d, s, mask);
    620                dstp++;
    621                srcp++;
    622                w--;
    623            }
    624            srcp++;             /* srcp is now 32-bit aligned */
    625
    626            /* bootstrap pipeline with first halfword */
    627            prev_sw = ((Uint32 *) srcp)[-1];
    628
    629            while (w > 1) {
    630                Uint32 sw, dw, s;
    631                sw = *(Uint32 *) srcp;
    632                dw = *(Uint32 *) dstp;
    633#if SDL_BYTEORDER == SDL_BIG_ENDIAN
    634                s = (prev_sw << 16) + (sw >> 16);
    635#else
    636                s = (prev_sw >> 16) + (sw << 16);
    637#endif
    638                prev_sw = sw;
    639                *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
    640                dstp += 2;
    641                srcp += 2;
    642                w -= 2;
    643            }
    644
    645            /* final pixel if any */
    646            if (w) {
    647                Uint16 d = *dstp, s;
    648#if SDL_BYTEORDER == SDL_BIG_ENDIAN
    649                s = (Uint16) prev_sw;
    650#else
    651                s = (Uint16) (prev_sw >> 16);
    652#endif
    653                *dstp = BLEND16_50(d, s, mask);
    654                srcp++;
    655                dstp++;
    656            }
    657            srcp += srcskip - 1;
    658            dstp += dstskip;
    659        } else {
    660            /* source and destination are aligned */
    661            int w = width;
    662
    663            /* first odd pixel? */
    664            if ((uintptr_t) srcp & 2) {
    665                Uint16 d = *dstp, s = *srcp;
    666                *dstp = BLEND16_50(d, s, mask);
    667                srcp++;
    668                dstp++;
    669                w--;
    670            }
    671            /* srcp and dstp are now 32-bit aligned */
    672
    673            while (w > 1) {
    674                Uint32 sw = *(Uint32 *) srcp;
    675                Uint32 dw = *(Uint32 *) dstp;
    676                *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
    677                srcp += 2;
    678                dstp += 2;
    679                w -= 2;
    680            }
    681
    682            /* last odd pixel? */
    683            if (w) {
    684                Uint16 d = *dstp, s = *srcp;
    685                *dstp = BLEND16_50(d, s, mask);
    686                srcp++;
    687                dstp++;
    688            }
    689            srcp += srcskip;
    690            dstp += dstskip;
    691        }
    692    }
    693}
    694
    695#ifdef __MMX__
    696
    697/* fast RGB565->RGB565 blending with surface alpha */
    698static void
    699Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
    700{
    701    unsigned alpha = info->a;
    702    if (alpha == 128) {
    703        Blit16to16SurfaceAlpha128(info, 0xf7de);
    704    } else {
    705        int width = info->dst_w;
    706        int height = info->dst_h;
    707        Uint16 *srcp = (Uint16 *) info->src;
    708        int srcskip = info->src_skip >> 1;
    709        Uint16 *dstp = (Uint16 *) info->dst;
    710        int dstskip = info->dst_skip >> 1;
    711        Uint32 s, d;
    712
    713        __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
    714
    715        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
    716        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
    717        alpha >>= 3;            /* downscale alpha to 5 bits */
    718
    719        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
    720        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
    721        /* position alpha to allow for mullo and mulhi on diff channels
    722           to reduce the number of operations */
    723        mm_alpha = _mm_slli_si64(mm_alpha, 3);
    724
    725        /* Setup the 565 color channel masks */
    726        gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
    727        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
    728
    729        while (height--) {
    730			/* *INDENT-OFF* */
    731			DUFFS_LOOP_124(
    732			{
    733				s = *srcp++;
    734				d = *dstp;
    735				/*
    736				 * shift out the middle component (green) to
    737				 * the high 16 bits, and process all three RGB
    738				 * components at the same time.
    739				 */
    740				s = (s | s << 16) & 0x07e0f81f;
    741				d = (d | d << 16) & 0x07e0f81f;
    742				d += (s - d) * alpha >> 5;
    743				d &= 0x07e0f81f;
    744				*dstp++ = (Uint16)(d | d >> 16);
    745			},{
    746				s = *srcp++;
    747				d = *dstp;
    748				/*
    749				 * shift out the middle component (green) to
    750				 * the high 16 bits, and process all three RGB
    751				 * components at the same time.
    752				 */
    753				s = (s | s << 16) & 0x07e0f81f;
    754				d = (d | d << 16) & 0x07e0f81f;
    755				d += (s - d) * alpha >> 5;
    756				d &= 0x07e0f81f;
    757				*dstp++ = (Uint16)(d | d >> 16);
    758				s = *srcp++;
    759				d = *dstp;
    760				/*
    761				 * shift out the middle component (green) to
    762				 * the high 16 bits, and process all three RGB
    763				 * components at the same time.
    764				 */
    765				s = (s | s << 16) & 0x07e0f81f;
    766				d = (d | d << 16) & 0x07e0f81f;
    767				d += (s - d) * alpha >> 5;
    768				d &= 0x07e0f81f;
    769				*dstp++ = (Uint16)(d | d >> 16);
    770			},{
    771				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
    772				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
    773
    774				/* red */
    775				src2 = src1;
    776				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
    777
    778				dst2 = dst1;
    779				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
    780
    781				/* blend */
    782				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
    783				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
    784				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
    785				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
    786				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
    787
    788				mm_res = dst2; /* RED -> mm_res */
    789
    790				/* green -- process the bits in place */
    791				src2 = src1;
    792				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
    793
    794				dst2 = dst1;
    795				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
    796
    797				/* blend */
    798				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
    799				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
    800				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
    801				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
    802
    803				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
    804
    805				/* blue */
    806				src2 = src1;
    807				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
    808
    809				dst2 = dst1;
    810				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
    811
    812				/* blend */
    813				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
    814				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
    815				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
    816				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
    817				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
    818
    819				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
    820
    821				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
    822
    823				srcp += 4;
    824				dstp += 4;
    825			}, width);
    826			/* *INDENT-ON* */
    827            srcp += srcskip;
    828            dstp += dstskip;
    829        }
    830        _mm_empty();
    831    }
    832}
    833
    834/* fast RGB555->RGB555 blending with surface alpha */
    835static void
    836Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
    837{
    838    unsigned alpha = info->a;
    839    if (alpha == 128) {
    840        Blit16to16SurfaceAlpha128(info, 0xfbde);
    841    } else {
    842        int width = info->dst_w;
    843        int height = info->dst_h;
    844        Uint16 *srcp = (Uint16 *) info->src;
    845        int srcskip = info->src_skip >> 1;
    846        Uint16 *dstp = (Uint16 *) info->dst;
    847        int dstskip = info->dst_skip >> 1;
    848        Uint32 s, d;
    849
    850        __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
    851
    852        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
    853        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
    854        alpha >>= 3;            /* downscale alpha to 5 bits */
    855
    856        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
    857        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
    858        /* position alpha to allow for mullo and mulhi on diff channels
    859           to reduce the number of operations */
    860        mm_alpha = _mm_slli_si64(mm_alpha, 3);
    861
    862        /* Setup the 555 color channel masks */
    863        rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
    864        gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
    865        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
    866
    867        while (height--) {
    868			/* *INDENT-OFF* */
    869			DUFFS_LOOP_124(
    870			{
    871				s = *srcp++;
    872				d = *dstp;
    873				/*
    874				 * shift out the middle component (green) to
    875				 * the high 16 bits, and process all three RGB
    876				 * components at the same time.
    877				 */
    878				s = (s | s << 16) & 0x03e07c1f;
    879				d = (d | d << 16) & 0x03e07c1f;
    880				d += (s - d) * alpha >> 5;
    881				d &= 0x03e07c1f;
    882				*dstp++ = (Uint16)(d | d >> 16);
    883			},{
    884				s = *srcp++;
    885				d = *dstp;
    886				/*
    887				 * shift out the middle component (green) to
    888				 * the high 16 bits, and process all three RGB
    889				 * components at the same time.
    890				 */
    891				s = (s | s << 16) & 0x03e07c1f;
    892				d = (d | d << 16) & 0x03e07c1f;
    893				d += (s - d) * alpha >> 5;
    894				d &= 0x03e07c1f;
    895				*dstp++ = (Uint16)(d | d >> 16);
    896			        s = *srcp++;
    897				d = *dstp;
    898				/*
    899				 * shift out the middle component (green) to
    900				 * the high 16 bits, and process all three RGB
    901				 * components at the same time.
    902				 */
    903				s = (s | s << 16) & 0x03e07c1f;
    904				d = (d | d << 16) & 0x03e07c1f;
    905				d += (s - d) * alpha >> 5;
    906				d &= 0x03e07c1f;
    907				*dstp++ = (Uint16)(d | d >> 16);
    908			},{
    909				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
    910				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
    911
    912				/* red -- process the bits in place */
    913				src2 = src1;
    914				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
    915
    916				dst2 = dst1;
    917				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
    918
    919				/* blend */
    920				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
    921				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
    922				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
    923				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
    924				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
    925
    926				mm_res = dst2; /* RED -> mm_res */
    927				
    928				/* green -- process the bits in place */
    929				src2 = src1;
    930				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
    931
    932				dst2 = dst1;
    933				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
    934
    935				/* blend */
    936				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
    937				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
    938				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
    939				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
    940
    941				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
    942
    943				/* blue */
    944				src2 = src1; /* src -> src2 */
    945				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
    946
    947				dst2 = dst1; /* dst -> dst2 */
    948				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
    949
    950				/* blend */
    951				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
    952				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
    953				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
    954				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
    955				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
    956
    957				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
    958
    959				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
    960
    961				srcp += 4;
    962				dstp += 4;
    963			}, width);
    964			/* *INDENT-ON* */
    965            srcp += srcskip;
    966            dstp += dstskip;
    967        }
    968        _mm_empty();
    969    }
    970}
    971
    972#endif /* __MMX__ */
    973
    974/* fast RGB565->RGB565 blending with surface alpha */
    975static void
    976Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
    977{
    978    unsigned alpha = info->a;
    979    if (alpha == 128) {
    980        Blit16to16SurfaceAlpha128(info, 0xf7de);
    981    } else {
    982        int width = info->dst_w;
    983        int height = info->dst_h;
    984        Uint16 *srcp = (Uint16 *) info->src;
    985        int srcskip = info->src_skip >> 1;
    986        Uint16 *dstp = (Uint16 *) info->dst;
    987        int dstskip = info->dst_skip >> 1;
    988        alpha >>= 3;            /* downscale alpha to 5 bits */
    989
    990        while (height--) {
    991			/* *INDENT-OFF* */
    992			DUFFS_LOOP4({
    993				Uint32 s = *srcp++;
    994				Uint32 d = *dstp;
    995				/*
    996				 * shift out the middle component (green) to
    997				 * the high 16 bits, and process all three RGB
    998				 * components at the same time.
    999				 */
   1000				s = (s | s << 16) & 0x07e0f81f;
   1001				d = (d | d << 16) & 0x07e0f81f;
   1002				d += (s - d) * alpha >> 5;
   1003				d &= 0x07e0f81f;
   1004				*dstp++ = (Uint16)(d | d >> 16);
   1005			}, width);
   1006			/* *INDENT-ON* */
   1007            srcp += srcskip;
   1008            dstp += dstskip;
   1009        }
   1010    }
   1011}
   1012
   1013/* fast RGB555->RGB555 blending with surface alpha */
   1014static void
   1015Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
   1016{
   1017    unsigned alpha = info->a;   /* downscale alpha to 5 bits */
   1018    if (alpha == 128) {
   1019        Blit16to16SurfaceAlpha128(info, 0xfbde);
   1020    } else {
   1021        int width = info->dst_w;
   1022        int height = info->dst_h;
   1023        Uint16 *srcp = (Uint16 *) info->src;
   1024        int srcskip = info->src_skip >> 1;
   1025        Uint16 *dstp = (Uint16 *) info->dst;
   1026        int dstskip = info->dst_skip >> 1;
   1027        alpha >>= 3;            /* downscale alpha to 5 bits */
   1028
   1029        while (height--) {
   1030			/* *INDENT-OFF* */
   1031			DUFFS_LOOP4({
   1032				Uint32 s = *srcp++;
   1033				Uint32 d = *dstp;
   1034				/*
   1035				 * shift out the middle component (green) to
   1036				 * the high 16 bits, and process all three RGB
   1037				 * components at the same time.
   1038				 */
   1039				s = (s | s << 16) & 0x03e07c1f;
   1040				d = (d | d << 16) & 0x03e07c1f;
   1041				d += (s - d) * alpha >> 5;
   1042				d &= 0x03e07c1f;
   1043				*dstp++ = (Uint16)(d | d >> 16);
   1044			}, width);
   1045			/* *INDENT-ON* */
   1046            srcp += srcskip;
   1047            dstp += dstskip;
   1048        }
   1049    }
   1050}
   1051
   1052/* fast ARGB8888->RGB565 blending with pixel alpha */
   1053static void
   1054BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
   1055{
   1056    int width = info->dst_w;
   1057    int height = info->dst_h;
   1058    Uint32 *srcp = (Uint32 *) info->src;
   1059    int srcskip = info->src_skip >> 2;
   1060    Uint16 *dstp = (Uint16 *) info->dst;
   1061    int dstskip = info->dst_skip >> 1;
   1062
   1063    while (height--) {
   1064	    /* *INDENT-OFF* */
   1065	    DUFFS_LOOP4({
   1066		Uint32 s = *srcp;
   1067		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
   1068		/* FIXME: Here we special-case opaque alpha since the
   1069		   compositioning used (>>8 instead of /255) doesn't handle
   1070		   it correctly. Also special-case alpha=0 for speed?
   1071		   Benchmark this! */
   1072		if(alpha) {   
   1073		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
   1074		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
   1075		  } else {
   1076		    Uint32 d = *dstp;
   1077		    /*
   1078		     * convert source and destination to G0RAB65565
   1079		     * and blend all components at the same time
   1080		     */
   1081		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
   1082		      + (s >> 3 & 0x1f);
   1083		    d = (d | d << 16) & 0x07e0f81f;
   1084		    d += (s - d) * alpha >> 5;
   1085		    d &= 0x07e0f81f;
   1086		    *dstp = (Uint16)(d | d >> 16);
   1087		  }
   1088		}
   1089		srcp++;
   1090		dstp++;
   1091	    }, width);
   1092	    /* *INDENT-ON* */
   1093        srcp += srcskip;
   1094        dstp += dstskip;
   1095    }
   1096}
   1097
   1098/* fast ARGB8888->RGB555 blending with pixel alpha */
   1099static void
   1100BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
   1101{
   1102    int width = info->dst_w;
   1103    int height = info->dst_h;
   1104    Uint32 *srcp = (Uint32 *) info->src;
   1105    int srcskip = info->src_skip >> 2;
   1106    Uint16 *dstp = (Uint16 *) info->dst;
   1107    int dstskip = info->dst_skip >> 1;
   1108
   1109    while (height--) {
   1110	    /* *INDENT-OFF* */
   1111	    DUFFS_LOOP4({
   1112		unsigned alpha;
   1113		Uint32 s = *srcp;
   1114		alpha = s >> 27; /* downscale alpha to 5 bits */
   1115		/* FIXME: Here we special-case opaque alpha since the
   1116		   compositioning used (>>8 instead of /255) doesn't handle
   1117		   it correctly. Also special-case alpha=0 for speed?
   1118		   Benchmark this! */
   1119		if(alpha) {   
   1120		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
   1121		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
   1122		  } else {
   1123		    Uint32 d = *dstp;
   1124		    /*
   1125		     * convert source and destination to G0RAB65565
   1126		     * and blend all components at the same time
   1127		     */
   1128		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
   1129		      + (s >> 3 & 0x1f);
   1130		    d = (d | d << 16) & 0x03e07c1f;
   1131		    d += (s - d) * alpha >> 5;
   1132		    d &= 0x03e07c1f;
   1133		    *dstp = (Uint16)(d | d >> 16);
   1134		  }
   1135		}
   1136		srcp++;
   1137		dstp++;
   1138	    }, width);
   1139	    /* *INDENT-ON* */
   1140        srcp += srcskip;
   1141        dstp += dstskip;
   1142    }
   1143}
   1144
   1145/* General (slow) N->N blending with per-surface alpha */
   1146static void
   1147BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
   1148{
   1149    int width = info->dst_w;
   1150    int height = info->dst_h;
   1151    Uint8 *src = info->src;
   1152    int srcskip = info->src_skip;
   1153    Uint8 *dst = info->dst;
   1154    int dstskip = info->dst_skip;
   1155    SDL_PixelFormat *srcfmt = info->src_fmt;
   1156    SDL_PixelFormat *dstfmt = info->dst_fmt;
   1157    int srcbpp = srcfmt->BytesPerPixel;
   1158    int dstbpp = dstfmt->BytesPerPixel;
   1159    Uint32 Pixel;
   1160    unsigned sR, sG, sB;
   1161    unsigned dR, dG, dB, dA;
   1162    const unsigned sA = info->a;
   1163
   1164    if (sA) {
   1165        while (height--) {
   1166	    /* *INDENT-OFF* */
   1167	    DUFFS_LOOP4(
   1168	    {
   1169		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   1170		DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
   1171		ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
   1172		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
   1173		src += srcbpp;
   1174		dst += dstbpp;
   1175	    },
   1176	    width);
   1177	    /* *INDENT-ON* */
   1178            src += srcskip;
   1179            dst += dstskip;
   1180        }
   1181    }
   1182}
   1183
   1184/* General (slow) colorkeyed N->N blending with per-surface alpha */
   1185static void
   1186BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
   1187{
   1188    int width = info->dst_w;
   1189    int height = info->dst_h;
   1190    Uint8 *src = info->src;
   1191    int srcskip = info->src_skip;
   1192    Uint8 *dst = info->dst;
   1193    int dstskip = info->dst_skip;
   1194    SDL_PixelFormat *srcfmt = info->src_fmt;
   1195    SDL_PixelFormat *dstfmt = info->dst_fmt;
   1196    Uint32 ckey = info->colorkey;
   1197    int srcbpp = srcfmt->BytesPerPixel;
   1198    int dstbpp = dstfmt->BytesPerPixel;
   1199    Uint32 Pixel;
   1200    unsigned sR, sG, sB;
   1201    unsigned dR, dG, dB, dA;
   1202    const unsigned sA = info->a;
   1203
   1204    while (height--) {
   1205	    /* *INDENT-OFF* */
   1206	    DUFFS_LOOP4(
   1207	    {
   1208		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
   1209		if(sA && Pixel != ckey) {
   1210		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
   1211		    DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
   1212		    ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
   1213		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
   1214		}
   1215		src += srcbpp;
   1216		dst += dstbpp;
   1217	    },
   1218	    width);
   1219	    /* *INDENT-ON* */
   1220        src += srcskip;
   1221        dst += dstskip;
   1222    }
   1223}
   1224
   1225/* General (slow) N->N blending with pixel alpha */
   1226static void
   1227BlitNtoNPixelAlpha(SDL_BlitInfo * info)
   1228{
   1229    int width = info->dst_w;
   1230    int height = info->dst_h;
   1231    Uint8 *src = info->src;
   1232    int srcskip = info->src_skip;
   1233    Uint8 *dst = info->dst;
   1234    int dstskip = info->dst_skip;
   1235    SDL_PixelFormat *srcfmt = info->src_fmt;
   1236    SDL_PixelFormat *dstfmt = info->dst_fmt;
   1237    int srcbpp;
   1238    int dstbpp;
   1239    Uint32 Pixel;
   1240    unsigned sR, sG, sB, sA;
   1241    unsigned dR, dG, dB, dA;
   1242
   1243    /* Set up some basic variables */
   1244    srcbpp = srcfmt->BytesPerPixel;
   1245    dstbpp = dstfmt->BytesPerPixel;
   1246
   1247    while (height--) {
   1248	    /* *INDENT-OFF* */
   1249	    DUFFS_LOOP4(
   1250	    {
   1251		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
   1252		if(sA) {
   1253		    DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
   1254		    ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
   1255		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
   1256		}
   1257		src += srcbpp;
   1258		dst += dstbpp;
   1259	    },
   1260	    width);
   1261	    /* *INDENT-ON* */
   1262        src += srcskip;
   1263        dst += dstskip;
   1264    }
   1265}
   1266
   1267
   1268SDL_BlitFunc
   1269SDL_CalculateBlitA(SDL_Surface * surface)
   1270{
   1271    SDL_PixelFormat *sf = surface->format;
   1272    SDL_PixelFormat *df = surface->map->dst->format;
   1273
   1274    switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
   1275    case SDL_COPY_BLEND:
   1276        /* Per-pixel alpha blits */
   1277        switch (df->BytesPerPixel) {
   1278        case 1:
   1279            return BlitNto1PixelAlpha;
   1280
   1281        case 2:
   1282                if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
   1283                    && sf->Gmask == 0xff00
   1284                    && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
   1285                        || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
   1286                if (df->Gmask == 0x7e0)
   1287                    return BlitARGBto565PixelAlpha;
   1288                else if (df->Gmask == 0x3e0)
   1289                    return BlitARGBto555PixelAlpha;
   1290            }
   1291            return BlitNtoNPixelAlpha;
   1292
   1293        case 4:
   1294            if (sf->Rmask == df->Rmask
   1295                && sf->Gmask == df->Gmask
   1296                && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
   1297#if defined(__MMX__) || defined(__3dNOW__)
   1298                if (sf->Rshift % 8 == 0
   1299                    && sf->Gshift % 8 == 0
   1300                    && sf->Bshift % 8 == 0
   1301                    && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
   1302#ifdef __3dNOW__
   1303                    if (SDL_Has3DNow())
   1304                        return BlitRGBtoRGBPixelAlphaMMX3DNOW;
   1305#endif
   1306#ifdef __MMX__
   1307                    if (SDL_HasMMX())
   1308                        return BlitRGBtoRGBPixelAlphaMMX;
   1309#endif
   1310                }
   1311#endif /* __MMX__ || __3dNOW__ */
   1312                if (sf->Amask == 0xff000000) {
   1313                    return BlitRGBtoRGBPixelAlpha;
   1314                }
   1315            }
   1316            return BlitNtoNPixelAlpha;
   1317
   1318        case 3:
   1319        default:
   1320            return BlitNtoNPixelAlpha;
   1321        }
   1322        break;
   1323
   1324    case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
   1325        if (sf->Amask == 0) {
   1326            /* Per-surface alpha blits */
   1327            switch (df->BytesPerPixel) {
   1328            case 1:
   1329                return BlitNto1SurfaceAlpha;
   1330
   1331            case 2:
   1332                if (surface->map->identity) {
   1333                    if (df->Gmask == 0x7e0) {
   1334#ifdef __MMX__
   1335                        if (SDL_HasMMX())
   1336                            return Blit565to565SurfaceAlphaMMX;
   1337                        else
   1338#endif
   1339                            return Blit565to565SurfaceAlpha;
   1340                    } else if (df->Gmask == 0x3e0) {
   1341#ifdef __MMX__
   1342                        if (SDL_HasMMX())
   1343                            return Blit555to555SurfaceAlphaMMX;
   1344                        else
   1345#endif
   1346                            return Blit555to555SurfaceAlpha;
   1347                    }
   1348                }
   1349                return BlitNtoNSurfaceAlpha;
   1350
   1351            case 4:
   1352                if (sf->Rmask == df->Rmask
   1353                    && sf->Gmask == df->Gmask
   1354                    && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
   1355#ifdef __MMX__
   1356                    if (sf->Rshift % 8 == 0
   1357                        && sf->Gshift % 8 == 0
   1358                        && sf->Bshift % 8 == 0 && SDL_HasMMX())
   1359                        return BlitRGBtoRGBSurfaceAlphaMMX;
   1360#endif
   1361                    if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
   1362                        return BlitRGBtoRGBSurfaceAlpha;
   1363                    }
   1364                }
   1365                return BlitNtoNSurfaceAlpha;
   1366
   1367            case 3:
   1368            default:
   1369                return BlitNtoNSurfaceAlpha;
   1370            }
   1371        }
   1372        break;
   1373
   1374    case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
   1375        if (sf->Amask == 0) {
   1376            if (df->BytesPerPixel == 1) {
   1377                return BlitNto1SurfaceAlphaKey;
   1378            } else {
   1379                return BlitNtoNSurfaceAlphaKey;
   1380            }
   1381        }
   1382        break;
   1383    }
   1384
   1385    return NULL;
   1386}
   1387
   1388/* vi: set ts=4 sw=4 expandtab: */