apps/png/libpng/pnggccrd.c

Go to the documentation of this file.
00001 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
00002  *
00003  * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
00004  *
00005  *     See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
00006  *     and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
00007  *     for Intel's performance analysis of the MMX vs. non-MMX code.
00008  *
00009  * libpng version 1.2.8 - December 3, 2004
00010  * For conditions of distribution and use, see copyright notice in png.h
00011  * Copyright (c) 1998-2004 Glenn Randers-Pehrson
00012  * Copyright (c) 1998, Intel Corporation
00013  *
00014  * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
00015  * Interface to libpng contributed by Gilles Vollant, 1999.
00016  * GNU C port by Greg Roelofs, 1999-2001.
00017  *
00018  * Lines 2350-4300 converted in place with intel2gas 1.3.1:
00019  *
00020  *   intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
00021  *
00022  * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
00023  *
00024  * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
00025  *        is required to assemble the newer MMX instructions such as movq.
00026  *        For djgpp, see
00027  *
00028  *           ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
00029  *
00030  *        (or a later version in the same directory).  For Linux, check your
00031  *        distribution's web site(s) or try these links:
00032  *
00033  *           http://rufus.w3.org/linux/RPM/binutils.html
00034  *           http://www.debian.org/Packages/stable/devel/binutils.html
00035  *           ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
00036  *             binutils.tgz
00037  *
00038  *        For other platforms, see the main GNU site:
00039  *
00040  *           ftp://ftp.gnu.org/pub/gnu/binutils/
00041  *
00042  *        Version 2.5.2l.15 is definitely too old...
00043  */
00044 
00045 /*
00046  * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
00047  * =====================================
00048  *
00049  * 19991006:
00050  *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
00051  *
00052  * 19991007:
00053  *  - additional optimizations (possible or definite):
00054  *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
00055  *     - write MMX code for 48-bit case (pixel_bytes == 6)
00056  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
00057  *        why subtract 8 from width_mmx in the pass 4/5 case?
00058  *        (only width_mmx case) (near line 1606)
00059  *     x [DONE] replace pixel_bytes within each block with the true
00060  *        constant value (or are compilers smart enough to do that?)
00061  *     - rewrite all MMX interlacing code so it's aligned with
00062  *        the *beginning* of the row buffer, not the end.  This
00063  *        would not only allow one to eliminate half of the memory
00064  *        writes for odd passes (that is, pass == odd), it may also
00065  *        eliminate some unaligned-data-access exceptions (assuming
00066  *        there's a penalty for not aligning 64-bit accesses on
00067  *        64-bit boundaries).  The only catch is that the "leftover"
00068  *        pixel(s) at the end of the row would have to be saved,
00069  *        but there are enough unused MMX registers in every case,
00070  *        so this is not a problem.  A further benefit is that the
00071  *        post-MMX cleanup code (C code) in at least some of the
00072  *        cases could be done within the assembler block.
00073  *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
00074  *     inconsistent, and don't match the MMX Programmer's Reference
00075  *     Manual conventions anyway.  They should be changed to
00076  *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
00077  *     was lowest in memory (e.g., corresponding to a left pixel)
00078  *     and b7 is the byte that was highest (e.g., a right pixel).
00079  *
00080  * 19991016:
00081  *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
00082  *     want globals prefixed by underscores when referencing them--
00083  *     i.e., if the variable is const4, then refer to it as const4,
00084  *     not _const4.  This seems to be a djgpp-specific requirement.
00085  *     Also, such variables apparently *must* be declared outside
00086  *     of functions; neither static nor automatic variables work if
00087  *     defined within the scope of a single function, but both
00088  *     static and truly global (multi-module) variables work fine.
00089  *
00090  * 19991023:
00091  *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
00092  *  - switched from string-concatenation-with-macros to cleaner method of
00093  *     renaming global variables for djgpp--i.e., always use prefixes in
00094  *     inlined assembler code (== strings) and conditionally rename the
00095  *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
00096  *
00097  * 19991024:
00098  *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
00099  *     This one was severely weird:  even though mmxsupport() doesn't touch
00100  *     ebx (where "row" pointer was stored), it nevertheless managed to zero
00101  *     the register (even in static/non-fPIC code--see below), which in turn
00102  *     caused png_do_read_interlace() to return prematurely on the first row of
00103  *     interlaced images (i.e., without expanding the interlaced pixels).
00104  *     Inspection of the generated assembly code didn't turn up any clues,
00105  *     although it did point at a minor optimization (i.e., get rid of
00106  *     mmx_supported_local variable and just use eax).  Possibly the CPUID
00107  *     instruction is more destructive than it looks?  (Not yet checked.)
00108  *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
00109  *     listings...  Apparently register spillage has to do with ebx, since
00110  *     it's used to index the global offset table.  Commenting it out of the
00111  *     input-reg lists in png_combine_row() eliminated compiler barfage, so
00112  *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
00113  *
00114  * 19991107:
00115  *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
00116  *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
00117  *
00118  * 19991120:
00119  *  - made "diff" variable (now "_dif") global to simplify conversion of
00120  *     filtering routines (running out of regs, sigh).  "diff" is still used
00121  *     in interlacing routines, however.
00122  *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
00123  *     macro determines which is used); original not yet tested.
00124  *
00125  * 20000213:
00126  *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
00127  *
00128  * 20000319:
00129  *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
00130  *     pass == 4 or 5, that caused visible corruption of interlaced images
00131  *
00132  * 20000623:
00133  *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
00134  *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
00135  *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
00136  *     Chuck Wilson supplied a patch involving dummy output registers.  See
00137  *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
00138  *     for the original (anonymous) SourceForge bug report.
00139  *
00140  * 20000706:
00141  *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
00142  *       pnggccrd.c: In function `png_combine_row':
00143  *       pnggccrd.c:525: more than 10 operands in `asm'
00144  *       pnggccrd.c:669: more than 10 operands in `asm'
00145  *       pnggccrd.c:828: more than 10 operands in `asm'
00146  *       pnggccrd.c:994: more than 10 operands in `asm'
00147  *       pnggccrd.c:1177: more than 10 operands in `asm'
00148  *     They are all the same problem and can be worked around by using the
00149  *     global _unmask variable unconditionally, not just in the -fPIC case.
00150  *     Reportedly earlier versions of gcc also have the problem with more than
00151  *     10 operands; they just don't report it.  Much strangeness ensues, etc.
00152  *
00153  * 20000729:
00154  *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
00155  *     MMX routine); began converting png_read_filter_row_mmx_sub()
00156  *  - to finish remaining sections:
00157  *     - clean up indentation and comments
00158  *     - preload local variables
00159  *     - add output and input regs (order of former determines numerical
00160  *        mapping of latter)
00161  *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
00162  *     - remove "$" from addressing of Shift and Mask variables [20000823]
00163  *
00164  * 20000731:
00165  *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
00166  *
00167  * 20000822:
00168  *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
00169  *     shared-library (-fPIC) version!  Code works just fine as part of static
00170  *     library.  Damn damn damn damn damn, should have tested that sooner.
00171  *     ebx is getting clobbered again (explicitly this time); need to save it
00172  *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
00173  *
00174  * 20000823:
00175  *  - first section was trickiest; all remaining sections have ebx -> edx now.
00176  *     (-fPIC works again.)  Also added missing underscores to various Shift*
00177  *     and *Mask* globals and got rid of leading "$" signs.
00178  *
00179  * 20000826:
00180  *  - added visual separators to help navigate microscopic printed copies
00181  *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
00182  *     on png_read_filter_row_mmx_avg()
00183  *
00184  * 20000828:
00185  *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
00186  *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
00187  *     cleaned up/shortened in either routine, but functionality is complete
00188  *     and seems to be working fine.
00189  *
00190  * 20000829:
00191  *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
00192  *     as an input reg (with dummy output variables, etc.), then it *cannot*
00193  *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
00194  *     is simple enough...
00195  *
00196  * 20000914:
00197  *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
00198  *     correctly (but 48-bit RGB just fine)
00199  *
00200  * 20000916:
00201  *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
00202  *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
00203  *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
00204  *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
00205  *
00206  * 20010101:
00207  *  - added new png_init_mmx_flags() function (here only because it needs to
00208  *     call mmxsupport(), which should probably become global png_mmxsupport());
00209  *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
00210  *
00211  * 20010103:
00212  *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
00213  *     and made it public; moved png_init_mmx_flags() to png.c as internal func
00214  *
00215  * 20010104:
00216  *  - removed dependency on png_read_filter_row_c() (C code already duplicated
00217  *     within MMX version of png_read_filter_row()) so no longer necessary to
00218  *     compile it into pngrutil.o
00219  *
00220  * 20010310:
00221  *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
00222  *
00223  * 20020304:
00224  *  - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
00225  *
00226  * 20040724:
00227  *   - more tinkering with clobber list at lines 4529 and 5033, to get
00228  *     it to compile on gcc-3.4.
00229  *
00230  * STILL TO DO:
00231  *     - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
00232  *     - write MMX code for 48-bit case (pixel_bytes == 6)
00233  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
00234  *        why subtract 8 from width_mmx in the pass 4/5 case?
00235  *        (only width_mmx case) (near line 1606)
00236  *     - rewrite all MMX interlacing code so it's aligned with beginning
00237  *        of the row buffer, not the end (see 19991007 for details)
00238  *     x pick one version of mmxsupport() and get rid of the other
00239  *     - add error messages to any remaining bogus default cases
00240  *     - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
00241  *     x add support for runtime enable/disable/query of various MMX routines
00242  */
00243 
00244 #define PNG_INTERNAL
00245 #include "png.h"
00246 
00247 #if defined(PNG_USE_PNGGCCRD)
00248 
00249 int PNGAPI png_mmx_support(void);
00250 /* Per-pass lookup tables, seven entries each, indexed by png_ptr->pass.
 * The 8-bit C fallback in png_combine_row() uses them as follows:
 *   png_pass_start - byte offset of the first copy within the row,
 *   png_pass_inc   - distance in bytes between successive copies,
 *   png_pass_width - number of bytes copied at each step.
 * The comments next to those uses indicate that the values mirror the arrays
 * of the same names in png.c; these local copies are compiled only when
 * PNG_USE_LOCAL_ARRAYS is defined.
 */
00251 #ifdef PNG_USE_LOCAL_ARRAYS
00252 static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
00253 static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
00254 static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
00255 #endif /* PNG_USE_LOCAL_ARRAYS */
00256 
00257 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
00258 /* djgpp, Win32, and Cygwin add their own underscores to global variables,
00259  * so define them without:  the inline assembly in this file always names
 * these globals with a leading underscore inside its instruction strings
 * (e.g. "_mask8_0", "_unmask"), so on these platforms the C identifiers are
 * renamed to the un-prefixed spelling and the platform's automatic prefix
 * supplies the underscore the assembly expects.
 * The final group is renamed only under PNG_THREAD_UNSAFE_OK, the same guard
 * that surrounds the declarations of those variables. */
00260 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
00261 #  define _mmx_supported  mmx_supported
00262 #  define _const4         const4
00263 #  define _const6         const6
00264 #  define _mask8_0        mask8_0
00265 #  define _mask16_1       mask16_1
00266 #  define _mask16_0       mask16_0
00267 #  define _mask24_2       mask24_2
00268 #  define _mask24_1       mask24_1
00269 #  define _mask24_0       mask24_0
00270 #  define _mask32_3       mask32_3
00271 #  define _mask32_2       mask32_2
00272 #  define _mask32_1       mask32_1
00273 #  define _mask32_0       mask32_0
00274 #  define _mask48_5       mask48_5
00275 #  define _mask48_4       mask48_4
00276 #  define _mask48_3       mask48_3
00277 #  define _mask48_2       mask48_2
00278 #  define _mask48_1       mask48_1
00279 #  define _mask48_0       mask48_0
00280 #  define _LBCarryMask    LBCarryMask
00281 #  define _HBClearMask    HBClearMask
00282 #  define _ActiveMask     ActiveMask
00283 #  define _ActiveMask2    ActiveMask2
00284 #  define _ActiveMaskEnd  ActiveMaskEnd
00285 #  define _ShiftBpp       ShiftBpp
00286 #  define _ShiftRem       ShiftRem
00287 #ifdef PNG_THREAD_UNSAFE_OK /* globals below exist only under this guard */
00288 #  define _unmask         unmask
00289 #  define _FullLength     FullLength
00290 #  define _MMXLength      MMXLength
00291 #  define _dif            dif
00292 #  define _patemp         patemp
00293 #  define _pbtemp         pbtemp
00294 #  define _pctemp         pctemp
00295 #endif /* PNG_THREAD_UNSAFE_OK */
00296 #endif /* __DJGPP__ || WIN32 || __CYGWIN__ */
00297 
00298 
00299 /* These constants are used in the inlined MMX assembly code.
00300    Ignore gcc's "At top level: defined but not used" warnings. */
00301 
00302 /* GRR 20000706:  originally _unmask was needed only when compiling with -fPIC,
00303  *  since that case uses the %ebx register for indexing the Global Offset Table
00304  *  and there were no other registers available.  But gcc 2.95 and later emit
00305  *  "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
00306  *  in the non-PIC case, so we'll just use the global unconditionally now.
00307  */
00308 #ifdef PNG_THREAD_UNSAFE_OK
00309 static int _unmask;
00310 #endif
00311 /* 64-bit lane-select patterns for the MMX code in png_combine_row().
 * Each byte carries a single bit of the 8-bit combine mask; the least
 * significant byte (lowest address once loaded with movq) holds 0x80, which
 * corresponds to the first pixel of each group of eight.
 * In the 8-bit case the assembly ANDs _mask8_0 with the low byte of _unmask
 * replicated into all eight lanes and then compares each byte against zero,
 * which yields an all-ones byte wherever the pixel is taken from the newly
 * read row and a zero byte wherever the destination is left untouched.
 * The wider variants repeat every bit over 2, 3, 4 or 6 consecutive bytes and
 * continue across quadwords _0, _1, ...; presumably they play the same role
 * for 16-, 24-, 32- and 48-bit pixels (that code is outside this excerpt --
 * TODO confirm).
 */
00312 static unsigned long long _mask8_0  = 0x0102040810204080LL;
00313 /* each bit spans 2 bytes: _0 covers bits 0x80..0x10, _1 covers 0x08..0x01 */
00314 static unsigned long long _mask16_1 = 0x0101020204040808LL;
00315 static unsigned long long _mask16_0 = 0x1010202040408080LL;
00316 /* each bit spans 3 bytes; the run continues through quadwords _0, _1, _2 */
00317 static unsigned long long _mask24_2 = 0x0101010202020404LL;
00318 static unsigned long long _mask24_1 = 0x0408080810101020LL;
00319 static unsigned long long _mask24_0 = 0x2020404040808080LL;
00320 /* each bit spans 4 bytes; two bits per quadword, _0 through _3 */
00321 static unsigned long long _mask32_3 = 0x0101010102020202LL;
00322 static unsigned long long _mask32_2 = 0x0404040408080808LL;
00323 static unsigned long long _mask32_1 = 0x1010101020202020LL;
00324 static unsigned long long _mask32_0 = 0x4040404080808080LL;
00325 /* each bit spans 6 bytes; the run continues through quadwords _0 .. _5 */
00326 static unsigned long long _mask48_5 = 0x0101010101010202LL;
00327 static unsigned long long _mask48_4 = 0x0202020204040404LL;
00328 static unsigned long long _mask48_3 = 0x0404080808080808LL;
00329 static unsigned long long _mask48_2 = 0x1010101010102020LL;
00330 static unsigned long long _mask48_1 = 0x2020202040404040LL;
00331 static unsigned long long _mask48_0 = 0x4040808080808080LL;
00332 /* byte-select constants: _const4 keeps the low 3 bytes, _const6 the low byte; their users are outside this excerpt */
00333 static unsigned long long _const4   = 0x0000000000FFFFFFLL;
00334 //static unsigned long long _const5 = 0x000000FFFFFF0000LL;     // NOT USED
00335 static unsigned long long _const6   = 0x00000000000000FFLL;
00336 
00337 // These are used in the row-filter routines and should/would be local
00338 //  variables if not for gcc addressing limitations.
00339 // WARNING: Their presence probably defeats the thread safety of libpng.
00340 
00341 #ifdef PNG_THREAD_UNSAFE_OK
00342 static png_uint_32  _FullLength;
00343 static png_uint_32  _MMXLength;
00344 static int          _dif;
00345 static int          _patemp; // temp variables for Paeth routine
00346 static int          _pbtemp;
00347 static int          _pctemp;
00348 #endif
00349 /* Assigns each listed file-scope variable to itself; no value is changed.
 * The inline assembly names these variables only inside instruction strings
 * (for example "movq _mask8_0, %%mm0"), which the compiler does not count as
 * a use.  Going by the function's name and the note at the declarations, the
 * self-assignments exist to quiet gcc's "defined but not used" warnings.
 * Keep the statements in this exact form: presumably they must remain real
 * references for the warnings to stay suppressed.
 * The first group is compiled only under PNG_THREAD_UNSAFE_OK, matching the
 * guard on the corresponding declarations.
 * NOTE(review): _unmask and _FullLength are declared under that same guard
 * but are not listed here; _unmask is assigned directly in png_combine_row(),
 * and _FullLength is presumably referenced from C code elsewhere -- confirm.
 */
00350 void /* PRIVATE */
00351 png_squelch_warnings(void)
00352 {
00353 #ifdef PNG_THREAD_UNSAFE_OK
00354    _dif = _dif;
00355    _patemp = _patemp;
00356    _pbtemp = _pbtemp;
00357    _pctemp = _pctemp;
00358    _MMXLength = _MMXLength;
00359 #endif /* PNG_THREAD_UNSAFE_OK */
00360    _const4  = _const4;
00361    _const6  = _const6;
00362    _mask8_0  = _mask8_0;
00363    _mask16_1 = _mask16_1;
00364    _mask16_0 = _mask16_0;
00365    _mask24_2 = _mask24_2;
00366    _mask24_1 = _mask24_1;
00367    _mask24_0 = _mask24_0;
00368    _mask32_3 = _mask32_3;
00369    _mask32_2 = _mask32_2;
00370    _mask32_1 = _mask32_1;
00371    _mask32_0 = _mask32_0;
00372    _mask48_5 = _mask48_5;
00373    _mask48_4 = _mask48_4;
00374    _mask48_3 = _mask48_3;
00375    _mask48_2 = _mask48_2;
00376    _mask48_1 = _mask48_1;
00377    _mask48_0 = _mask48_0;
00378 }
00379 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
00380 
00381 /* MMX availability state.  The initial value 2 acts as "not yet determined":
 * png_combine_row() tests for it and, when found, calls png_mmx_support()
 * (after warning that asm_flags may not have been initialized, except in
 * PNG_1_0_X builds).  In PNG_1_0_X builds a nonzero value is also what selects
 * the MMX paths in png_combine_row().  According to the changelog at the top
 * of this file, png_mmx_support() sets this variable; that function's
 * definition is outside this excerpt.
 */
00382 static int _mmx_supported = 2;
00383 
00384 /*===========================================================================*/
00385 /*                                                                           */
00386 /*                       P N G _ C O M B I N E _ R O W                       */
00387 /*                                                                           */
00388 /*===========================================================================*/
00389 
00390 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
00391 
00392 #define BPP2  2
00393 #define BPP3  3 /* bytes per pixel (a.k.a. pixel_bytes) */
00394 #define BPP4  4
00395 #define BPP6  6 /* (defined only to help avoid cut-and-paste errors) */
00396 #define BPP8  8
00397 
00398 /* Combines the row recently read in with the previous row.
00399    This routine takes care of alpha and transparency if requested.
00400    This routine also handles the two methods of progressive display
00401    of interlaced images, depending on the mask value.
00402    The mask value describes which pixels are to be combined with
00403    the row.  The pattern always repeats every 8 pixels, so just 8
00404    bits are needed.  A one indicates the pixel is to be combined; a
00405    zero indicates the pixel is to be skipped.  This is in addition
00406    to any alpha or transparency value associated with the pixel.
00407    If you want all pixels to be combined, pass 0xff (255) in mask. */
00408 
00409 /* Use this routine for the x86 platform - it uses a faster MMX routine
00410    if the machine supports MMX. */
00411 
00412 void /* PRIVATE */
00413 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
00414 {
00415    png_debug(1, "in png_combine_row (pnggccrd.c)\n");
00416 
00417 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
00418    if (_mmx_supported == 2) {
00419 #if !defined(PNG_1_0_X)
00420        /* this should have happened in png_init_mmx_flags() already */
00421        png_warning(png_ptr, "asm_flags may not have been initialized");
00422 #endif
00423        png_mmx_support();
00424    }
00425 #endif
00426 
00427    if (mask == 0xff)
00428    {
00429       png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
00430       png_memcpy(row, png_ptr->row_buf + 1,
00431        (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
00432    }
00433    else   /* (png_combine_row() is never called with mask == 0) */
00434    {
00435       switch (png_ptr->row_info.pixel_depth)
00436       {
00437          case 1:        /* png_ptr->row_info.pixel_depth */
00438          {
00439             png_bytep sp;
00440             png_bytep dp;
00441             int s_inc, s_start, s_end;
00442             int m;
00443             int shift;
00444             png_uint_32 i;
00445 
00446             sp = png_ptr->row_buf + 1;
00447             dp = row;
00448             m = 0x80;
00449 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
00450             if (png_ptr->transformations & PNG_PACKSWAP)
00451             {
00452                 s_start = 0;
00453                 s_end = 7;
00454                 s_inc = 1;
00455             }
00456             else
00457 #endif
00458             {
00459                 s_start = 7;
00460                 s_end = 0;
00461                 s_inc = -1;
00462             }
00463 
00464             shift = s_start;
00465 
00466             for (i = 0; i < png_ptr->width; i++)
00467             {
00468                if (m & mask)
00469                {
00470                   int value;
00471 
00472                   value = (*sp >> shift) & 0x1;
00473                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
00474                   *dp |= (png_byte)(value << shift);
00475                }
00476 
00477                if (shift == s_end)
00478                {
00479                   shift = s_start;
00480                   sp++;
00481                   dp++;
00482                }
00483                else
00484                   shift += s_inc;
00485 
00486                if (m == 1)
00487                   m = 0x80;
00488                else
00489                   m >>= 1;
00490             }
00491             break;
00492          }
00493 
00494          case 2:        /* png_ptr->row_info.pixel_depth */
00495          {
00496             png_bytep sp;
00497             png_bytep dp;
00498             int s_start, s_end, s_inc;
00499             int m;
00500             int shift;
00501             png_uint_32 i;
00502             int value;
00503 
00504             sp = png_ptr->row_buf + 1;
00505             dp = row;
00506             m = 0x80;
00507 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
00508             if (png_ptr->transformations & PNG_PACKSWAP)
00509             {
00510                s_start = 0;
00511                s_end = 6;
00512                s_inc = 2;
00513             }
00514             else
00515 #endif
00516             {
00517                s_start = 6;
00518                s_end = 0;
00519                s_inc = -2;
00520             }
00521 
00522             shift = s_start;
00523 
00524             for (i = 0; i < png_ptr->width; i++)
00525             {
00526                if (m & mask)
00527                {
00528                   value = (*sp >> shift) & 0x3;
00529                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
00530                   *dp |= (png_byte)(value << shift);
00531                }
00532 
00533                if (shift == s_end)
00534                {
00535                   shift = s_start;
00536                   sp++;
00537                   dp++;
00538                }
00539                else
00540                   shift += s_inc;
00541                if (m == 1)
00542                   m = 0x80;
00543                else
00544                   m >>= 1;
00545             }
00546             break;
00547          }
00548 
00549          case 4:        /* png_ptr->row_info.pixel_depth */
00550          {
00551             png_bytep sp;
00552             png_bytep dp;
00553             int s_start, s_end, s_inc;
00554             int m;
00555             int shift;
00556             png_uint_32 i;
00557             int value;
00558 
00559             sp = png_ptr->row_buf + 1;
00560             dp = row;
00561             m = 0x80;
00562 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
00563             if (png_ptr->transformations & PNG_PACKSWAP)
00564             {
00565                s_start = 0;
00566                s_end = 4;
00567                s_inc = 4;
00568             }
00569             else
00570 #endif
00571             {
00572                s_start = 4;
00573                s_end = 0;
00574                s_inc = -4;
00575             }
00576             shift = s_start;
00577 
00578             for (i = 0; i < png_ptr->width; i++)
00579             {
00580                if (m & mask)
00581                {
00582                   value = (*sp >> shift) & 0xf;
00583                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
00584                   *dp |= (png_byte)(value << shift);
00585                }
00586 
00587                if (shift == s_end)
00588                {
00589                   shift = s_start;
00590                   sp++;
00591                   dp++;
00592                }
00593                else
00594                   shift += s_inc;
00595                if (m == 1)
00596                   m = 0x80;
00597                else
00598                   m >>= 1;
00599             }
00600             break;
00601          }
00602 
00603          case 8:        /* png_ptr->row_info.pixel_depth */
00604          {
00605             png_bytep srcptr;
00606             png_bytep dstptr;
00607 
00608 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
00609 #if !defined(PNG_1_0_X)
00610             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00611                 /* && _mmx_supported */ )
00612 #else
00613             if (_mmx_supported)
00614 #endif
00615             {
00616                png_uint_32 len;
00617                int diff;
00618                int dummy_value_a;   // fix 'forbidden register spilled' error
00619                int dummy_value_d;
00620                int dummy_value_c;
00621                int dummy_value_S;
00622                int dummy_value_D;
00623                _unmask = ~mask;            // global variable for -fPIC version
00624                srcptr = png_ptr->row_buf + 1;
00625                dstptr = row;
00626                len  = png_ptr->width &~7;  // reduce to multiple of 8
00627                diff = (int) (png_ptr->width & 7);  // amount lost
00628 
00629                __asm__ __volatile__ (
00630                   "movd      _unmask, %%mm7  \n\t" // load bit pattern
00631                   "psubb     %%mm6, %%mm6    \n\t" // zero mm6
00632                   "punpcklbw %%mm7, %%mm7    \n\t"
00633                   "punpcklwd %%mm7, %%mm7    \n\t"
00634                   "punpckldq %%mm7, %%mm7    \n\t" // fill reg with 8 masks
00635 
00636                   "movq      _mask8_0, %%mm0 \n\t"
00637                   "pand      %%mm7, %%mm0    \n\t" // nonzero if keep byte
00638                   "pcmpeqb   %%mm6, %%mm0    \n\t" // zeros->1s, v versa
00639 
00640 // preload        "movl      len, %%ecx      \n\t" // load length of line
00641 // preload        "movl      srcptr, %%esi   \n\t" // load source
00642 // preload        "movl      dstptr, %%edi   \n\t" // load dest
00643 
00644                   "cmpl      $0, %%ecx       \n\t" // len == 0 ?
00645                   "je        mainloop8end    \n\t"
00646 
00647                 "mainloop8:                  \n\t"
00648                   "movq      (%%esi), %%mm4  \n\t" // *srcptr
00649                   "pand      %%mm0, %%mm4    \n\t"
00650                   "movq      %%mm0, %%mm6    \n\t"
00651                   "pandn     (%%edi), %%mm6  \n\t" // *dstptr
00652                   "por       %%mm6, %%mm4    \n\t"
00653                   "movq      %%mm4, (%%edi)  \n\t"
00654                   "addl      $8, %%esi       \n\t" // inc by 8 bytes processed
00655                   "addl      $8, %%edi       \n\t"
00656                   "subl      $8, %%ecx       \n\t" // dec by 8 pixels processed
00657                   "ja        mainloop8       \n\t"
00658 
00659                 "mainloop8end:               \n\t"
00660 // preload        "movl      diff, %%ecx     \n\t" // (diff is in eax)
00661                   "movl      %%eax, %%ecx    \n\t"
00662                   "cmpl      $0, %%ecx       \n\t"
00663                   "jz        end8            \n\t"
00664 // preload        "movl      mask, %%edx     \n\t"
00665                   "sall      $24, %%edx      \n\t" // make low byte, high byte
00666 
00667                 "secondloop8:                \n\t"
00668                   "sall      %%edx           \n\t" // move high bit to CF
00669                   "jnc       skip8           \n\t" // if CF = 0
00670                   "movb      (%%esi), %%al   \n\t"
00671                   "movb      %%al, (%%edi)   \n\t"
00672 
00673                 "skip8:                      \n\t"
00674                   "incl      %%esi           \n\t"
00675                   "incl      %%edi           \n\t"
00676                   "decl      %%ecx           \n\t"
00677                   "jnz       secondloop8     \n\t"
00678 
00679                 "end8:                       \n\t"
00680                   "EMMS                      \n\t"  // DONE
00681 
00682                   : "=a" (dummy_value_a),           // output regs (dummy)
00683                     "=d" (dummy_value_d),
00684                     "=c" (dummy_value_c),
00685                     "=S" (dummy_value_S),
00686                     "=D" (dummy_value_D)
00687 
00688                   : "3" (srcptr),      // esi       // input regs
00689                     "4" (dstptr),      // edi
00690                     "0" (diff),        // eax
00691 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
00692                     "2" (len),         // ecx
00693                     "1" (mask)         // edx
00694 
00695 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
00696                   : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
00697 #endif
00698                );
00699             }
00700             else /* mmx _not supported - Use modified C routine */
00701 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
00702             {
00703                register png_uint_32 i;
00704                png_uint_32 initial_val = png_pass_start[png_ptr->pass];
00705                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
00706                register int stride = png_pass_inc[png_ptr->pass];
00707                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
00708                register int rep_bytes = png_pass_width[png_ptr->pass];
00709                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
00710                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
00711                int diff = (int) (png_ptr->width & 7); /* amount lost */
00712                register png_uint_32 final_val = len;  /* GRR bugfix */
00713 
00714                srcptr = png_ptr->row_buf + 1 + initial_val;
00715                dstptr = row + initial_val;
00716 
00717                for (i = initial_val; i < final_val; i += stride)
00718                {
00719                   png_memcpy(dstptr, srcptr, rep_bytes);
00720                   srcptr += stride;
00721                   dstptr += stride;
00722                }
00723                if (diff)  /* number of leftover pixels:  3 for pngtest */
00724                {
00725                   final_val+=diff /* *BPP1 */ ;
00726                   for (; i < final_val; i += stride)
00727                   {
00728                      if (rep_bytes > (int)(final_val-i))
00729                         rep_bytes = (int)(final_val-i);
00730                      png_memcpy(dstptr, srcptr, rep_bytes);
00731                      srcptr += stride;
00732                      dstptr += stride;
00733                   }
00734                }
00735 
00736             } /* end of else (_mmx_supported) */
00737 
00738             break;
00739          }       /* end 8 bpp */
00740 
00741          case 16:       /* png_ptr->row_info.pixel_depth */
00742          {
00743             png_bytep srcptr;
00744             png_bytep dstptr;
00745 
00746 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
00747 #if !defined(PNG_1_0_X)
00748             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00749                 /* && _mmx_supported */ )
00750 #else
00751             if (_mmx_supported)
00752 #endif
00753             {
00754                png_uint_32 len;
00755                int diff;
00756                int dummy_value_a;   // fix 'forbidden register spilled' error
00757                int dummy_value_d;
00758                int dummy_value_c;
00759                int dummy_value_S;
00760                int dummy_value_D;
00761                _unmask = ~mask;            // global variable for -fPIC version
00762                srcptr = png_ptr->row_buf + 1;
00763                dstptr = row;
00764                len  = png_ptr->width &~7;  // reduce to multiple of 8
00765                diff = (int) (png_ptr->width & 7); // amount lost //
00766 
00767                __asm__ __volatile__ (
00768                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
00769                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
00770                   "punpcklbw %%mm7, %%mm7     \n\t"
00771                   "punpcklwd %%mm7, %%mm7     \n\t"
00772                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
00773 
00774                   "movq      _mask16_0, %%mm0 \n\t"
00775                   "movq      _mask16_1, %%mm1 \n\t"
00776 
00777                   "pand      %%mm7, %%mm0     \n\t"
00778                   "pand      %%mm7, %%mm1     \n\t"
00779 
00780                   "pcmpeqb   %%mm6, %%mm0     \n\t"
00781                   "pcmpeqb   %%mm6, %%mm1     \n\t"
00782 
00783 // preload        "movl      len, %%ecx       \n\t" // load length of line
00784 // preload        "movl      srcptr, %%esi    \n\t" // load source
00785 // preload        "movl      dstptr, %%edi    \n\t" // load dest
00786 
00787                   "cmpl      $0, %%ecx        \n\t"
00788                   "jz        mainloop16end    \n\t"
00789 
00790                 "mainloop16:                  \n\t"
00791                   "movq      (%%esi), %%mm4   \n\t"
00792                   "pand      %%mm0, %%mm4     \n\t"
00793                   "movq      %%mm0, %%mm6     \n\t"
00794                   "movq      (%%edi), %%mm7   \n\t"
00795                   "pandn     %%mm7, %%mm6     \n\t"
00796                   "por       %%mm6, %%mm4     \n\t"
00797                   "movq      %%mm4, (%%edi)   \n\t"
00798 
00799                   "movq      8(%%esi), %%mm5  \n\t"
00800                   "pand      %%mm1, %%mm5     \n\t"
00801                   "movq      %%mm1, %%mm7     \n\t"
00802                   "movq      8(%%edi), %%mm6  \n\t"
00803                   "pandn     %%mm6, %%mm7     \n\t"
00804                   "por       %%mm7, %%mm5     \n\t"
00805                   "movq      %%mm5, 8(%%edi)  \n\t"
00806 
00807                   "addl      $16, %%esi       \n\t" // inc by 16 bytes processed
00808                   "addl      $16, %%edi       \n\t"
00809                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
00810                   "ja        mainloop16       \n\t"
00811 
00812                 "mainloop16end:               \n\t"
00813 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
00814                   "movl      %%eax, %%ecx     \n\t"
00815                   "cmpl      $0, %%ecx        \n\t"
00816                   "jz        end16            \n\t"
00817 // preload        "movl      mask, %%edx      \n\t"
00818                   "sall      $24, %%edx       \n\t" // make low byte, high byte
00819 
00820                 "secondloop16:                \n\t"
00821                   "sall      %%edx            \n\t" // move high bit to CF
00822                   "jnc       skip16           \n\t" // if CF = 0
00823                   "movw      (%%esi), %%ax    \n\t"
00824                   "movw      %%ax, (%%edi)    \n\t"
00825 
00826                 "skip16:                      \n\t"
00827                   "addl      $2, %%esi        \n\t"
00828                   "addl      $2, %%edi        \n\t"
00829                   "decl      %%ecx            \n\t"
00830                   "jnz       secondloop16     \n\t"
00831 
00832                 "end16:                       \n\t"
00833                   "EMMS                       \n\t" // DONE
00834 
00835                   : "=a" (dummy_value_a),           // output regs (dummy)
00836                     "=c" (dummy_value_c),
00837                     "=d" (dummy_value_d),
00838                     "=S" (dummy_value_S),
00839                     "=D" (dummy_value_D)
00840 
00841                   : "0" (diff),        // eax       // input regs
00842 // was (unmask)     " "    RESERVED    // ebx       // Global Offset Table idx
00843                     "1" (len),         // ecx
00844                     "2" (mask),        // edx
00845                     "3" (srcptr),      // esi
00846                     "4" (dstptr)       // edi
00847 
00848 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
00849                   : "%mm0", "%mm1", "%mm4"          // clobber list
00850                   , "%mm5", "%mm6", "%mm7"
00851 #endif
00852                );
00853             }
00854             else /* mmx _not supported - Use modified C routine */
00855 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
00856             {
00857                register png_uint_32 i;
00858                png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
00859                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
00860                register int stride = BPP2 * png_pass_inc[png_ptr->pass];
00861                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
00862                register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
00863                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
00864                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
00865                int diff = (int) (png_ptr->width & 7); /* amount lost */
00866                register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
00867 
00868                srcptr = png_ptr->row_buf + 1 + initial_val;
00869                dstptr = row + initial_val;
00870 
00871                for (i = initial_val; i < final_val; i += stride)
00872                {
00873                   png_memcpy(dstptr, srcptr, rep_bytes);
00874                   srcptr += stride;
00875                   dstptr += stride;
00876                }
00877                if (diff)  /* number of leftover pixels:  3 for pngtest */
00878                {
00879                   final_val+=diff*BPP2;
00880                   for (; i < final_val; i += stride)
00881                   {
00882                      if (rep_bytes > (int)(final_val-i))
00883                         rep_bytes = (int)(final_val-i);
00884                      png_memcpy(dstptr, srcptr, rep_bytes);
00885                      srcptr += stride;
00886                      dstptr += stride;
00887                   }
00888                }
00889             } /* end of else (_mmx_supported) */
00890 
00891             break;
00892          }       /* end 16 bpp */
00893 
00894          case 24:       /* png_ptr->row_info.pixel_depth */
00895          {
00896             png_bytep srcptr;
00897             png_bytep dstptr;
00898 
00899 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
00900 #if !defined(PNG_1_0_X)
00901             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00902                 /* && _mmx_supported */ )
00903 #else
00904             if (_mmx_supported)
00905 #endif
00906             {
00907                png_uint_32 len;
00908                int diff;
00909                int dummy_value_a;   // fix 'forbidden register spilled' error
00910                int dummy_value_d;
00911                int dummy_value_c;
00912                int dummy_value_S;
00913                int dummy_value_D;
00914                _unmask = ~mask;            // global variable for -fPIC version
00915                srcptr = png_ptr->row_buf + 1;
00916                dstptr = row;
00917                len  = png_ptr->width &~7;  // reduce to multiple of 8
00918                diff = (int) (png_ptr->width & 7); // amount lost //
00919 
00920                __asm__ __volatile__ (
00921                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
00922                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
00923                   "punpcklbw %%mm7, %%mm7     \n\t"
00924                   "punpcklwd %%mm7, %%mm7     \n\t"
00925                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
00926 
00927                   "movq      _mask24_0, %%mm0 \n\t"
00928                   "movq      _mask24_1, %%mm1 \n\t"
00929                   "movq      _mask24_2, %%mm2 \n\t"
00930 
00931                   "pand      %%mm7, %%mm0     \n\t"
00932                   "pand      %%mm7, %%mm1     \n\t"
00933                   "pand      %%mm7, %%mm2     \n\t"
00934 
00935                   "pcmpeqb   %%mm6, %%mm0     \n\t"
00936                   "pcmpeqb   %%mm6, %%mm1     \n\t"
00937                   "pcmpeqb   %%mm6, %%mm2     \n\t"
00938 
00939 // preload        "movl      len, %%ecx       \n\t" // load length of line
00940 // preload        "movl      srcptr, %%esi    \n\t" // load source
00941 // preload        "movl      dstptr, %%edi    \n\t" // load dest
00942 
00943                   "cmpl      $0, %%ecx        \n\t"
00944                   "jz        mainloop24end    \n\t"
00945 
00946                 "mainloop24:                  \n\t"
00947                   "movq      (%%esi), %%mm4   \n\t"
00948                   "pand      %%mm0, %%mm4     \n\t"
00949                   "movq      %%mm0, %%mm6     \n\t"
00950                   "movq      (%%edi), %%mm7   \n\t"
00951                   "pandn     %%mm7, %%mm6     \n\t"
00952                   "por       %%mm6, %%mm4     \n\t"
00953                   "movq      %%mm4, (%%edi)   \n\t"
00954 
00955                   "movq      8(%%esi), %%mm5  \n\t"
00956                   "pand      %%mm1, %%mm5     \n\t"
00957                   "movq      %%mm1, %%mm7     \n\t"
00958                   "movq      8(%%edi), %%mm6  \n\t"
00959                   "pandn     %%mm6, %%mm7     \n\t"
00960                   "por       %%mm7, %%mm5     \n\t"
00961                   "movq      %%mm5, 8(%%edi)  \n\t"
00962 
00963                   "movq      16(%%esi), %%mm6 \n\t"
00964                   "pand      %%mm2, %%mm6     \n\t"
00965                   "movq      %%mm2, %%mm4     \n\t"
00966                   "movq      16(%%edi), %%mm7 \n\t"
00967                   "pandn     %%mm7, %%mm4     \n\t"
00968                   "por       %%mm4, %%mm6     \n\t"
00969                   "movq      %%mm6, 16(%%edi) \n\t"
00970 
00971                   "addl      $24, %%esi       \n\t" // inc by 24 bytes processed
00972                   "addl      $24, %%edi       \n\t"
00973                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
00974 
00975                   "ja        mainloop24       \n\t"
00976 
00977                 "mainloop24end:               \n\t"
00978 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
00979                   "movl      %%eax, %%ecx     \n\t"
00980                   "cmpl      $0, %%ecx        \n\t"
00981                   "jz        end24            \n\t"
00982 // preload        "movl      mask, %%edx      \n\t"
00983                   "sall      $24, %%edx       \n\t" // make low byte, high byte
00984 
00985                 "secondloop24:                \n\t"
00986                   "sall      %%edx            \n\t" // move high bit to CF
00987                   "jnc       skip24           \n\t" // if CF = 0
00988                   "movw      (%%esi), %%ax    \n\t"
00989                   "movw      %%ax, (%%edi)    \n\t"
00990                   "xorl      %%eax, %%eax     \n\t"
00991                   "movb      2(%%esi), %%al   \n\t"
00992                   "movb      %%al, 2(%%edi)   \n\t"
00993 
00994                 "skip24:                      \n\t"
00995                   "addl      $3, %%esi        \n\t"
00996                   "addl      $3, %%edi        \n\t"
00997                   "decl      %%ecx            \n\t"
00998                   "jnz       secondloop24     \n\t"
00999 
01000                 "end24:                       \n\t"
01001                   "EMMS                       \n\t" // DONE
01002 
01003                   : "=a" (dummy_value_a),           // output regs (dummy)
01004                     "=d" (dummy_value_d),
01005                     "=c" (dummy_value_c),
01006                     "=S" (dummy_value_S),
01007                     "=D" (dummy_value_D)
01008 
01009                   : "3" (srcptr),      // esi       // input regs
01010                     "4" (dstptr),      // edi
01011                     "0" (diff),        // eax
01012 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
01013                     "2" (len),         // ecx
01014                     "1" (mask)         // edx
01015 
01016 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
01017                   : "%mm0", "%mm1", "%mm2"          // clobber list
01018                   , "%mm4", "%mm5", "%mm6", "%mm7"
01019 #endif
01020                );
01021             }
01022             else /* mmx _not supported - Use modified C routine */
01023 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
01024             {
01025                register png_uint_32 i;
01026                png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
01027                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
01028                register int stride = BPP3 * png_pass_inc[png_ptr->pass];
01029                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
01030                register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
01031                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
01032                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
01033                int diff = (int) (png_ptr->width & 7); /* amount lost */
01034                register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
01035 
01036                srcptr = png_ptr->row_buf + 1 + initial_val;
01037                dstptr = row + initial_val;
01038 
01039                for (i = initial_val; i < final_val; i += stride)
01040                {
01041                   png_memcpy(dstptr, srcptr, rep_bytes);
01042                   srcptr += stride;
01043                   dstptr += stride;
01044                }
01045                if (diff)  /* number of leftover pixels:  3 for pngtest */
01046                {
01047                   final_val+=diff*BPP3;
01048                   for (; i < final_val; i += stride)
01049                   {
01050                      if (rep_bytes > (int)(final_val-i))
01051                         rep_bytes = (int)(final_val-i);
01052                      png_memcpy(dstptr, srcptr, rep_bytes);
01053                      srcptr += stride;
01054                      dstptr += stride;
01055                   }
01056                }
01057             } /* end of else (_mmx_supported) */
01058 
01059             break;
01060          }       /* end 24 bpp */
01061 
01062          case 32:       /* png_ptr->row_info.pixel_depth */
01063          {
01064             png_bytep srcptr;
01065             png_bytep dstptr;
01066 
01067 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
01068 #if !defined(PNG_1_0_X)
01069             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
01070                 /* && _mmx_supported */ )
01071 #else
01072             if (_mmx_supported)
01073 #endif
01074             {
01075                png_uint_32 len;
01076                int diff;
01077                int dummy_value_a;   // fix 'forbidden register spilled' error
01078                int dummy_value_d;
01079                int dummy_value_c;
01080                int dummy_value_S;
01081                int dummy_value_D;
01082                _unmask = ~mask;            // global variable for -fPIC version
01083                srcptr = png_ptr->row_buf + 1;
01084                dstptr = row;
01085                len  = png_ptr->width &~7;  // reduce to multiple of 8
01086                diff = (int) (png_ptr->width & 7); // amount lost //
01087 
01088                __asm__ __volatile__ (
01089                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
01090                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
01091                   "punpcklbw %%mm7, %%mm7     \n\t"
01092                   "punpcklwd %%mm7, %%mm7     \n\t"
01093                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
01094 
01095                   "movq      _mask32_0, %%mm0 \n\t"
01096                   "movq      _mask32_1, %%mm1 \n\t"
01097                   "movq      _mask32_2, %%mm2 \n\t"
01098                   "movq      _mask32_3, %%mm3 \n\t"
01099 
01100                   "pand      %%mm7, %%mm0     \n\t"
01101                   "pand      %%mm7, %%mm1     \n\t"
01102                   "pand      %%mm7, %%mm2     \n\t"
01103                   "pand      %%mm7, %%mm3     \n\t"
01104 
01105                   "pcmpeqb   %%mm6, %%mm0     \n\t"
01106                   "pcmpeqb   %%mm6, %%mm1     \n\t"
01107                   "pcmpeqb   %%mm6, %%mm2     \n\t"
01108                   "pcmpeqb   %%mm6, %%mm3     \n\t"
01109 
01110 // preload        "movl      len, %%ecx       \n\t" // load length of line
01111 // preload        "movl      srcptr, %%esi    \n\t" // load source
01112 // preload        "movl      dstptr, %%edi    \n\t" // load dest
01113 
01114                   "cmpl      $0, %%ecx        \n\t" // lcr
01115                   "jz        mainloop32end    \n\t"
01116 
01117                 "mainloop32:                  \n\t"
01118                   "movq      (%%esi), %%mm4   \n\t"
01119                   "pand      %%mm0, %%mm4     \n\t"
01120                   "movq      %%mm0, %%mm6     \n\t"
01121                   "movq      (%%edi), %%mm7   \n\t"
01122                   "pandn     %%mm7, %%mm6     \n\t"
01123                   "por       %%mm6, %%mm4     \n\t"
01124                   "movq      %%mm4, (%%edi)   \n\t"
01125 
01126                   "movq      8(%%esi), %%mm5  \n\t"
01127                   "pand      %%mm1, %%mm5     \n\t"
01128                   "movq      %%mm1, %%mm7     \n\t"
01129                   "movq      8(%%edi), %%mm6  \n\t"
01130                   "pandn     %%mm6, %%mm7     \n\t"
01131                   "por       %%mm7, %%mm5     \n\t"
01132                   "movq      %%mm5, 8(%%edi)  \n\t"
01133 
01134                   "movq      16(%%esi), %%mm6 \n\t"
01135                   "pand      %%mm2, %%mm6     \n\t"
01136                   "movq      %%mm2, %%mm4     \n\t"
01137                   "movq      16(%%edi), %%mm7 \n\t"
01138                   "pandn     %%mm7, %%mm4     \n\t"
01139                   "por       %%mm4, %%mm6     \n\t"
01140                   "movq      %%mm6, 16(%%edi) \n\t"
01141 
01142                   "movq      24(%%esi), %%mm7 \n\t"
01143                   "pand      %%mm3, %%mm7     \n\t"
01144                   "movq      %%mm3, %%mm5     \n\t"
01145                   "movq      24(%%edi), %%mm4 \n\t"
01146                   "pandn     %%mm4, %%mm5     \n\t"
01147                   "por       %%mm5, %%mm7     \n\t"
01148                   "movq      %%mm7, 24(%%edi) \n\t"
01149 
01150                   "addl      $32, %%esi       \n\t" // inc by 32 bytes processed
01151                   "addl      $32, %%edi       \n\t"
01152                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
01153                   "ja        mainloop32       \n\t"
01154 
01155                 "mainloop32end:               \n\t"
01156 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
01157                   "movl      %%eax, %%ecx     \n\t"
01158                   "cmpl      $0, %%ecx        \n\t"
01159                   "jz        end32            \n\t"
01160 // preload        "movl      mask, %%edx      \n\t"
01161                   "sall      $24, %%edx       \n\t" // low byte => high byte
01162 
01163                 "secondloop32:                \n\t"
01164                   "sall      %%edx            \n\t" // move high bit to CF
01165                   "jnc       skip32           \n\t" // if CF = 0
01166                   "movl      (%%esi), %%eax   \n\t"
01167                   "movl      %%eax, (%%edi)   \n\t"
01168 
01169                 "skip32:                      \n\t"
01170                   "addl      $4, %%esi        \n\t"
01171                   "addl      $4, %%edi        \n\t"
01172                   "decl      %%ecx            \n\t"
01173                   "jnz       secondloop32     \n\t"
01174 
01175                 "end32:                       \n\t"
01176                   "EMMS                       \n\t" // DONE
01177 
01178                   : "=a" (dummy_value_a),           // output regs (dummy)
01179                     "=d" (dummy_value_d),
01180                     "=c" (dummy_value_c),
01181                     "=S" (dummy_value_S),
01182                     "=D" (dummy_value_D)
01183 
01184                   : "3" (srcptr),      // esi       // input regs
01185                     "4" (dstptr),      // edi
01186                     "0" (diff),        // eax
01187 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
01188                     "2" (len),         // ecx
01189                     "1" (mask)         // edx
01190 
01191 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
01192                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
01193                   , "%mm4", "%mm5", "%mm6", "%mm7"
01194 #endif
01195                );
01196             }
01197             else /* mmx _not supported - Use modified C routine */
01198 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
01199             {
01200                register png_uint_32 i;
01201                png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
01202                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
01203                register int stride = BPP4 * png_pass_inc[png_ptr->pass];
01204                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
01205                register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
01206                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
01207                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
01208                int diff = (int) (png_ptr->width & 7); /* amount lost */
01209                register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
01210 
01211                srcptr = png_ptr->row_buf + 1 + initial_val;
01212                dstptr = row + initial_val;
01213 
01214                for (i = initial_val; i < final_val; i += stride)
01215                {
01216                   png_memcpy(dstptr, srcptr, rep_bytes);
01217                   srcptr += stride;
01218                   dstptr += stride;
01219                }
01220                if (diff)  /* number of leftover pixels:  3 for pngtest */
01221                {
01222                   final_val+=diff*BPP4;
01223                   for (; i < final_val; i += stride)
01224                   {
01225                      if (rep_bytes > (int)(final_val-i))
01226                         rep_bytes = (int)(final_val-i);
01227                      png_memcpy(dstptr, srcptr, rep_bytes);
01228                      srcptr += stride;
01229                      dstptr += stride;
01230                   }
01231                }
01232             } /* end of else (_mmx_supported) */
01233 
01234             break;
01235          }       /* end 32 bpp */
01236 
01237          case 48:       /* png_ptr->row_info.pixel_depth */
01238          {
01239             png_bytep srcptr;
01240             png_bytep dstptr;
01241 
01242 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
01243 #if !defined(PNG_1_0_X)
01244             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
01245                 /* && _mmx_supported */ )
01246 #else
01247             if (_mmx_supported)
01248 #endif
01249             {
                     /* MMX path for 48-bit pixels (6 bytes per pixel).
                      *
                      * Merges the source row (png_ptr->row_buf + 1) into row:
                      *   - main loop:  8 pixels (48 bytes) per pass, using six
                      *     byte-select masks held in mm0..mm5;
                      *   - tail loop:  the remaining (width & 7) pixels, one per
                      *     pass, copied when the next bit of mask (taken from
                      *     bit 7 of its low byte downward) is set.
                      *
                      * NOTE(review): _unmask and _mask48_0.._mask48_5 are defined
                      * outside this view; the select masks are presumably one
                      * selector bit per byte, 6 bytes per pixel -- confirm there.
                      */
01250                png_uint_32 len;
01251                int diff;
01252                int dummy_value_a;   // fix 'forbidden register spilled' error
01253                int dummy_value_d;
01254                int dummy_value_c;
01255                int dummy_value_S;
01256                int dummy_value_D;
01257                _unmask = ~mask;            // global variable for -fPIC version
01258                srcptr = png_ptr->row_buf + 1;
01259                dstptr = row;
01260                len  = png_ptr->width &~7;  // reduce to multiple of 8
01261                diff = (int) (png_ptr->width & 7); // leftover pixels for tail loop
01262 
01263                __asm__ __volatile__ (
01264                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
01265                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
01266                   "punpcklbw %%mm7, %%mm7     \n\t"
01267                   "punpcklwd %%mm7, %%mm7     \n\t"
01268                   "punpckldq %%mm7, %%mm7     \n\t" // low byte of _unmask x 8
01269 
01270                   "movq      _mask48_0, %%mm0 \n\t"
01271                   "movq      _mask48_1, %%mm1 \n\t"
01272                   "movq      _mask48_2, %%mm2 \n\t"
01273                   "movq      _mask48_3, %%mm3 \n\t"
01274                   "movq      _mask48_4, %%mm4 \n\t"
01275                   "movq      _mask48_5, %%mm5 \n\t"
01276 
01277                   "pand      %%mm7, %%mm0     \n\t"
01278                   "pand      %%mm7, %%mm1     \n\t"
01279                   "pand      %%mm7, %%mm2     \n\t"
01280                   "pand      %%mm7, %%mm3     \n\t"
01281                   "pand      %%mm7, %%mm4     \n\t"
01282                   "pand      %%mm7, %%mm5     \n\t"
01283 
01284                   "pcmpeqb   %%mm6, %%mm0     \n\t" // byte = 0xFF where the
01285                   "pcmpeqb   %%mm6, %%mm1     \n\t" // AND above gave zero,
01286                   "pcmpeqb   %%mm6, %%mm2     \n\t" // else 0x00; 0xFF bytes
01287                   "pcmpeqb   %%mm6, %%mm3     \n\t" // select the source row
01288                   "pcmpeqb   %%mm6, %%mm4     \n\t" // in the main loop
01289                   "pcmpeqb   %%mm6, %%mm5     \n\t"
01290 
01291 // preload        "movl      len, %%ecx       \n\t" // load length of line
01292 // preload        "movl      srcptr, %%esi    \n\t" // load source
01293 // preload        "movl      dstptr, %%edi    \n\t" // load dest
01294 
01295                   "cmpl      $0, %%ecx        \n\t"
01296                   "jz        mainloop48end    \n\t" // fewer than 8 pixels
01297 
01298                 "mainloop48:                  \n\t"
01299                   "movq      (%%esi), %%mm7   \n\t"
01300                   "pand      %%mm0, %%mm7     \n\t" // src & select
01301                   "movq      %%mm0, %%mm6     \n\t"
01302                   "pandn     (%%edi), %%mm6   \n\t" // dst & ~select
01303                   "por       %%mm6, %%mm7     \n\t"
01304                   "movq      %%mm7, (%%edi)   \n\t"
01305 
01306                   "movq      8(%%esi), %%mm6  \n\t"
01307                   "pand      %%mm1, %%mm6     \n\t"
01308                   "movq      %%mm1, %%mm7     \n\t"
01309                   "pandn     8(%%edi), %%mm7  \n\t"
01310                   "por       %%mm7, %%mm6     \n\t"
01311                   "movq      %%mm6, 8(%%edi)  \n\t"
01312 
01313                   "movq      16(%%esi), %%mm6 \n\t"
01314                   "pand      %%mm2, %%mm6     \n\t"
01315                   "movq      %%mm2, %%mm7     \n\t"
01316                   "pandn     16(%%edi), %%mm7 \n\t"
01317                   "por       %%mm7, %%mm6     \n\t"
01318                   "movq      %%mm6, 16(%%edi) \n\t"
01319 
01320                   "movq      24(%%esi), %%mm7 \n\t"
01321                   "pand      %%mm3, %%mm7     \n\t"
01322                   "movq      %%mm3, %%mm6     \n\t"
01323                   "pandn     24(%%edi), %%mm6 \n\t"
01324                   "por       %%mm6, %%mm7     \n\t"
01325                   "movq      %%mm7, 24(%%edi) \n\t"
01326 
01327                   "movq      32(%%esi), %%mm6 \n\t"
01328                   "pand      %%mm4, %%mm6     \n\t"
01329                   "movq      %%mm4, %%mm7     \n\t"
01330                   "pandn     32(%%edi), %%mm7 \n\t"
01331                   "por       %%mm7, %%mm6     \n\t"
01332                   "movq      %%mm6, 32(%%edi) \n\t"
01333 
01334                   "movq      40(%%esi), %%mm7 \n\t"
01335                   "pand      %%mm5, %%mm7     \n\t"
01336                   "movq      %%mm5, %%mm6     \n\t"
01337                   "pandn     40(%%edi), %%mm6 \n\t"
01338                   "por       %%mm6, %%mm7     \n\t"
01339                   "movq      %%mm7, 40(%%edi) \n\t"
01340 
01341                   "addl      $48, %%esi       \n\t" // inc by 48 bytes processed
01342                   "addl      $48, %%edi       \n\t"
01343                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
01344 
01345                   "ja        mainloop48       \n\t"
01346 
01347                 "mainloop48end:               \n\t"
01348 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
01349                   "movl      %%eax, %%ecx     \n\t" // ecx = leftover pixel count
01350                   "cmpl      $0, %%ecx        \n\t"
01351                   "jz        end48            \n\t"
01352 // preload        "movl      mask, %%edx      \n\t"
01353                   "sall      $24, %%edx       \n\t" // make low byte, high byte
01354 
01355                 "secondloop48:                \n\t"
01356                   "sall      %%edx            \n\t" // move high bit to CF
01357                   "jnc       skip48           \n\t" // if CF = 0
01358                   "movl      (%%esi), %%eax   \n\t" // copy bytes 0-3 of pixel
01359                   "movl      %%eax, (%%edi)   \n\t"
                        "movw      4(%%esi), %%ax   \n\t" // copy bytes 4-5 of pixel
                        "movw      %%ax, 4(%%edi)   \n\t" //   (was omitted)
01360 
01361                 "skip48:                      \n\t"
01362                   "addl      $6, %%esi        \n\t" // one 6-byte pixel (was 4,
01363                   "addl      $6, %%edi        \n\t" //   which misaligned the tail)
01364                   "decl      %%ecx            \n\t"
01365                   "jnz       secondloop48     \n\t"
01366 
01367                 "end48:                       \n\t"
01368                   "EMMS                       \n\t" // DONE
01369 
01370                   : "=a" (dummy_value_a),           // output regs (dummy)
01371                     "=d" (dummy_value_d),
01372                     "=c" (dummy_value_c),
01373                     "=S" (dummy_value_S),
01374                     "=D" (dummy_value_D)
01375 
01376                   : "3" (srcptr),      // esi       // input regs
01377                     "4" (dstptr),      // edi
01378                     "0" (diff),        // eax
01379 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
01380                     "2" (len),         // ecx
01381                     "1" (mask)         // edx
01382 
01383 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
01384                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
01385                   , "%mm4", "%mm5", "%mm6", "%mm7"
01386 #endif
01387                );
01388             }
01389             else /* mmx _not supported - Use modified C routine */
01390 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
01391             {
01392                register png_uint_32 i;
01393                png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
01394                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
01395                register int stride = BPP6 * png_pass_inc[png_ptr->pass];
01396                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
01397                register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
01398                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
01399                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
01400                int diff = (int) (png_ptr->width & 7); /* amount lost */
01401                register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
01402 
01403                srcptr = png_ptr->row_buf + 1 + initial_val;
01404                dstptr = row + initial_val;
01405 
01406                for (i = initial_val; i < final_val; i += stride)
01407                {
01408                   png_memcpy(dstptr, srcptr, rep_bytes);
01409                   srcptr += stride;
01410                   dstptr += stride;
01411                }
01412                if (diff)  /* number of leftover pixels:  3 for pngtest */
01413                {
01414                   final_val+=diff*BPP6;
01415                   for (; i < final_val; i += stride)
01416                   {
01417                      if (rep_bytes > (int)(final_val-i))
01418                         rep_bytes = (int)(final_val-i);
01419                      png_memcpy(dstptr, srcptr, rep_bytes);
01420                      srcptr += stride;
01421                      dstptr += stride;
01422                   }
01423                }
01424             } /* end of else (_mmx_supported) */
01425 
01426             break;
01427          }       /* end 48 bpp */
01428 
01429          case 64:       /* png_ptr->row_info.pixel_depth */
01430          {
01431             png_bytep srcptr;
01432             png_bytep dstptr;
01433             register png_uint_32 i;
01434             png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
01435               /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
01436             register int stride = BPP8 * png_pass_inc[png_ptr->pass];
01437               /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
01438             register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
01439               /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
01440             png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
01441             int diff = (int) (png_ptr->width & 7); /* amount lost */
01442             register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
01443 
01444             srcptr = png_ptr->row_buf + 1 + initial_val;
01445             dstptr = row + initial_val;
01446 
01447             for (i = initial_val; i < final_val; i += stride)
01448             {
01449                png_memcpy(dstptr, srcptr, rep_bytes);
01450                srcptr += stride;
01451                dstptr += stride;
01452             }
01453             if (diff)  /* number of leftover pixels:  3 for pngtest */
01454             {
01455                final_val+=diff*BPP8;
01456                for (; i < final_val; i += stride)
01457                {
01458                   if (rep_bytes > (int)(final_val-i))
01459                      rep_bytes = (int)(final_val-i);
01460                   png_memcpy(dstptr, srcptr, rep_bytes);
01461                   srcptr += stride;
01462                   dstptr += stride;
01463                }
01464             }
01465 
01466             break;
01467          }       /* end 64 bpp */
01468 
01469          default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
01470          {
01471             /* this should never happen */
01472             png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
01473             break;
01474          }
01475       } /* end switch (png_ptr->row_info.pixel_depth) */
01476 
01477    } /* end if (non-trivial mask) */
01478 
01479 } /* end png_combine_row() */
01480 
01481 #endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
01482 
01483 
01484 
01485 
01486 /*===========================================================================*/
01487 /*                                                                           */
01488 /*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
01489 /*                                                                           */
01490 /*===========================================================================*/
01491 
01492 #if defined(PNG_READ_INTERLACING_SUPPORTED)
01493 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
01494 
01495 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
01496  * has taken place.  [GRR: what other steps come before and/or after?]
01497  */
01498 
01499 void /* PRIVATE */
01500 png_do_read_interlace(png_structp png_ptr)
01501 {
01502    png_row_infop row_info = &(png_ptr->row_info);
01503    png_bytep row = png_ptr->row_buf + 1;
01504    int pass = png_ptr->pass;
01505 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
01506    png_uint_32 transformations = png_ptr->transformations;
01507 #endif
01508 
01509    png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
01510 
01511 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
01512    if (_mmx_supported == 2) {
01513 #if !defined(PNG_1_0_X)
01514        /* this should have happened in png_init_mmx_flags() already */
01515        png_warning(png_ptr, "asm_flags may not have been initialized");
01516 #endif
01517        png_mmx_support();
01518    }
01519 #endif
01520 
01521    if (row != NULL && row_info != NULL)
01522    {
01523       png_uint_32 final_width;
01524 
01525       final_width = row_info->width * png_pass_inc[pass];
01526 
01527       switch (row_info->pixel_depth)
01528       {
01529          case 1:
01530          {
01531             png_bytep sp, dp;
01532             int sshift, dshift;
01533             int s_start, s_end, s_inc;
01534             png_byte v;
01535             png_uint_32 i;
01536             int j;
01537 
01538             sp = row + (png_size_t)((row_info->width - 1) >> 3);
01539             dp = row + (png_size_t)((final_width - 1) >> 3);
01540 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
01541             if (transformations & PNG_PACKSWAP)
01542             {
01543                sshift = (int)((row_info->width + 7) & 7);
01544                dshift = (int)((final_width + 7) & 7);
01545                s_start = 7;
01546                s_end = 0;
01547                s_inc = -1;
01548             }
01549             else
01550 #endif
01551             {
01552                sshift = 7 - (int)((row_info->width + 7) & 7);
01553                dshift = 7 - (int)((final_width + 7) & 7);
01554                s_start = 0;
01555                s_end = 7;
01556                s_inc = 1;
01557             }
01558 
01559             for (i = row_info->width; i; i--)
01560             {
01561                v = (png_byte)((*sp >> sshift) & 0x1);
01562                for (j = 0; j < png_pass_inc[pass]; j++)
01563                {
01564                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
01565                   *dp |= (png_byte)(v << dshift);
01566                   if (dshift == s_end)
01567                   {
01568                      dshift = s_start;
01569                      dp--;
01570                   }
01571                   else
01572                      dshift += s_inc;
01573                }
01574                if (sshift == s_end)
01575                {
01576                   sshift = s_start;
01577                   sp--;
01578                }
01579                else
01580                   sshift += s_inc;
01581             }
01582             break;
01583          }
01584 
01585          case 2:
01586          {
01587             png_bytep sp, dp;
01588             int sshift, dshift;
01589             int s_start, s_end, s_inc;
01590             png_uint_32 i;
01591 
01592             sp = row + (png_size_t)((row_info->width - 1) >> 2);
01593             dp = row + (png_size_t)((final_width - 1) >> 2);
01594 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
01595             if (transformations & PNG_PACKSWAP)
01596             {
01597                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
01598                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
01599                s_start = 6;
01600                s_end = 0;
01601                s_inc = -2;
01602             }
01603             else
01604 #endif
01605             {
01606                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
01607                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
01608                s_start = 0;
01609                s_end = 6;
01610                s_inc = 2;
01611             }
01612 
01613             for (i = row_info->width; i; i--)
01614             {
01615                png_byte v;
01616                int j;
01617 
01618                v = (png_byte)((*sp >> sshift) & 0x3);
01619                for (j = 0; j < png_pass_inc[pass]; j++)
01620                {
01621                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
01622                   *dp |= (png_byte)(v << dshift);
01623                   if (dshift == s_end)
01624                   {
01625                      dshift = s_start;
01626                      dp--;
01627                   }
01628                   else
01629                      dshift += s_inc;
01630                }
01631                if (sshift == s_end)
01632                {
01633                   sshift = s_start;
01634                   sp--;
01635                }
01636                else
01637                   sshift += s_inc;
01638             }
01639             break;
01640          }
01641 
01642          case 4:
01643          {
01644             png_bytep sp, dp;
01645             int sshift, dshift;
01646             int s_start, s_end, s_inc;
01647             png_uint_32 i;
01648 
01649             sp = row + (png_size_t)((row_info->width - 1) >> 1);
01650             dp = row + (png_size_t)((final_width - 1) >> 1);
01651 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
01652             if (transformations & PNG_PACKSWAP)
01653             {
01654                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
01655                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
01656                s_start = 4;
01657                s_end = 0;
01658                s_inc = -4;
01659             }
01660             else
01661 #endif
01662             {
01663                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
01664                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
01665                s_start = 0;
01666                s_end = 4;
01667                s_inc = 4;
01668             }
01669 
01670             for (i = row_info->width; i; i--)
01671             {
01672                png_byte v;
01673                int j;
01674 
01675                v = (png_byte)((*sp >> sshift) & 0xf);
01676                for (j = 0; j < png_pass_inc[pass]; j++)
01677                {
01678                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
01679                   *dp |= (png_byte)(v << dshift);
01680                   if (dshift == s_end)
01681                   {
01682                      dshift = s_start;
01683                      dp--;
01684                   }
01685                   else
01686                      dshift += s_inc;
01687                }
01688                if (sshift == s_end)
01689                {
01690                   sshift = s_start;
01691                   sp--;
01692                }
01693                else
01694                   sshift += s_inc;
01695             }
01696             break;
01697          }
01698 
01699        /*====================================================================*/
01700 
01701          default: /* 8-bit or larger (this is where the routine is modified) */
01702          {
01703 #if 0
01704 //          static unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good
01705 //          static unsigned long long const4 = 0x0000000000FFFFFFLL;   no good
01706 //          unsigned long long _const4 = 0x0000000000FFFFFFLL;         no good
01707 //          unsigned long long const4 = 0x0000000000FFFFFFLL;          no good
01708 #endif
01709             png_bytep sptr, dp;
01710             png_uint_32 i;
01711             png_size_t pixel_bytes;
01712             int width = (int)row_info->width;
01713 
01714             pixel_bytes = (row_info->pixel_depth >> 3);
01715 
01716             /* point sptr at the last pixel in the pre-expanded row: */
01717             sptr = row + (width - 1) * pixel_bytes;
01718 
01719             /* point dp at the last pixel position in the expanded row: */
01720             dp = row + (final_width - 1) * pixel_bytes;
01721 
01722             /* New code by Nirav Chhatrapati - Intel Corporation */
01723 
01724 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
01725 #if !defined(PNG_1_0_X)
01726             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
01727                 /* && _mmx_supported */ )
01728 #else
01729             if (_mmx_supported)
01730 #endif
01731             {
01732                //--------------------------------------------------------------
01733                if (pixel_bytes == 3)
01734                {
01735                   if (((pass == 0) || (pass == 1)) && width)
01736                   {
01737                      int dummy_value_c;   // fix 'forbidden register spilled'
01738                      int dummy_value_S;
01739                      int dummy_value_D;
01740 
01741                      __asm__ __volatile__ (
01742                         "subl $21, %%edi         \n\t"
01743                                      // (png_pass_inc[pass] - 1)*pixel_bytes
01744 
01745                      ".loop3_pass0:              \n\t"
01746                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
01747                         "pand _const4, %%mm0     \n\t" // z z z z z 2 1 0
01748                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
01749                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
01750                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
01751                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
01752                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
01753                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
01754                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
01755                         "movq %%mm0, %%mm3       \n\t" // 2 1 0 2 1 0 2 1
01756                         "psllq $16, %%mm0        \n\t" // 0 2 1 0 2 1 z z
01757                         "movq %%mm3, %%mm4       \n\t" // 2 1 0 2 1 0 2 1
01758                         "punpckhdq %%mm0, %%mm3  \n\t" // 0 2 1 0 2 1 0 2
01759                         "movq %%mm4, 16(%%edi)   \n\t"
01760                         "psrlq $32, %%mm0        \n\t" // z z z z 0 2 1 0
01761                         "movq %%mm3, 8(%%edi)    \n\t"
01762                         "punpckldq %%mm4, %%mm0  \n\t" // 1 0 2 1 0 2 1 0
01763                         "subl $3, %%esi          \n\t"
01764                         "movq %%mm0, (%%edi)     \n\t"
01765                         "subl $24, %%edi         \n\t"
01766                         "decl %%ecx              \n\t"
01767                         "jnz .loop3_pass0        \n\t"
01768                         "EMMS                    \n\t" // DONE
01769 
01770                         : "=c" (dummy_value_c),        // output regs (dummy)
01771                           "=S" (dummy_value_S),
01772                           "=D" (dummy_value_D)
01773 
01774                         : "1" (sptr),      // esi      // input regs
01775                           "2" (dp),        // edi
01776                           "0" (width),     // ecx
01777                           "rim" (_const4)  // %1(?)  (0x0000000000FFFFFFLL)
01778 
01779 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
01780                         : "%mm0", "%mm1", "%mm2"       // clobber list
01781                         , "%mm3", "%mm4"
01782 #endif
01783                      );
01784                   }
01785                   else if (((pass == 2) || (pass == 3)) && width)
01786                   {
01787                      int dummy_value_c;   // fix 'forbidden register spilled'
01788                      int dummy_value_S;
01789                      int dummy_value_D;
01790 
01791                      __asm__ __volatile__ (
01792                         "subl $9, %%edi          \n\t"
01793                                      // (png_pass_inc[pass] - 1)*pixel_bytes
01794 
01795                      ".loop3_pass2:              \n\t"
01796                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
01797                         "pand _const4, %%mm0     \n\t" // z z z z z 2 1 0
01798                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
01799                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
01800                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
01801                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
01802                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
01803                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
01804                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
01805                         "movq %%mm0, 4(%%edi)    \n\t"
01806                         "psrlq $16, %%mm0        \n\t" // z z 2 1 0 2 1 0
01807                         "subl $3, %%esi          \n\t"
01808                         "movd %%mm0, (%%edi)     \n\t"
01809                         "subl $12, %%edi         \n\t"
01810                         "decl %%ecx              \n\t"
01811                         "jnz .loop3_pass2        \n\t"
01812                         "EMMS                    \n\t" // DONE
01813 
01814                         : "=c" (dummy_value_c),        // output regs (dummy)
01815                           "=S" (dummy_value_S),
01816                           "=D" (dummy_value_D)
01817 
01818                         : "1" (sptr),      // esi      // input regs
01819                           "2" (dp),        // edi
01820                           "0" (width),     // ecx
01821                           "rim" (_const4)  // (0x0000000000FFFFFFLL)
01822 
01823 #if 0  /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
01824                         : "%mm0", "%mm1", "%mm2"       // clobber list
01825 #endif
01826                      );
01827                   }
01828                   else if (width) /* && ((pass == 4) || (pass == 5)) */
01829                   {
01830                      int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
01831                      if (width_mmx < 0)
01832                          width_mmx = 0;
01833                      width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
01834                      if (width_mmx)
01835                      {
01836                         // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
01837                         // sptr points at last pixel in pre-expanded row
01838                         // dp points at last pixel position in expanded row
01839                         int dummy_value_c;  // fix 'forbidden register spilled'
01840                         int dummy_value_S;
01841                         int dummy_value_D;
01842 
01843                         __asm__ __volatile__ (
01844                            "subl $3, %%esi          \n\t"
01845                            "subl $9, %%edi          \n\t"
01846                                         // (png_pass_inc[pass] + 1)*pixel_bytes
01847 
01848                         ".loop3_pass4:              \n\t"
01849                            "movq (%%esi), %%mm0     \n\t" // x x 5 4 3 2 1 0
01850                            "movq %%mm0, %%mm1       \n\t" // x x 5 4 3 2 1 0
01851                            "movq %%mm0, %%mm2       \n\t" // x x 5 4 3 2 1 0
01852                            "psllq $24, %%mm0        \n\t" // 4 3 2 1 0 z z z
01853                            "pand _const4, %%mm1     \n\t" // z z z z z 2 1 0
01854                            "psrlq $24, %%mm2        \n\t" // z z z x x 5 4 3
01855                            "por %%mm1, %%mm0        \n\t" // 4 3 2 1 0 2 1 0
01856                            "movq %%mm2, %%mm3       \n\t" // z z z x x 5 4 3
01857                            "psllq $8, %%mm2         \n\t" // z z x x 5 4 3 z
01858                            "movq %%mm0, (%%edi)     \n\t"
01859                            "psrlq $16, %%mm3        \n\t" // z z z z z x x 5
01860                            "pand _const6, %%mm3     \n\t" // z z z z z z z 5
01861                            "por %%mm3, %%mm2        \n\t" // z z x x 5 4 3 5
01862                            "subl $6, %%esi          \n\t"
01863                            "movd %%mm2, 8(%%edi)    \n\t"
01864                            "subl $12, %%edi         \n\t"
01865                            "subl $2, %%ecx          \n\t"
01866                            "jnz .loop3_pass4        \n\t"
01867                            "EMMS                    \n\t" // DONE
01868 
01869                            : "=c" (dummy_value_c),        // output regs (dummy)
01870                              "=S" (dummy_value_S),
01871                              "=D" (dummy_value_D)
01872 
01873                            : "1" (sptr),      // esi      // input regs
01874                              "2" (dp),        // edi
01875                              "0" (width_mmx), // ecx
01876                              "rim" (_const4), // 0x0000000000FFFFFFLL
01877                              "rim" (_const6)  // 0x00000000000000FFLL
01878 
01879 #if 0  /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
01880                            : "%mm0", "%mm1"               // clobber list
01881                            , "%mm2", "%mm3"
01882 #endif
01883                         );
01884                      }
01885 
01886                      sptr -= width_mmx*3;
01887                      dp -= width_mmx*6;
01888                      for (i = width; i; i--)
01889                      {
01890                         png_byte v[8];
01891                         int j;
01892 
01893                         png_memcpy(v, sptr, 3);
01894                         for (j = 0; j < png_pass_inc[pass]; j++)
01895                         {
01896                            png_memcpy(dp, v, 3);
01897                            dp -= 3;
01898                         }
01899                         sptr -= 3;
01900                      }
01901                   }
01902                } /* end of pixel_bytes == 3 */
01903 
01904                //--------------------------------------------------------------
01905                else if (pixel_bytes == 1)
01906                {
01907                   if (((pass == 0) || (pass == 1)) && width)
01908                   {
01909                      int width_mmx = ((width >> 2) << 2);
01910                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
01911                      if (width_mmx)
01912                      {
01913                         int dummy_value_c;  // fix 'forbidden register spilled'
01914                         int dummy_value_S;
01915                         int dummy_value_D;
01916 
01917                         __asm__ __volatile__ (
01918                            "subl $3, %%esi          \n\t"
01919                            "subl $31, %%edi         \n\t"
01920 
01921                         ".loop1_pass0:              \n\t"
01922                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
01923                            "movq %%mm0, %%mm1       \n\t" // x x x x 3 2 1 0
01924                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
01925                            "movq %%mm0, %%mm2       \n\t" // 3 3 2 2 1 1 0 0
01926                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
01927                            "movq %%mm0, %%mm3       \n\t" // 1 1 1 1 0 0 0 0
01928                            "punpckldq %%mm0, %%mm0  \n\t" // 0 0 0 0 0 0 0 0
01929                            "punpckhdq %%mm3, %%mm3  \n\t" // 1 1 1 1 1 1 1 1
01930                            "movq %%mm0, (%%edi)     \n\t"
01931                            "punpckhwd %%mm2, %%mm2  \n\t" // 3 3 3 3 2 2 2 2
01932                            "movq %%mm3, 8(%%edi)    \n\t"
01933                            "movq %%mm2, %%mm4       \n\t" // 3 3 3 3 2 2 2 2
01934                            "punpckldq %%mm2, %%mm2  \n\t" // 2 2 2 2 2 2 2 2
01935                            "punpckhdq %%mm4, %%mm4  \n\t" // 3 3 3 3 3 3 3 3
01936                            "movq %%mm2, 16(%%edi)   \n\t"
01937                            "subl $4, %%esi          \n\t"
01938                            "movq %%mm4, 24(%%edi)   \n\t"
01939                            "subl $32, %%edi         \n\t"
01940                            "subl $4, %%ecx          \n\t"
01941                            "jnz .loop1_pass0        \n\t"
01942                            "EMMS                    \n\t" // DONE
01943 
01944                            : "=c" (dummy_value_c),        // output regs (dummy)
01945                              "=S" (dummy_value_S),
01946                              "=D" (dummy_value_D)
01947 
01948                            : "1" (sptr),      // esi      // input regs
01949                              "2" (dp),        // edi
01950                              "0" (width_mmx)  // ecx
01951 
01952 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
01953                            : "%mm0", "%mm1", "%mm2"       // clobber list
01954                            , "%mm3", "%mm4"
01955 #endif
01956                         );
01957                      }
01958 
01959                      sptr -= width_mmx;
01960                      dp -= width_mmx*8;
01961                      for (i = width; i; i--)
01962                      {
01963                         int j;
01964 
01965                        /* I simplified this part in version 1.0.4e
01966                         * here and in several other instances where
01967                         * pixel_bytes == 1  -- GR-P
01968                         *
01969                         * Original code:
01970                         *
01971                         * png_byte v[8];
01972                         * png_memcpy(v, sptr, pixel_bytes);
01973                         * for (j = 0; j < png_pass_inc[pass]; j++)
01974                         * {
01975                         *    png_memcpy(dp, v, pixel_bytes);
01976                         *    dp -= pixel_bytes;
01977                         * }
01978                         * sptr -= pixel_bytes;
01979                         *
01980                         * Replacement code is in the next three lines:
01981                         */
01982 
01983                         for (j = 0; j < png_pass_inc[pass]; j++)
01984                         {
01985                            *dp-- = *sptr;
01986                         }
01987                         --sptr;
01988                      }
01989                   }
01990                   else if (((pass == 2) || (pass == 3)) && width)
01991                   {
01992                      int width_mmx = ((width >> 2) << 2);
01993                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
01994                      if (width_mmx)
01995                      {
01996                         int dummy_value_c;  // fix 'forbidden register spilled'
01997                         int dummy_value_S;
01998                         int dummy_value_D;
01999 
02000                         __asm__ __volatile__ (
02001                            "subl $3, %%esi          \n\t"
02002                            "subl $15, %%edi         \n\t"
02003 
02004                         ".loop1_pass2:              \n\t"
02005                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
02006                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
02007                            "movq %%mm0, %%mm1       \n\t" // 3 3 2 2 1 1 0 0
02008                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
02009                            "punpckhwd %%mm1, %%mm1  \n\t" // 3 3 3 3 2 2 2 2
02010                            "movq %%mm0, (%%edi)     \n\t"
02011                            "subl $4, %%esi          \n\t"
02012                            "movq %%mm1, 8(%%edi)    \n\t"
02013                            "subl $16, %%edi         \n\t"
02014                            "subl $4, %%ecx          \n\t"
02015                            "jnz .loop1_pass2        \n\t"
02016                            "EMMS                    \n\t" // DONE
02017 
02018                            : "=c" (dummy_value_c),        // output regs (dummy)
02019                              "=S" (dummy_value_S),
02020                              "=D" (dummy_value_D)
02021 
02022                            : "1" (sptr),      // esi      // input regs
02023                              "2" (dp),        // edi
02024                              "0" (width_mmx)  // ecx
02025 
02026 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
02027                            : "%mm0", "%mm1"               // clobber list
02028 #endif
02029                         );
02030                      }
02031 
02032                      sptr -= width_mmx;
02033                      dp -= width_mmx*4;
02034                      for (i = width; i; i--)
02035                      {
02036                         int j;
02037 
02038                         for (j = 0; j < png_pass_inc[pass]; j++)
02039                         {
02040                            *dp-- = *sptr;
02041                         }
02042                         --sptr;
02043                      }
02044                   }
02045                   else if (width)  /* && ((pass == 4) || (pass == 5)) */
02046                   {
02047                      int width_mmx = ((width >> 3) << 3);
02048                      width -= width_mmx;        // 0-7 pixels => 0-7 bytes
02049                      if (width_mmx)
02050                      {
02051                         int dummy_value_c;  // fix 'forbidden register spilled'
02052                         int dummy_value_S;
02053                         int dummy_value_D;
02054 
02055                         __asm__ __volatile__ (
02056                            "subl $7, %%esi          \n\t"
02057                            "subl $15, %%edi         \n\t"
02058 
02059                         ".loop1_pass4:              \n\t"
02060                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
02061                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
02062                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
02063                            "punpckhbw %%mm1, %%mm1  \n\t" // 7 7 6 6 5 5 4 4
02064                            "movq %%mm1, 8(%%edi)    \n\t"
02065                            "subl $8, %%esi          \n\t"
02066                            "movq %%mm0, (%%edi)     \n\t"
02067                            "subl $16, %%edi         \n\t"
02068                            "subl $8, %%ecx          \n\t"
02069                            "jnz .loop1_pass4        \n\t"
02070                            "EMMS                    \n\t" // DONE
02071 
02072                            : "=c" (dummy_value_c),        // output regs (dummy)
02073                              "=S" (dummy_value_S),
02074                              "=D" (dummy_value_D)
02075 
02076                            : "1" (sptr),      // esi      // input regs
02077                              "2" (dp),        // edi
02078                              "0" (width_mmx)  // ecx
02079 
02080 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
02081                            : "%mm0", "%mm1"               // clobber list
02082 #endif
02083                         );
02084                      }
02085 
02086                      sptr -= width_mmx;
02087                      dp -= width_mmx*2;
02088                      for (i = width; i; i--)
02089                      {
02090                         int j;
02091 
02092                         for (j = 0; j < png_pass_inc[pass]; j++)
02093                         {
02094                            *dp-- = *sptr;
02095                         }
02096                         --sptr;
02097                      }
02098                   }
02099                } /* end of pixel_bytes == 1 */
02100 
02101                //--------------------------------------------------------------
02102                else if (pixel_bytes == 2)
02103                {
02104                   if (((pass == 0) || (pass == 1)) && width)
02105                   {
02106                      int width_mmx = ((width >> 1) << 1);
02107                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
02108                      if (width_mmx)
02109                      {
02110                         int dummy_value_c;  // fix 'forbidden register spilled'
02111                         int dummy_value_S;
02112                         int dummy_value_D;
02113 
02114                         __asm__ __volatile__ (
02115                            "subl $2, %%esi          \n\t"
02116                            "subl $30, %%edi         \n\t"
02117 
02118                         ".loop2_pass0:              \n\t"
02119                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
02120                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
02121                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
02122                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
02123                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
02124                            "movq %%mm0, (%%edi)     \n\t"
02125                            "movq %%mm0, 8(%%edi)    \n\t"
02126                            "movq %%mm1, 16(%%edi)   \n\t"
02127                            "subl $4, %%esi          \n\t"
02128                            "movq %%mm1, 24(%%edi)   \n\t"
02129                            "subl $32, %%edi         \n\t"
02130                            "subl $2, %%ecx          \n\t"
02131                            "jnz .loop2_pass0        \n\t"
02132                            "EMMS                    \n\t" // DONE
02133 
02134                            : "=c" (dummy_value_c),        // output regs (dummy)
02135                              "=S" (dummy_value_S),
02136                              "=D" (dummy_value_D)
02137 
02138                            : "1" (sptr),      // esi      // input regs
02139                              "2" (dp),        // edi
02140                              "0" (width_mmx)  // ecx
02141 
02142 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
02143                            : "%mm0", "%mm1"               // clobber list
02144 #endif
02145                         );
02146                      }
02147 
02148                      sptr -= (width_mmx*2 - 2); // sign fixed
02149                      dp -= (width_mmx*16 - 2);  // sign fixed
02150                      for (i = width; i; i--)
02151                      {
02152                         png_byte v[8];
02153                         int j;
02154                         sptr -= 2;
02155                         png_memcpy(v, sptr, 2);
02156                         for (j = 0; j < png_pass_inc[pass]; j++)
02157                         {
02158                            dp -= 2;
02159                            png_memcpy(dp, v, 2);
02160                         }
02161                      }
02162                   }
02163                   else if (((pass == 2) || (pass == 3)) && width)
02164                   {
02165                      int width_mmx = ((width >> 1) << 1) ;
02166                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
02167                      if (width_mmx)
02168                      {
02169                         int dummy_value_c;  // fix 'forbidden register spilled'
02170                         int dummy_value_S;
02171                         int dummy_value_D;
02172 
02173                         __asm__ __volatile__ (
02174                            "subl $2, %%esi          \n\t"
02175                            "subl $14, %%edi         \n\t"
02176 
02177                         ".loop2_pass2:              \n\t"
02178                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
02179                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
02180                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
02181                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
02182                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
02183                            "movq %%mm0, (%%edi)     \n\t"
02184                            "subl $4, %%esi          \n\t"
02185                            "movq %%mm1, 8(%%edi)    \n\t"
02186                            "subl $16, %%edi         \n\t"
02187                            "subl $2, %%ecx          \n\t"
02188                            "jnz .loop2_pass2        \n\t"
02189                            "EMMS                    \n\t" // DONE
02190 
02191                            : "=c" (dummy_value_c),        // output regs (dummy)
02192                              "=S" (dummy_value_S),
02193                              "=D" (dummy_value_D)
02194 
02195                            : "1" (sptr),      // esi      // input regs
02196                              "2" (dp),        // edi
02197                              "0" (width_mmx)  // ecx
02198 
02199 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
02200                            : "%mm0", "%mm1"               // clobber list
02201 #endif
02202                         );
02203                      }
02204 
02205                      sptr -= (width_mmx*2 - 2); // sign fixed
02206                      dp -= (width_mmx*8 - 2);   // sign fixed
02207                      for (i = width; i; i--)
02208                      {
02209                         png_byte v[8];
02210                         int j;
02211                         sptr -= 2;
02212                         png_memcpy(v, sptr, 2);
02213                         for (j = 0; j < png_pass_inc[pass]; j++)
02214                         {
02215                            dp -= 2;
02216                            png_memcpy(dp, v, 2);
02217                         }
02218                      }
02219                   }
02220                   else if (width)  // pass == 4 or 5
02221                   {
02222                      int width_mmx = ((width >> 1) << 1) ;
02223                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
02224                      if (width_mmx)
02225                      {
02226                         int dummy_value_c;  // fix 'forbidden register spilled'
02227                         int dummy_value_S;
02228                         int dummy_value_D;
02229 
02230                         __asm__ __volatile__ (
02231                            "subl $2, %%esi          \n\t"
02232                            "subl $6, %%edi          \n\t"
02233 
02234                         ".loop2_pass4:              \n\t"
02235                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
02236                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
02237                            "subl $4, %%esi          \n\t"
02238                            "movq %%mm0, (%%edi)     \n\t"
02239                            "subl $8, %%edi          \n\t"
02240                            "subl $2, %%ecx          \n\t"
02241                            "jnz .loop2_pass4        \n\t"
02242                            "EMMS                    \n\t" // DONE
02243 
02244                            : "=c" (dummy_value_c),        // output regs (dummy)
02245                              "=S" (dummy_value_S),
02246                              "=D" (dummy_value_D)
02247 
02248                            : "1" (sptr),      // esi      // input regs
02249                              "2" (dp),        // edi
02250                              "0" (width_mmx)  // ecx
02251 
02252 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
02253                            : "%mm0"                       // clobber list
02254 #endif
02255                         );
02256                      }
02257 
02258                      sptr -= (width_mmx*2 - 2); // sign fixed
02259                      dp -= (width_mmx*4 - 2);   // sign fixed
02260                      for (i = width; i; i--)
02261                      {
02262                         png_byte v[8];
02263                         int j;
02264                         sptr -= 2;
02265                         png_memcpy(v, sptr, 2);
02266                         for (j = 0; j < png_pass_inc[pass]; j++)
02267                         {
02268                            dp -= 2;
02269                            png_memcpy(dp, v, 2);
02270                         }
02271                      }
02272                   }
02273                } /* end of pixel_bytes == 2 */
02274 
02275                //--------------------------------------------------------------
02276                else if (pixel_bytes == 4)
02277                {
02278                   if (((pass == 0) || (pass == 1)) && width)
02279                   {
02280                      int width_mmx = ((width >> 1) << 1);
02281                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
02282                      if (width_mmx)
02283                      {
02284                         int dummy_value_c;  // fix 'forbidden register spilled'
02285                         int dummy_value_S;
02286                         int dummy_value_D;
02287 
02288                         __asm__ __volatile__ (
02289                            "subl $4, %%esi          \n\t"
02290                            "subl $60, %%edi         \n\t"
02291 
02292                         ".loop4_pass0:              \n\t"
02293                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
02294                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
02295                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
02296                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
02297                            "movq %%mm0, (%%edi)     \n\t"
02298                            "movq %%mm0, 8(%%edi)    \n\t"
02299                            "movq %%mm0, 16(%%edi)   \n\t"
02300                            "movq %%mm0, 24(%%edi)   \n\t"
02301                            "movq %%mm1, 32(%%edi)   \n\t"
02302                            "movq %%mm1, 40(%%edi)   \n\t"
02303                            "movq %%mm1, 48(%%edi)   \n\t"
02304                            "subl $8, %%esi          \n\t"
02305                            "movq %%mm1, 56(%%edi)   \n\t"
02306                            "subl $64, %%edi         \n\t"
02307                            "subl $2, %%ecx          \n\t"
02308                            "jnz .loop4_pass0        \n\t"
02309                            "EMMS                    \n\t" // DONE
02310 
02311                            : "=c" (dummy_value_c),        // output regs (dummy)
02312                              "=S" (dummy_value_S),
02313                              "=D" (dummy_value_D)
02314 
02315                            : "1" (sptr),      // esi      // input regs
02316                              "2" (dp),        // edi
02317                              "0" (width_mmx)  // ecx
02318 
02319 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
02320                            : "%mm0", "%mm1"               // clobber list
02321 #endif
02322                         );
02323                      }
02324 
02325                      sptr -= (width_mmx*4 - 4); // sign fixed
02326                      dp -= (width_mmx*32 - 4);  // sign fixed
02327                      for (i = width; i; i--)
02328                      {
02329                         png_byte v[8];
02330                         int j;
02331                         sptr -= 4;
02332                         png_memcpy(v, sptr, 4);
02333                         for (j = 0; j < png_pass_inc[pass]; j++)
02334                         {
02335                            dp -= 4;
02336                            png_memcpy(dp, v, 4);
02337                         }
02338                      }
02339                   }
02340                   else if (((pass == 2) || (pass == 3)) && width)
02341                   {
02342                      int width_mmx = ((width >> 1) << 1);
02343                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
02344                      if (width_mmx)
02345                      {
02346                         int dummy_value_c;  // fix 'forbidden register spilled'
02347                         int dummy_value_S;
02348                         int dummy_value_D;
02349 
02350                         __asm__ __volatile__ (
02351                            "subl $4, %%esi          \n\t"
02352                            "subl $28, %%edi         \n\t"
02353 
02354                         ".loop4_pass2:              \n\t"
02355                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
02356                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
02357                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
02358                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
02359                            "movq %%mm0, (%%edi)     \n\t"
02360                            "movq %%mm0, 8(%%edi)    \n\t"
02361                            "movq %%mm1, 16(%%edi)   \n\t"
02362                            "movq %%mm1, 24(%%edi)   \n\t"
02363                            "subl $8, %%esi          \n\t"
02364                            "subl $32, %%edi         \n\t"
02365                            "subl $2, %%ecx          \n\t"
02366                            "jnz .loop4_pass2        \n\t"
02367                            "EMMS                    \n\t" // DONE
02368 
02369                            : "=c" (dummy_value_c),        // output regs (dummy)
02370                              "=S" (dummy_value_S),
02371                              "=D" (dummy_value_D)
02372 
02373                            : "1" (sptr),      // esi      // input regs
02374                              "2" (dp),        // edi
02375                              "0" (width_mmx)  // ecx
02376 
02377 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
02378                            : "%mm0", "%mm1"               // clobber list
02379 #endif
02380                         );
02381                      }
02382 
02383                      sptr -= (width_mmx*4 - 4); // sign fixed
02384                      dp -= (width_mmx*16 - 4);  // sign fixed
02385                      for (i = width; i; i--)
02386                      {
02387                         png_byte v[8];
02388                         int j;
02389                         sptr -= 4;
02390                         png_memcpy(v, sptr, 4);
02391                         for (j = 0; j < png_pass_inc[pass]; j++)
02392                         {
02393                            dp -= 4;
02394                            png_memcpy(dp, v, 4);
02395                         }
02396                      }
02397                   }
02398                   else if (width)  // pass == 4 or 5
02399                   {
02400                      int width_mmx = ((width >> 1) << 1) ;
02401                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
02402                      if (width_mmx)
02403                      {
02404                         int dummy_value_c;  // fix 'forbidden register spilled'
02405                         int dummy_value_S;
02406                         int dummy_value_D;
02407 
02408                         __asm__ __volatile__ (
02409                            "subl $4, %%esi          \n\t"
02410                            "subl $12, %%edi         \n\t"
02411 
02412                         ".loop4_pass4:              \n\t"
02413                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
02414                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
02415                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
02416                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
02417                            "movq %%mm0, (%%edi)     \n\t"
02418                            "subl $8, %%esi          \n\t"
02419                            "movq %%mm1, 8(%%edi)    \n\t"
02420                            "subl $16, %%edi         \n\t"
02421                            "subl $2, %%ecx          \n\t"
02422                            "jnz .loop4_pass4        \n\t"
02423                            "EMMS                    \n\t" // DONE
02424 
02425                            : "=c" (dummy_value_c),        // output regs (dummy)
02426                              "=S" (dummy_value_S),
02427                              "=D" (dummy_value_D)
02428 
02429                            : "1" (sptr),      // esi      // input regs
02430                              "2" (dp),        // edi
02431                              "0" (width_mmx)  // ecx
02432 
02433 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
02434                            : "%mm0", "%mm1"               // clobber list
02435 #endif
02436                         );
02437                      }
02438 
02439                      sptr -= (width_mmx*4 - 4); // sign fixed
02440                      dp -= (width_mmx*8 - 4);   // sign fixed
02441                      for (i = width; i; i--)
02442                      {
02443                         png_byte v[8];
02444                         int j;
02445                         sptr -= 4;
02446                         png_memcpy(v, sptr, 4);
02447                         for (j = 0; j < png_pass_inc[pass]; j++)
02448                         {
02449                            dp -= 4;
02450                            png_memcpy(dp, v, 4);
02451                         }
02452                      }
02453                   }
02454                } /* end of pixel_bytes == 4 */
02455 
02456                //--------------------------------------------------------------
02457                else if (pixel_bytes == 8)
02458                {
02459 // GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
02460                   // GRR NOTE:  no need to combine passes here!
02461                   if (((pass == 0) || (pass == 1)) && width)
02462                   {
02463                      int dummy_value_c;  // fix 'forbidden register spilled'
02464                      int dummy_value_S;
02465                      int dummy_value_D;
02466 
02467                      // source is 8-byte RRGGBBAA
02468                      // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
02469                      __asm__ __volatile__ (
02470                         "subl $56, %%edi         \n\t" // start of last block
02471 
02472                      ".loop8_pass0:              \n\t"
02473                         "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
02474                         "movq %%mm0, (%%edi)     \n\t"
02475                         "movq %%mm0, 8(%%edi)    \n\t"
02476                         "movq %%mm0, 16(%%edi)   \n\t"
02477                         "movq %%mm0, 24(%%edi)   \n\t"
02478                         "movq %%mm0, 32(%%edi)   \n\t"
02479                         "movq %%mm0, 40(%%edi)   \n\t"
02480                         "movq %%mm0, 48(%%edi)   \n\t"
02481                         "subl $8, %%esi          \n\t"
02482                         "movq %%mm0, 56(%%edi)   \n\t"
02483                         "subl $64, %%edi         \n\t"
02484                         "decl %%ecx              \n\t"
02485                         "jnz .loop8_pass0        \n\t"
02486                         "EMMS                    \n\t" // DONE
02487 
02488                         : "=c" (dummy_value_c),        // output regs (dummy)
02489                           "=S" (dummy_value_S),
02490                           "=D" (dummy_value_D)
02491 
02492                         : "1" (sptr),      // esi      // input regs
02493                           "2" (dp),        // edi
02494                           "0" (width)      // ecx
02495 
02496 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
02497                         : "%mm0"                       // clobber list
02498 #endif
02499                      );
02500                   }
02501                   else if (((pass == 2) || (pass == 3)) && width)
02502                   {
02503                      // source is 8-byte RRGGBBAA
02504                      // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
02505                      // (recall that expansion is _in place_:  sptr and dp
02506                      //  both point at locations within same row buffer)
02507                      {
02508                         int dummy_value_c;  // fix 'forbidden register spilled'
02509                         int dummy_value_S;
02510                         int dummy_value_D;
02511 
02512                         __asm__ __volatile__ (
02513                            "subl $24, %%edi         \n\t" // start of last block
02514 
02515                         ".loop8_pass2:              \n\t"
02516                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
02517                            "movq %%mm0, (%%edi)     \n\t"
02518                            "movq %%mm0, 8(%%edi)    \n\t"
02519                            "movq %%mm0, 16(%%edi)   \n\t"
02520                            "subl $8, %%esi          \n\t"
02521                            "movq %%mm0, 24(%%edi)   \n\t"
02522                            "subl $32, %%edi         \n\t"
02523                            "decl %%ecx              \n\t"
02524                            "jnz .loop8_pass2        \n\t"
02525                            "EMMS                    \n\t" // DONE
02526 
02527                            : "=c" (dummy_value_c),        // output regs (dummy)
02528                              "=S" (dummy_value_S),
02529                              "=D" (dummy_value_D)
02530 
02531                            : "1" (sptr),      // esi      // input regs
02532                              "2" (dp),        // edi
02533                              "0" (width)      // ecx
02534 
02535 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
02536                            : "%mm0"                       // clobber list
02537 #endif
02538                         );
02539                      }
02540                   }
02541                   else if (width)  // pass == 4 or 5
02542                   {
02543                      // source is 8-byte RRGGBBAA
02544                      // dest is 16-byte RRGGBBAA RRGGBBAA
02545                      {
02546                         int dummy_value_c;  // fix 'forbidden register spilled'
02547                         int dummy_value_S;
02548                         int dummy_value_D;
02549 
02550                         __asm__ __volatile__ (
02551                            "subl $8, %%edi          \n\t" // start of last block
02552 
02553                         ".loop8_pass4:              \n\t"
02554                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
02555                            "movq %%mm0, (%%edi)     \n\t"
02556                            "subl $8, %%esi          \n\t"
02557                            "movq %%mm0, 8(%%edi)    \n\t"
02558                            "subl $16, %%edi         \n\t"
02559                            "decl %%ecx              \n\t"
02560                            "jnz .loop8_pass4        \n\t"
02561                            "EMMS                    \n\t" // DONE
02562 
02563                            : "=c" (dummy_value_c),        // output regs (dummy)
02564                              "=S" (dummy_value_S),
02565                              "=D" (dummy_value_D)
02566 
02567                            : "1" (sptr),      // esi      // input regs
02568                              "2" (dp),        // edi
02569                              "0" (width)      // ecx
02570 
02571 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
02572                            : "%mm0"                       // clobber list
02573 #endif
02574                         );
02575                      }
02576                   }
02577 
02578                } /* end of pixel_bytes == 8 */
02579 
02580                //--------------------------------------------------------------
02581                else if (pixel_bytes == 6)
02582                {
02583                   for (i = width; i; i--)
02584                   {
02585                      png_byte v[8];
02586                      int j;
02587                      png_memcpy(v, sptr, 6);
02588                      for (j = 0; j < png_pass_inc[pass]; j++)
02589                      {
02590                         png_memcpy(dp, v, 6);
02591                         dp -= 6;
02592                      }
02593                      sptr -= 6;
02594                   }
02595                } /* end of pixel_bytes == 6 */
02596 
02597                //--------------------------------------------------------------
02598                else
02599                {
02600                   for (i = width; i; i--)
02601                   {
02602                      png_byte v[8];
02603                      int j;
02604                      png_memcpy(v, sptr, pixel_bytes);
02605                      for (j = 0; j < png_pass_inc[pass]; j++)
02606                      {
02607                         png_memcpy(dp, v, pixel_bytes);
02608                         dp -= pixel_bytes;
02609                      }
02610                      sptr-= pixel_bytes;
02611                   }
02612                }
02613             } // end of _mmx_supported ========================================
02614 
02615             else /* MMX not supported:  use modified C code - takes advantage
02616                   *   of inlining of png_memcpy for a constant */
02617                  /* GRR 19991007:  does it?  or should pixel_bytes in each
02618                   *   block be replaced with immediate value (e.g., 1)? */
02619                  /* GRR 19991017:  replaced with constants in each case */
02620 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
02621             {
02622                if (pixel_bytes == 1)
02623                {
02624                   for (i = width; i; i--)
02625                   {
02626                      int j;
02627                      for (j = 0; j < png_pass_inc[pass]; j++)
02628                      {
02629                         *dp-- = *sptr;
02630                      }
02631                      --sptr;
02632                   }
02633                }
02634                else if (pixel_bytes == 3)
02635                {
02636                   for (i = width; i; i--)
02637                   {
02638                      png_byte v[8];
02639                      int j;
02640                      png_memcpy(v, sptr, 3);
02641                      for (j = 0; j < png_pass_inc[pass]; j++)
02642                      {
02643                         png_memcpy(dp, v, 3);
02644                         dp -= 3;
02645                      }
02646                      sptr -= 3;
02647                   }
02648                }
02649                else if (pixel_bytes == 2)
02650                {
02651                   for (i = width; i; i--)
02652                   {
02653                      png_byte v[8];
02654                      int j;
02655                      png_memcpy(v, sptr, 2);
02656                      for (j = 0; j < png_pass_inc[pass]; j++)
02657                      {
02658                         png_memcpy(dp, v, 2);
02659                         dp -= 2;
02660                      }
02661                      sptr -= 2;
02662                   }
02663                }
02664                else if (pixel_bytes == 4)
02665                {
02666                   for (i = width; i; i--)
02667                   {
02668                      png_byte v[8];
02669                      int j;
02670                      png_memcpy(v, sptr, 4);
02671                      for (j = 0; j < png_pass_inc[pass]; j++)
02672                      {
02673 #ifdef PNG_DEBUG
02674                         if (dp < row || dp+3 > row+png_ptr->row_buf_size)
02675                         {
02676                            printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
02677                              row, dp, row+png_ptr->row_buf_size);
02678                            printf("row_buf=%d\n",png_ptr->row_buf_size);
02679                         }
02680 #endif
02681                         png_memcpy(dp, v, 4);
02682                         dp -= 4;
02683                      }
02684                      sptr -= 4;
02685                   }
02686                }
02687                else if (pixel_bytes == 6)
02688                {
02689                   for (i = width; i; i--)
02690                   {
02691                      png_byte v[8];
02692                      int j;
02693                      png_memcpy(v, sptr, 6);
02694                      for (j = 0; j < png_pass_inc[pass]; j++)
02695                      {
02696                         png_memcpy(dp, v, 6);
02697                         dp -= 6;
02698                      }
02699                      sptr -= 6;
02700                   }
02701                }
02702                else if (pixel_bytes == 8)
02703                {
02704                   for (i = width; i; i--)
02705                   {
02706                      png_byte v[8];
02707                      int j;
02708                      png_memcpy(v, sptr, 8);
02709                      for (j = 0; j < png_pass_inc[pass]; j++)
02710                      {
02711                         png_memcpy(dp, v, 8);
02712                         dp -= 8;
02713                      }
02714                      sptr -= 8;
02715                   }
02716                }
02717                else     /* GRR:  should never be reached */
02718                {
02719                   for (i = width; i; i--)
02720                   {
02721                      png_byte v[8];
02722                      int j;
02723                      png_memcpy(v, sptr, pixel_bytes);
02724                      for (j = 0; j < png_pass_inc[pass]; j++)
02725                      {
02726                         png_memcpy(dp, v, pixel_bytes);
02727                         dp -= pixel_bytes;
02728                      }
02729                      sptr -= pixel_bytes;
02730                   }
02731                }
02732 
02733             } /* end if (MMX not supported) */
02734             break;
02735          }
02736       } /* end switch (row_info->pixel_depth) */
02737 
02738       row_info->width = final_width;
02739 
02740       row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
02741    }
02742 
02743 } /* end png_do_read_interlace() */
02744 
02745 #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
02746 #endif /* PNG_READ_INTERLACING_SUPPORTED */
02747 
02748 
02749 
02750 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
02751 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
02752 
02753 // Fixed masks and scratch values shared by the MMX filter routines below, whose
02754 // inline assembly refers to them by name.  Per the original note, they are declared
02755 // globally here to ensure alignment on 8-byte boundaries (needed for the movq accesses).
02756 union uAll {
02757    long long use;   // 64-bit integer view; the member the C code below reads and writes
02758    double  align;   // never accessed in the visible code; presumably the alignment aid -- TODO confirm for the target ABI
02759 } _LBCarryMask = {0x0101010101010101LL},  // bit 0 of every byte: isolates per-byte lsb's (carries for the /2 sums)
02760   _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},  // bits 0-6 of every byte: clears bit 7 of each byte after a 64-bit "psrlq $1"
02761   _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;  // no initializers; e.g. _ActiveMask, _ShiftBpp, _ShiftRem are set per bpp case below
02762 
02763 #ifdef PNG_THREAD_UNSAFE_OK
02764 //===========================================================================//
02765 //                                                                           //
02766 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
02767 //                                                                           //
02768 //===========================================================================//
02769 
02770 // Optimized code for PNG Average filter decoder
02771 
02772 static void /* PRIVATE */
02773 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
02774                             png_bytep prev_row)
02775 {
02776    int bpp;
02777    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
02778    int dummy_value_S;
02779    int dummy_value_D;
02780 
02781    bpp = (row_info->pixel_depth + 7) >> 3;  // get # bytes per pixel
02782    _FullLength  = row_info->rowbytes;       // # of bytes to filter
02783 
02784    __asm__ __volatile__ (
02785       // initialize address pointers and offset
02786 #ifdef __PIC__
02787       "pushl %%ebx                 \n\t" // save index to Global Offset Table
02788 #endif
02789 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
02790       "xorl %%ebx, %%ebx           \n\t" // ebx:  x
02791       "movl %%edi, %%edx           \n\t"
02792 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
02793 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
02794       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
02795 
02796       "xorl %%eax,%%eax            \n\t"
02797 
02798       // Compute the Raw value for the first bpp bytes
02799       //    Raw(x) = Avg(x) + (Prior(x)/2)
02800    "avg_rlp:                       \n\t"
02801       "movb (%%esi,%%ebx,),%%al    \n\t" // load al with Prior(x)
02802       "incl %%ebx                  \n\t"
02803       "shrb %%al                   \n\t" // divide by 2
02804       "addb -1(%%edi,%%ebx,),%%al  \n\t" // add Avg(x); -1 to offset inc ebx
02805 //pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
02806       "cmpl %%ecx, %%ebx           \n\t"
02807       "movb %%al,-1(%%edi,%%ebx,)  \n\t" // write Raw(x); -1 to offset inc ebx
02808       "jb avg_rlp                  \n\t" // mov does not affect flags
02809 
02810       // get # of bytes to alignment
02811       "movl %%edi, _dif            \n\t" // take start of row
02812       "addl %%ebx, _dif            \n\t" // add bpp
02813       "addl $0xf, _dif             \n\t" // add 7+8 to incr past alignment bdry
02814       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
02815       "subl %%edi, _dif            \n\t" // subtract from start => value ebx at
02816       "jz avg_go                   \n\t" //  alignment
02817 
02818       // fix alignment
02819       // Compute the Raw value for the bytes up to the alignment boundary
02820       //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
02821       "xorl %%ecx, %%ecx           \n\t"
02822 
02823    "avg_lp1:                       \n\t"
02824       "xorl %%eax, %%eax           \n\t"
02825       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
02826       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
02827       "addw %%cx, %%ax             \n\t"
02828       "incl %%ebx                  \n\t"
02829       "shrw %%ax                   \n\t" // divide by 2
02830       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
02831       "cmpl _dif, %%ebx            \n\t" // check if at alignment boundary
02832       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
02833       "jb avg_lp1                  \n\t" // repeat until at alignment boundary
02834 
02835    "avg_go:                        \n\t"
02836       "movl _FullLength, %%eax     \n\t"
02837       "movl %%eax, %%ecx           \n\t"
02838       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
02839       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
02840       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
02841       "movl %%ecx, _MMXLength      \n\t"
02842 #ifdef __PIC__
02843       "popl %%ebx                  \n\t" // restore index to Global Offset Table
02844 #endif
02845 
02846       : "=c" (dummy_value_c),            // output regs (dummy)
02847         "=S" (dummy_value_S),
02848         "=D" (dummy_value_D)
02849 
02850       : "0" (bpp),       // ecx          // input regs
02851         "1" (prev_row),  // esi
02852         "2" (row)        // edi
02853 
02854       : "%eax", "%edx"                   // clobber list
02855 #ifndef __PIC__
02856       , "%ebx"
02857 #endif
02858       // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
02859       // (seems to work fine without...)
02860    );
02861 
02862    // now do the math for the rest of the row
02863    switch (bpp)
02864    {
02865       case 3:
02866       {
02867          _ActiveMask.use  = 0x0000000000ffffffLL;
02868          _ShiftBpp.use = 24;    // == 3 * 8
02869          _ShiftRem.use = 40;    // == 64 - 24
02870 
02871          __asm__ __volatile__ (
02872             // re-init address pointers and offset
02873             "movq _ActiveMask, %%mm7      \n\t"
02874             "movl _dif, %%ecx             \n\t" // ecx:  x = offset to
02875             "movq _LBCarryMask, %%mm5     \n\t" //  alignment boundary
02876 // preload  "movl row, %%edi              \n\t" // edi:  Avg(x)
02877             "movq _HBClearMask, %%mm4     \n\t"
02878 // preload  "movl prev_row, %%esi         \n\t" // esi:  Prior(x)
02879 
02880             // prime the pump:  load the first Raw(x-bpp) data set
02881             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
02882                                                 // (correct pos. in loop below)
02883          "avg_3lp:                        \n\t"
02884             "movq (%%edi,%%ecx,), %%mm0   \n\t" // load mm0 with Avg(x)
02885             "movq %%mm5, %%mm3            \n\t"
02886             "psrlq _ShiftRem, %%mm2       \n\t" // correct position Raw(x-bpp)
02887                                                 // data
02888             "movq (%%esi,%%ecx,), %%mm1   \n\t" // load mm1 with Prior(x)
02889             "movq %%mm7, %%mm6            \n\t"
02890             "pand %%mm1, %%mm3            \n\t" // get lsb for each prev_row byte
02891             "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
02892             "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each
02893                                                 // byte
02894             "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for
02895                                                 // each byte
02896             // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
02897             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
02898                                                 // LBCarrys
02899             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
02900                                                 // where both
02901                                // lsb's were == 1 (only valid for active group)
02902             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
02903             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
02904                                                 // byte
02905             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
02906                                                 // for each byte
02907             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 1
02908                                                 // bytes to add to Avg
02909             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
02910                                                 // Avg for each Active
02911                                //  byte
02912             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
02913             "psllq _ShiftBpp, %%mm6       \n\t" // shift the mm6 mask to cover
02914                                                 // bytes 3-5
02915             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
02916             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
02917             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
02918                                                 // LBCarrys
02919             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
02920                                                 // where both
02921                                // lsb's were == 1 (only valid for active group)
02922             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
02923             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
02924                                                 // byte
02925             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
02926                                                 // for each byte
02927             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
02928                                                 // bytes to add to Avg
02929             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
02930                                                 // Avg for each Active
02931                                //  byte
02932 
02933             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
02934             "psllq _ShiftBpp, %%mm6       \n\t" // shift mm6 mask to cover last
02935                                                 // two
02936                                  // bytes
02937             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
02938             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
02939                               // Data only needs to be shifted once here to
02940                               // get the correct x-bpp offset.
02941             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
02942                                                 // LBCarrys
02943             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
02944                                                 // where both
02945                               // lsb's were == 1 (only valid for active group)
02946             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
02947             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
02948                                                 // byte
02949             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
02950                                                 // for each byte
02951             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
02952                                                 // bytes to add to Avg
02953             "addl $8, %%ecx               \n\t"
02954             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
02955                                                 // Avg for each Active
02956                                                 // byte
02957             // now ready to write back to memory
02958             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
02959             // move updated Raw(x) to use as Raw(x-bpp) for next loop
02960             "cmpl _MMXLength, %%ecx       \n\t"
02961             "movq %%mm0, %%mm2            \n\t" // mov updated Raw(x) to mm2
02962             "jb avg_3lp                   \n\t"
02963 
02964             : "=S" (dummy_value_S),             // output regs (dummy)
02965               "=D" (dummy_value_D)
02966 
02967             : "0" (prev_row),  // esi           // input regs
02968               "1" (row)        // edi
02969 
02970             : "%ecx"                            // clobber list
02971 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
02972             , "%mm0", "%mm1", "%mm2", "%mm3"
02973             , "%mm4", "%mm5", "%mm6", "%mm7"
02974 #endif
02975          );
02976       }
02977       break;  // end 3 bpp
02978 
02979       case 6:
02980       case 4:
02981       //case 7:   // who wrote this?  PNG doesn't support 5 or 7 bytes/pixel
02982       //case 5:   // GRR BOGUS
02983       {
02984          _ActiveMask.use  = 0xffffffffffffffffLL; // use shift below to clear
02985                                                   // appropriate inactive bytes
02986          _ShiftBpp.use = bpp << 3;
02987          _ShiftRem.use = 64 - _ShiftBpp.use;
02988 
02989          __asm__ __volatile__ (
02990             "movq _HBClearMask, %%mm4    \n\t"
02991 
02992             // re-init address pointers and offset
02993             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to
02994                                                // alignment boundary
02995 
02996             // load _ActiveMask and clear all bytes except for 1st active group
02997             "movq _ActiveMask, %%mm7     \n\t"
02998 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
02999             "psrlq _ShiftRem, %%mm7      \n\t"
03000 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
03001             "movq %%mm7, %%mm6           \n\t"
03002             "movq _LBCarryMask, %%mm5    \n\t"
03003             "psllq _ShiftBpp, %%mm6      \n\t" // create mask for 2nd active
03004                                                // group
03005 
03006             // prime the pump:  load the first Raw(x-bpp) data set
03007             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
03008                                           // (we correct pos. in loop below)
03009          "avg_4lp:                       \n\t"
03010             "movq (%%edi,%%ecx,), %%mm0  \n\t"
03011             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
03012             "movq (%%esi,%%ecx,), %%mm1  \n\t"
03013             // add (Prev_row/2) to average
03014             "movq %%mm5, %%mm3           \n\t"
03015             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
03016             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
03017             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
03018                                                // byte
03019             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
03020                                                // each byte
03021             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
03022             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
03023                                                // LBCarrys
03024             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
03025                                                // where both
03026                               // lsb's were == 1 (only valid for active group)
03027             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03028             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
03029                                                // byte
03030             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
03031                                                // for each byte
03032             "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
03033                                                // bytes to add to Avg
03034             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
03035                                                // for each Active
03036                               // byte
03037             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
03038             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
03039             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
03040             "addl $8, %%ecx              \n\t"
03041             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
03042                                                // LBCarrys
03043             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
03044                                                // where both
03045                               // lsb's were == 1 (only valid for active group)
03046             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03047             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
03048                                                // byte
03049             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
03050                                                // for each byte
03051             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
03052                                                // bytes to add to Avg
03053             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
03054                                                // Avg for each Active
03055                               // byte
03056             "cmpl _MMXLength, %%ecx      \n\t"
03057             // now ready to write back to memory
03058             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
03059             // prep Raw(x-bpp) for next loop
03060             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
03061             "jb avg_4lp                  \n\t"
03062 
03063             : "=S" (dummy_value_S),            // output regs (dummy)
03064               "=D" (dummy_value_D)
03065 
03066             : "0" (prev_row),  // esi          // input regs
03067               "1" (row)        // edi
03068 
03069             : "%ecx"                           // clobber list
03070 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
03071             , "%mm0", "%mm1", "%mm2", "%mm3"
03072             , "%mm4", "%mm5", "%mm6", "%mm7"
03073 #endif
03074          );
03075       }
03076       break;  // end 4,6 bpp
03077 
03078       case 2:
03079       {
03080          _ActiveMask.use  = 0x000000000000ffffLL;
03081          _ShiftBpp.use = 16;   // == 2 * 8
03082          _ShiftRem.use = 48;   // == 64 - 16
03083 
03084          __asm__ __volatile__ (
03085             // load _ActiveMask
03086             "movq _ActiveMask, %%mm7     \n\t"
03087             // re-init address pointers and offset
03088             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to alignment
03089                                                // boundary
03090             "movq _LBCarryMask, %%mm5    \n\t"
03091 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
03092             "movq _HBClearMask, %%mm4    \n\t"
03093 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
03094 
03095             // prime the pump:  load the first Raw(x-bpp) data set
03096             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
03097                               // (we correct pos. in loop below)
03098          "avg_2lp:                       \n\t"
03099             "movq (%%edi,%%ecx,), %%mm0  \n\t"
03100             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
03101             "movq (%%esi,%%ecx,), %%mm1  \n\t" //  (GRR BUGFIX:  was psllq)
03102             // add (Prev_row/2) to average
03103             "movq %%mm5, %%mm3           \n\t"
03104             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
03105             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
03106             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
03107                                                // byte
03108             "movq %%mm7, %%mm6           \n\t"
03109             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
03110                                                // each byte
03111 
03112             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
03113             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
03114                                                // LBCarrys
03115             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
03116                                                // where both
03117                                                // lsb's were == 1 (only valid
03118                                                // for active group)
03119             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03120             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
03121                                                // byte
03122             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
03123                                                // for each byte
03124             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1
03125                                                // bytes to add to Avg
03126             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
03127                                                // for each Active byte
03128 
03129             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
03130             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
03131                                                // bytes 2 & 3
03132             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
03133             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
03134             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
03135                                                // LBCarrys
03136             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
03137                                                // where both
03138                                                // lsb's were == 1 (only valid
03139                                                // for active group)
03140             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03141             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
03142                                                // byte
03143             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
03144                                                // for each byte
03145             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
03146                                                // bytes to add to Avg
03147             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
03148                                                // Avg for each Active byte
03149 
03150             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
03151             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
03152                                                // bytes 4 & 5
03153             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
03154             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
03155             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
03156                                                // LBCarrys
03157             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
03158                                                // where both lsb's were == 1
03159                                                // (only valid for active group)
03160             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03161             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
03162                                                // byte
03163             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
03164                                                // for each byte
03165             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
03166                                                // bytes to add to Avg
03167             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
03168                                                // Avg for each Active byte
03169 
03170             // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
03171             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
03172                                                // bytes 6 & 7
03173             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
03174             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
03175             "addl $8, %%ecx              \n\t"
03176             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
03177                                                // LBCarrys
03178             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
03179                                                // where both
03180                                                // lsb's were == 1 (only valid
03181                                                // for active group)
03182             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03183             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
03184                                                // byte
03185             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
03186                                                // for each byte
03187             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
03188                                                // bytes to add to Avg
03189             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
03190                                                // Avg for each Active byte
03191 
03192             "cmpl _MMXLength, %%ecx      \n\t"
03193             // now ready to write back to memory
03194             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
03195             // prep Raw(x-bpp) for next loop
03196             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
03197             "jb avg_2lp                  \n\t"
03198 
03199             : "=S" (dummy_value_S),            // output regs (dummy)
03200               "=D" (dummy_value_D)
03201 
03202             : "0" (prev_row),  // esi          // input regs
03203               "1" (row)        // edi
03204 
03205             : "%ecx"                           // clobber list
03206 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
03207             , "%mm0", "%mm1", "%mm2", "%mm3"
03208             , "%mm4", "%mm5", "%mm6", "%mm7"
03209 #endif
03210          );
03211       }
03212       break;  // end 2 bpp
03213 
03214       case 1:
03215       {
03216          __asm__ __volatile__ (
03217             // re-init address pointers and offset
03218 #ifdef __PIC__
03219             "pushl %%ebx                 \n\t" // save Global Offset Table index
03220 #endif
03221             "movl _dif, %%ebx            \n\t" // ebx:  x = offset to alignment
03222                                                // boundary
03223 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
03224             "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
03225             "jnb avg_1end                \n\t"
03226             // do Avg decode for remaining bytes
03227 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
03228             "movl %%edi, %%edx           \n\t"
03229 // preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
03230             "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
03231             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
03232                                                //  in loop below
03233          "avg_1lp:                       \n\t"
03234             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
03235             "xorl %%eax, %%eax           \n\t"
03236             "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
03237             "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
03238             "addw %%cx, %%ax             \n\t"
03239             "incl %%ebx                  \n\t"
03240             "shrw %%ax                   \n\t" // divide by 2
03241             "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
03242                                                // inc ebx
03243             "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
03244             "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
03245                          // mov does not affect flags; -1 to offset inc ebx
03246             "jb avg_1lp                  \n\t"
03247 
03248          "avg_1end:                      \n\t"
03249 #ifdef __PIC__
03250             "popl %%ebx                  \n\t" // Global Offset Table index
03251 #endif
03252 
03253             : "=c" (dummy_value_c),            // output regs (dummy)
03254               "=S" (dummy_value_S),
03255               "=D" (dummy_value_D)
03256 
03257             : "0" (bpp),       // ecx          // input regs
03258               "1" (prev_row),  // esi
03259               "2" (row)        // edi
03260 
03261             : "%eax", "%edx"                   // clobber list
03262 #ifndef __PIC__
03263             , "%ebx"
03264 #endif
03265          );
03266       }
03267       return;  // end 1 bpp
03268 
03269       case 8:
03270       {
03271          __asm__ __volatile__ (
03272             // re-init address pointers and offset
03273             "movl _dif, %%ecx            \n\t" // ecx:  x == offset to alignment
03274             "movq _LBCarryMask, %%mm5    \n\t" //            boundary
03275 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
03276             "movq _HBClearMask, %%mm4    \n\t"
03277 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
03278 
03279             // prime the pump:  load the first Raw(x-bpp) data set
03280             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
03281                                       // (NO NEED to correct pos. in loop below)
03282 
03283          "avg_8lp:                       \n\t"
03284             "movq (%%edi,%%ecx,), %%mm0  \n\t"
03285             "movq %%mm5, %%mm3           \n\t"
03286             "movq (%%esi,%%ecx,), %%mm1  \n\t"
03287             "addl $8, %%ecx              \n\t"
03288             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
03289             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
03290             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
03291                                                //  where both lsb's were == 1
03292             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03293             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
03294             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
03295             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
03296             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each
03297             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
03298             "cmpl _MMXLength, %%ecx      \n\t"
03299             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
03300             "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
03301             "jb avg_8lp                  \n\t"
03302 
03303             : "=S" (dummy_value_S),            // output regs (dummy)
03304               "=D" (dummy_value_D)
03305 
03306             : "0" (prev_row),  // esi          // input regs
03307               "1" (row)        // edi
03308 
03309             : "%ecx"                           // clobber list
03310 #if 0  /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
03311             , "%mm0", "%mm1", "%mm2"
03312             , "%mm3", "%mm4", "%mm5"
03313 #endif
03314          );
03315       }
03316       break;  // end 8 bpp
03317 
03318       default:                  // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
03319       {
03320 
03321 #ifdef PNG_DEBUG
03322          // GRR:  PRINT ERROR HERE:  SHOULD NEVER BE REACHED
03323         png_debug(1,
03324         "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
03325 #endif
03326 
03327 #if 0
03328         __asm__ __volatile__ (
03329             "movq _LBCarryMask, %%mm5    \n\t"
03330             // re-init address pointers and offset
03331             "movl _dif, %%ebx            \n\t" // ebx:  x = offset to
03332                                                // alignment boundary
03333             "movl row, %%edi             \n\t" // edi:  Avg(x)
03334             "movq _HBClearMask, %%mm4    \n\t"
03335             "movl %%edi, %%edx           \n\t"
03336             "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
03337             "subl bpp, %%edx             \n\t" // edx:  Raw(x-bpp)
03338          "avg_Alp:                       \n\t"
03339             "movq (%%edi,%%ebx,), %%mm0  \n\t"
03340             "movq %%mm5, %%mm3           \n\t"
03341             "movq (%%esi,%%ebx,), %%mm1  \n\t"
03342             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
03343             "movq (%%edx,%%ebx,), %%mm2  \n\t"
03344             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
03345             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
03346                                                // where both lsb's were == 1
03347             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03348             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
03349                                                // byte
03350             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg for each
03351                                                // byte
03352             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
03353                                                // byte
03354             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
03355                                                // each byte
03356             "addl $8, %%ebx              \n\t"
03357             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
03358                                                // byte
03359             "cmpl _MMXLength, %%ebx      \n\t"
03360             "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
03361             "jb avg_Alp                  \n\t"
03362 
03363             : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
03364 
03365             : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
03366 
03367             : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
03368          );
03369 #endif /* 0 - NEVER REACHED */
03370       }
03371       break;
03372 
03373    } // end switch (bpp)
03374 
03375    __asm__ __volatile__ (
03376       // MMX acceleration complete; now do clean-up
03377       // check if any remaining bytes left to decode
03378 #ifdef __PIC__
03379       "pushl %%ebx                 \n\t" // save index to Global Offset Table
03380 #endif
03381       "movl _MMXLength, %%ebx      \n\t" // ebx:  x == offset bytes after MMX
03382 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
03383       "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
03384       "jnb avg_end                 \n\t"
03385 
03386       // do Avg decode for remaining bytes
03387 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
03388       "movl %%edi, %%edx           \n\t"
03389 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
03390       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
03391       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
03392 
03393    "avg_lp2:                       \n\t"
03394       // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
03395       "xorl %%eax, %%eax           \n\t"
03396       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
03397       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
03398       "addw %%cx, %%ax             \n\t"
03399       "incl %%ebx                  \n\t"
03400       "shrw %%ax                   \n\t" // divide by 2
03401       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
03402       "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
03403       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
03404       "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc ebx]
03405 
03406    "avg_end:                       \n\t"
03407       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
03408 #ifdef __PIC__
03409       "popl %%ebx                  \n\t" // restore index to Global Offset Table
03410 #endif
03411 
03412       : "=c" (dummy_value_c),            // output regs (dummy)
03413         "=S" (dummy_value_S),
03414         "=D" (dummy_value_D)
03415 
03416       : "0" (bpp),       // ecx          // input regs
03417         "1" (prev_row),  // esi
03418         "2" (row)        // edi
03419 
03420       : "%eax", "%edx"                   // clobber list
03421 #ifndef __PIC__
03422       , "%ebx"
03423 #endif
03424    );
03425 
03426 } /* end png_read_filter_row_mmx_avg() */
03427 #endif
03428 
03429 
03430 
03431 #ifdef PNG_THREAD_UNSAFE_OK
03432 //===========================================================================//
03433 //                                                                           //
03434 //         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
03435 //                                                                           //
03436 //===========================================================================//
03437 
03438 // Optimized code for PNG Paeth filter decoder
03439 
03440 static void /* PRIVATE */
03441 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
03442                               png_bytep prev_row)
03443 {
03444    int bpp;
03445    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
03446    int dummy_value_S;
03447    int dummy_value_D;
03448 
03449    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
03450    _FullLength  = row_info->rowbytes; // # of bytes to filter
03451 
03452    __asm__ __volatile__ (
03453 #ifdef __PIC__
03454       "pushl %%ebx                 \n\t" // save index to Global Offset Table
03455 #endif
03456       "xorl %%ebx, %%ebx           \n\t" // ebx:  x offset
03457 //pre "movl row, %%edi             \n\t"
03458       "xorl %%edx, %%edx           \n\t" // edx:  x-bpp offset
03459 //pre "movl prev_row, %%esi        \n\t"
03460       "xorl %%eax, %%eax           \n\t"
03461 
03462       // Compute the Raw value for the first bpp bytes
03463       // Note: the formula works out to be always
03464       //   Raw(x) = Paeth(x) + Prior(x)      where x < bpp
03465    "paeth_rlp:                     \n\t"
03466       "movb (%%edi,%%ebx,), %%al   \n\t"
03467       "addb (%%esi,%%ebx,), %%al   \n\t"
03468       "incl %%ebx                  \n\t"
03469 //pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx)
03470       "cmpl %%ecx, %%ebx           \n\t"
03471       "movb %%al, -1(%%edi,%%ebx,) \n\t"
03472       "jb paeth_rlp                \n\t"
03473       // get # of bytes to alignment
03474       "movl %%edi, _dif            \n\t" // take start of row
03475       "addl %%ebx, _dif            \n\t" // add bpp
03476       "xorl %%ecx, %%ecx           \n\t"
03477       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past alignment
03478                                          // boundary
03479       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
03480       "subl %%edi, _dif            \n\t" // subtract from start ==> value ebx
03481                                          // at alignment
03482       "jz paeth_go                 \n\t"
03483       // fix alignment
03484 
03485    "paeth_lp1:                     \n\t"
03486       "xorl %%eax, %%eax           \n\t"
03487       // pav = p - a = (a + b - c) - a = b - c
03488       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
03489       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
03490       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
03491       "movl %%eax, _patemp         \n\t" // Save pav for later use
03492       "xorl %%eax, %%eax           \n\t"
03493       // pbv = p - b = (a + b - c) - b = a - c
03494       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
03495       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
03496       "movl %%eax, %%ecx           \n\t"
03497       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03498       "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
03499       // pc = abs(pcv)
03500       "testl $0x80000000, %%eax    \n\t"
03501       "jz paeth_pca                \n\t"
03502       "negl %%eax                  \n\t" // reverse sign of neg values
03503 
03504    "paeth_pca:                     \n\t"
03505       "movl %%eax, _pctemp         \n\t" // save pc for later use
03506       // pb = abs(pbv)
03507       "testl $0x80000000, %%ecx    \n\t"
03508       "jz paeth_pba                \n\t"
03509       "negl %%ecx                  \n\t" // reverse sign of neg values
03510 
03511    "paeth_pba:                     \n\t"
03512       "movl %%ecx, _pbtemp         \n\t" // save pb for later use
03513       // pa = abs(pav)
03514       "movl _patemp, %%eax         \n\t"
03515       "testl $0x80000000, %%eax    \n\t"
03516       "jz paeth_paa                \n\t"
03517       "negl %%eax                  \n\t" // reverse sign of neg values
03518 
03519    "paeth_paa:                     \n\t"
03520       "movl %%eax, _patemp         \n\t" // save pa for later use
03521       // test if pa <= pb
03522       "cmpl %%ecx, %%eax           \n\t"
03523       "jna paeth_abb               \n\t"
03524       // pa > pb; now test if pb <= pc
03525       "cmpl _pctemp, %%ecx         \n\t"
03526       "jna paeth_bbc               \n\t"
03527       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
03528       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
03529       "jmp paeth_paeth             \n\t"
03530 
03531    "paeth_bbc:                     \n\t"
03532       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
03533       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
03534       "jmp paeth_paeth             \n\t"
03535 
03536    "paeth_abb:                     \n\t"
03537       // pa <= pb; now test if pa <= pc
03538       "cmpl _pctemp, %%eax         \n\t"
03539       "jna paeth_abc               \n\t"
03540       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
03541       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
03542       "jmp paeth_paeth             \n\t"
03543 
03544    "paeth_abc:                     \n\t"
03545       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
03546       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
03547 
03548    "paeth_paeth:                   \n\t"
03549       "incl %%ebx                  \n\t"
03550       "incl %%edx                  \n\t"
03551       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
03552       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
03553       "cmpl _dif, %%ebx            \n\t"
03554       "jb paeth_lp1                \n\t"
03555 
03556    "paeth_go:                      \n\t"
03557       "movl _FullLength, %%ecx     \n\t"
03558       "movl %%ecx, %%eax           \n\t"
03559       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
03560       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
03561       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
03562       "movl %%ecx, _MMXLength      \n\t"
03563 #ifdef __PIC__
03564       "popl %%ebx                  \n\t" // restore index to Global Offset Table
03565 #endif
03566 
03567       : "=c" (dummy_value_c),            // output regs (dummy)
03568         "=S" (dummy_value_S),
03569         "=D" (dummy_value_D)
03570 
03571       : "0" (bpp),       // ecx          // input regs
03572         "1" (prev_row),  // esi
03573         "2" (row)        // edi
03574 
03575       : "%eax", "%edx"                   // clobber list
03576 #ifndef __PIC__
03577       , "%ebx"
03578 #endif
03579    );
03580 
03581    // now do the math for the rest of the row
03582    switch (bpp)
03583    {
03584       case 3:
03585       {
03586          _ActiveMask.use = 0x0000000000ffffffLL;
03587          _ActiveMaskEnd.use = 0xffff000000000000LL;
03588          _ShiftBpp.use = 24;    // == bpp(3) * 8
03589          _ShiftRem.use = 40;    // == 64 - 24
03590 
03591          __asm__ __volatile__ (
03592             "movl _dif, %%ecx            \n\t"
03593 // preload  "movl row, %%edi             \n\t"
03594 // preload  "movl prev_row, %%esi        \n\t"
03595             "pxor %%mm0, %%mm0           \n\t"
03596             // prime the pump:  load the first Raw(x-bpp) data set
03597             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
03598          "paeth_3lp:                     \n\t"
03599             "psrlq _ShiftRem, %%mm1      \n\t" // shift last 3 bytes to 1st
03600                                                // 3 bytes
03601             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
03602             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
03603             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
03604             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
03605             "psrlq _ShiftRem, %%mm3      \n\t" // shift last 3 bytes to 1st
03606                                                // 3 bytes
03607             // pav = p - a = (a + b - c) - a = b - c
03608             "movq %%mm2, %%mm4           \n\t"
03609             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
03610             // pbv = p - b = (a + b - c) - b = a - c
03611             "movq %%mm1, %%mm5           \n\t"
03612             "psubw %%mm3, %%mm4          \n\t"
03613             "pxor %%mm7, %%mm7           \n\t"
03614             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03615             "movq %%mm4, %%mm6           \n\t"
03616             "psubw %%mm3, %%mm5          \n\t"
03617 
03618             // pa = abs(p-a) = abs(pav)
03619             // pb = abs(p-b) = abs(pbv)
03620             // pc = abs(p-c) = abs(pcv)
03621             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
03622             "paddw %%mm5, %%mm6          \n\t"
03623             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
03624             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
03625             "psubw %%mm0, %%mm4          \n\t"
03626             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
03627             "psubw %%mm0, %%mm4          \n\t"
03628             "psubw %%mm7, %%mm5          \n\t"
03629             "pxor %%mm0, %%mm0           \n\t"
03630             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
03631             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
03632             "psubw %%mm7, %%mm5          \n\t"
03633             "psubw %%mm0, %%mm6          \n\t"
03634             //  test pa <= pb
03635             "movq %%mm4, %%mm7           \n\t"
03636             "psubw %%mm0, %%mm6          \n\t"
03637             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
03638             "movq %%mm7, %%mm0           \n\t"
03639             // use mm7 mask to merge pa & pb
03640             "pand %%mm7, %%mm5           \n\t"
03641             // use mm0 mask copy to merge a & b
03642             "pand %%mm0, %%mm2           \n\t"
03643             "pandn %%mm4, %%mm7          \n\t"
03644             "pandn %%mm1, %%mm0          \n\t"
03645             "paddw %%mm5, %%mm7          \n\t"
03646             "paddw %%mm2, %%mm0          \n\t"
03647             //  test  ((pa <= pb)? pa:pb) <= pc
03648             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
03649             "pxor %%mm1, %%mm1           \n\t"
03650             "pand %%mm7, %%mm3           \n\t"
03651             "pandn %%mm0, %%mm7          \n\t"
03652             "paddw %%mm3, %%mm7          \n\t"
03653             "pxor %%mm0, %%mm0           \n\t"
03654             "packuswb %%mm1, %%mm7       \n\t"
03655             "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
03656             "pand _ActiveMask, %%mm7     \n\t"
03657             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
03658             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
03659             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
03660             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
03661             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
03662                                                // Raw(x-bpp)
03663             // now do Paeth for 2nd set of bytes (3-5)
03664             "psrlq _ShiftBpp, %%mm2      \n\t" // load b=Prior(x) step 2
03665             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
03666             "pxor %%mm7, %%mm7           \n\t"
03667             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
03668             // pbv = p - b = (a + b - c) - b = a - c
03669             "movq %%mm1, %%mm5           \n\t"
03670             // pav = p - a = (a + b - c) - a = b - c
03671             "movq %%mm2, %%mm4           \n\t"
03672             "psubw %%mm3, %%mm5          \n\t"
03673             "psubw %%mm3, %%mm4          \n\t"
03674             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
03675             //       pav + pbv = pbv + pav
03676             "movq %%mm5, %%mm6           \n\t"
03677             "paddw %%mm4, %%mm6          \n\t"
03678 
03679             // pa = abs(p-a) = abs(pav)
03680             // pb = abs(p-b) = abs(pbv)
03681             // pc = abs(p-c) = abs(pcv)
03682             "pcmpgtw %%mm5, %%mm0        \n\t" // create mask pbv bytes < 0
03683             "pcmpgtw %%mm4, %%mm7        \n\t" // create mask pav bytes < 0
03684             "pand %%mm5, %%mm0           \n\t" // only pbv bytes < 0 in mm0
03685             "pand %%mm4, %%mm7           \n\t" // only pav bytes < 0 in mm7
03686             "psubw %%mm0, %%mm5          \n\t"
03687             "psubw %%mm7, %%mm4          \n\t"
03688             "psubw %%mm0, %%mm5          \n\t"
03689             "psubw %%mm7, %%mm4          \n\t"
03690             "pxor %%mm0, %%mm0           \n\t"
03691             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
03692             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
03693             "psubw %%mm0, %%mm6          \n\t"
03694             //  test pa <= pb
03695             "movq %%mm4, %%mm7           \n\t"
03696             "psubw %%mm0, %%mm6          \n\t"
03697             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
03698             "movq %%mm7, %%mm0           \n\t"
03699             // use mm7 mask to merge pa & pb
03700             "pand %%mm7, %%mm5           \n\t"
03701             // use mm0 mask copy to merge a & b
03702             "pand %%mm0, %%mm2           \n\t"
03703             "pandn %%mm4, %%mm7          \n\t"
03704             "pandn %%mm1, %%mm0          \n\t"
03705             "paddw %%mm5, %%mm7          \n\t"
03706             "paddw %%mm2, %%mm0          \n\t"
03707             //  test  ((pa <= pb)? pa:pb) <= pc
03708             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
03709             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
03710             "pand %%mm7, %%mm3           \n\t"
03711             "pandn %%mm0, %%mm7          \n\t"
03712             "pxor %%mm1, %%mm1           \n\t"
03713             "paddw %%mm3, %%mm7          \n\t"
03714             "pxor %%mm0, %%mm0           \n\t"
03715             "packuswb %%mm1, %%mm7       \n\t"
03716             "movq %%mm2, %%mm3           \n\t" // load c=Prior(x-bpp) step 1
03717             "pand _ActiveMask, %%mm7     \n\t"
03718             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
03719             "psllq _ShiftBpp, %%mm7      \n\t" // shift bytes to 2nd group of
03720                                                // 3 bytes
03721              // pav = p - a = (a + b - c) - a = b - c
03722             "movq %%mm2, %%mm4           \n\t"
03723             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
03724             "psllq _ShiftBpp, %%mm3      \n\t" // load c=Prior(x-bpp) step 2
03725             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
03726             "movq %%mm7, %%mm1           \n\t"
03727             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
03728             "psllq _ShiftBpp, %%mm1      \n\t" // shift bytes
03729                                     // now mm1 will be used as Raw(x-bpp)
03730             // now do Paeth for 3rd, and final, set of bytes (6-7)
03731             "pxor %%mm7, %%mm7           \n\t"
03732             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
03733             "psubw %%mm3, %%mm4          \n\t"
03734             // pbv = p - b = (a + b - c) - b = a - c
03735             "movq %%mm1, %%mm5           \n\t"
03736             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03737             "movq %%mm4, %%mm6           \n\t"
03738             "psubw %%mm3, %%mm5          \n\t"
03739             "pxor %%mm0, %%mm0           \n\t"
03740             "paddw %%mm5, %%mm6          \n\t"
03741 
03742             // pa = abs(p-a) = abs(pav)
03743             // pb = abs(p-b) = abs(pbv)
03744             // pc = abs(p-c) = abs(pcv)
03745             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
03746             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
03747             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
03748             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
03749             "psubw %%mm0, %%mm4          \n\t"
03750             "psubw %%mm7, %%mm5          \n\t"
03751             "psubw %%mm0, %%mm4          \n\t"
03752             "psubw %%mm7, %%mm5          \n\t"
03753             "pxor %%mm0, %%mm0           \n\t"
03754             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
03755             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
03756             "psubw %%mm0, %%mm6          \n\t"
03757             //  test pa <= pb
03758             "movq %%mm4, %%mm7           \n\t"
03759             "psubw %%mm0, %%mm6          \n\t"
03760             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
03761             "movq %%mm7, %%mm0           \n\t"
03762             // use mm0 mask copy to merge a & b
03763             "pand %%mm0, %%mm2           \n\t"
03764             // use mm7 mask to merge pa & pb
03765             "pand %%mm7, %%mm5           \n\t"
03766             "pandn %%mm1, %%mm0          \n\t"
03767             "pandn %%mm4, %%mm7          \n\t"
03768             "paddw %%mm2, %%mm0          \n\t"
03769             "paddw %%mm5, %%mm7          \n\t"
03770             //  test  ((pa <= pb)? pa:pb) <= pc
03771             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
03772             "pand %%mm7, %%mm3           \n\t"
03773             "pandn %%mm0, %%mm7          \n\t"
03774             "paddw %%mm3, %%mm7          \n\t"
03775             "pxor %%mm1, %%mm1           \n\t"
03776             "packuswb %%mm7, %%mm1       \n\t"
03777             // step ecx to next set of 8 bytes and repeat loop til done
03778             "addl $8, %%ecx              \n\t"
03779             "pand _ActiveMaskEnd, %%mm1  \n\t"
03780             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
03781                                                  // Raw(x)
03782 
03783             "cmpl _MMXLength, %%ecx      \n\t"
03784             "pxor %%mm0, %%mm0           \n\t" // pxor does not affect flags
03785             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
03786                                  // mm1 will be used as Raw(x-bpp) next loop
03787                            // mm3 ready to be used as Prior(x-bpp) next loop
03788             "jb paeth_3lp                \n\t"
03789 
03790             : "=S" (dummy_value_S),             // output regs (dummy)
03791               "=D" (dummy_value_D)
03792 
03793             : "0" (prev_row),  // esi           // input regs
03794               "1" (row)        // edi
03795 
03796             : "%ecx"                            // clobber list
03797 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
03798             , "%mm0", "%mm1", "%mm2", "%mm3"
03799             , "%mm4", "%mm5", "%mm6", "%mm7"
03800 #endif
03801          );
03802       }
03803       break;  // end 3 bpp
03804 
03805       case 6:
03806       //case 7:   // GRR BOGUS
03807       //case 5:   // GRR BOGUS
03808       {
03809          _ActiveMask.use  = 0x00000000ffffffffLL;
03810          _ActiveMask2.use = 0xffffffff00000000LL;
03811          _ShiftBpp.use = bpp << 3;    // == bpp * 8
03812          _ShiftRem.use = 64 - _ShiftBpp.use;
03813 
03814          __asm__ __volatile__ (
03815             "movl _dif, %%ecx            \n\t"
03816 // preload  "movl row, %%edi             \n\t"
03817 // preload  "movl prev_row, %%esi        \n\t"
03818             // prime the pump:  load the first Raw(x-bpp) data set
03819             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
03820             "pxor %%mm0, %%mm0           \n\t"
03821 
03822          "paeth_6lp:                     \n\t"
03823             // must shift to position Raw(x-bpp) data
03824             "psrlq _ShiftRem, %%mm1      \n\t"
03825             // do first set of 4 bytes
03826             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
03827             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
03828             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
03829             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
03830             // must shift to position Prior(x-bpp) data
03831             "psrlq _ShiftRem, %%mm3      \n\t"
03832             // pav = p - a = (a + b - c) - a = b - c
03833             "movq %%mm2, %%mm4           \n\t"
03834             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
03835             // pbv = p - b = (a + b - c) - b = a - c
03836             "movq %%mm1, %%mm5           \n\t"
03837             "psubw %%mm3, %%mm4          \n\t"
03838             "pxor %%mm7, %%mm7           \n\t"
03839             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03840             "movq %%mm4, %%mm6           \n\t"
03841             "psubw %%mm3, %%mm5          \n\t"
03842             // pa = abs(p-a) = abs(pav)
03843             // pb = abs(p-b) = abs(pbv)
03844             // pc = abs(p-c) = abs(pcv)
03845             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
03846             "paddw %%mm5, %%mm6          \n\t"
03847             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
03848             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
03849             "psubw %%mm0, %%mm4          \n\t"
03850             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
03851             "psubw %%mm0, %%mm4          \n\t"
03852             "psubw %%mm7, %%mm5          \n\t"
03853             "pxor %%mm0, %%mm0           \n\t"
03854             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
03855             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
03856             "psubw %%mm7, %%mm5          \n\t"
03857             "psubw %%mm0, %%mm6          \n\t"
03858             //  test pa <= pb
03859             "movq %%mm4, %%mm7           \n\t"
03860             "psubw %%mm0, %%mm6          \n\t"
03861             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
03862             "movq %%mm7, %%mm0           \n\t"
03863             // use mm7 mask to merge pa & pb
03864             "pand %%mm7, %%mm5           \n\t"
03865             // use mm0 mask copy to merge a & b
03866             "pand %%mm0, %%mm2           \n\t"
03867             "pandn %%mm4, %%mm7          \n\t"
03868             "pandn %%mm1, %%mm0          \n\t"
03869             "paddw %%mm5, %%mm7          \n\t"
03870             "paddw %%mm2, %%mm0          \n\t"
03871             //  test  ((pa <= pb)? pa:pb) <= pc
03872             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
03873             "pxor %%mm1, %%mm1           \n\t"
03874             "pand %%mm7, %%mm3           \n\t"
03875             "pandn %%mm0, %%mm7          \n\t"
03876             "paddw %%mm3, %%mm7          \n\t"
03877             "pxor %%mm0, %%mm0           \n\t"
03878             "packuswb %%mm1, %%mm7       \n\t"
03879             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
03880             "pand _ActiveMask, %%mm7     \n\t"
03881             "psrlq _ShiftRem, %%mm3      \n\t"
03882             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x) step 1
03883             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
03884             "movq %%mm2, %%mm6           \n\t"
03885             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
03886             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
03887             "psllq _ShiftBpp, %%mm6      \n\t"
03888             "movq %%mm7, %%mm5           \n\t"
03889             "psrlq _ShiftRem, %%mm1      \n\t"
03890             "por %%mm6, %%mm3            \n\t"
03891             "psllq _ShiftBpp, %%mm5      \n\t"
03892             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
03893             "por %%mm5, %%mm1            \n\t"
03894             // do second set of 4 bytes
03895             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
03896             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
03897             // pav = p - a = (a + b - c) - a = b - c
03898             "movq %%mm2, %%mm4           \n\t"
03899             // pbv = p - b = (a + b - c) - b = a - c
03900             "movq %%mm1, %%mm5           \n\t"
03901             "psubw %%mm3, %%mm4          \n\t"
03902             "pxor %%mm7, %%mm7           \n\t"
03903             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03904             "movq %%mm4, %%mm6           \n\t"
03905             "psubw %%mm3, %%mm5          \n\t"
03906             // pa = abs(p-a) = abs(pav)
03907             // pb = abs(p-b) = abs(pbv)
03908             // pc = abs(p-c) = abs(pcv)
03909             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
03910             "paddw %%mm5, %%mm6          \n\t"
03911             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
03912             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
03913             "psubw %%mm0, %%mm4          \n\t"
03914             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
03915             "psubw %%mm0, %%mm4          \n\t"
03916             "psubw %%mm7, %%mm5          \n\t"
03917             "pxor %%mm0, %%mm0           \n\t"
03918             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
03919             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
03920             "psubw %%mm7, %%mm5          \n\t"
03921             "psubw %%mm0, %%mm6          \n\t"
03922             //  test pa <= pb
03923             "movq %%mm4, %%mm7           \n\t"
03924             "psubw %%mm0, %%mm6          \n\t"
03925             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
03926             "movq %%mm7, %%mm0           \n\t"
03927             // use mm7 mask to merge pa & pb
03928             "pand %%mm7, %%mm5           \n\t"
03929             // use mm0 mask copy to merge a & b
03930             "pand %%mm0, %%mm2           \n\t"
03931             "pandn %%mm4, %%mm7          \n\t"
03932             "pandn %%mm1, %%mm0          \n\t"
03933             "paddw %%mm5, %%mm7          \n\t"
03934             "paddw %%mm2, %%mm0          \n\t"
03935             //  test  ((pa <= pb)? pa:pb) <= pc
03936             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
03937             "pxor %%mm1, %%mm1           \n\t"
03938             "pand %%mm7, %%mm3           \n\t"
03939             "pandn %%mm0, %%mm7          \n\t"
03940             "pxor %%mm1, %%mm1           \n\t"
03941             "paddw %%mm3, %%mm7          \n\t"
03942             "pxor %%mm0, %%mm0           \n\t"
03943             // step ecx to next set of 8 bytes and repeat loop til done
03944             "addl $8, %%ecx              \n\t"
03945             "packuswb %%mm7, %%mm1       \n\t"
03946             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
03947             "cmpl _MMXLength, %%ecx      \n\t"
03948             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
03949                                 // mm1 will be used as Raw(x-bpp) next loop
03950             "jb paeth_6lp                \n\t"
03951 
03952             : "=S" (dummy_value_S),             // output regs (dummy)
03953               "=D" (dummy_value_D)
03954 
03955             : "0" (prev_row),  // esi           // input regs
03956               "1" (row)        // edi
03957 
03958             : "%ecx"                            // clobber list
03959 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
03960             , "%mm0", "%mm1", "%mm2", "%mm3"
03961             , "%mm4", "%mm5", "%mm6", "%mm7"
03962 #endif
03963          );
03964       }
03965       break;  // end 6 bpp
03966 
03967       case 4:
03968       {
03969          _ActiveMask.use  = 0x00000000ffffffffLL;
03970 
03971          __asm__ __volatile__ (
03972             "movl _dif, %%ecx            \n\t"
03973 // preload  "movl row, %%edi             \n\t"
03974 // preload  "movl prev_row, %%esi        \n\t"
03975             "pxor %%mm0, %%mm0           \n\t"
03976             // prime the pump:  load the first Raw(x-bpp) data set
03977             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
03978                                      //  a=Raw(x-bpp) bytes
03979          "paeth_4lp:                     \n\t"
03980             // do first set of 4 bytes
03981             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
03982             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
03983             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
03984             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
03985             // pav = p - a = (a + b - c) - a = b - c
03986             "movq %%mm2, %%mm4           \n\t"
03987             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
03988             // pbv = p - b = (a + b - c) - b = a - c
03989             "movq %%mm1, %%mm5           \n\t"
03990             "psubw %%mm3, %%mm4          \n\t"
03991             "pxor %%mm7, %%mm7           \n\t"
03992             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03993             "movq %%mm4, %%mm6           \n\t"
03994             "psubw %%mm3, %%mm5          \n\t"
03995             // pa = abs(p-a) = abs(pav)
03996             // pb = abs(p-b) = abs(pbv)
03997             // pc = abs(p-c) = abs(pcv)
03998             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
03999             "paddw %%mm5, %%mm6          \n\t"
04000             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
04001             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
04002             "psubw %%mm0, %%mm4          \n\t"
04003             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
04004             "psubw %%mm0, %%mm4          \n\t"
04005             "psubw %%mm7, %%mm5          \n\t"
04006             "pxor %%mm0, %%mm0           \n\t"
04007             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
04008             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
04009             "psubw %%mm7, %%mm5          \n\t"
04010             "psubw %%mm0, %%mm6          \n\t"
04011             //  test pa <= pb
04012             "movq %%mm4, %%mm7           \n\t"
04013             "psubw %%mm0, %%mm6          \n\t"
04014             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
04015             "movq %%mm7, %%mm0           \n\t"
04016             // use mm7 mask to merge pa & pb
04017             "pand %%mm7, %%mm5           \n\t"
04018             // use mm0 mask copy to merge a & b
04019             "pand %%mm0, %%mm2           \n\t"
04020             "pandn %%mm4, %%mm7          \n\t"
04021             "pandn %%mm1, %%mm0          \n\t"
04022             "paddw %%mm5, %%mm7          \n\t"
04023             "paddw %%mm2, %%mm0          \n\t"
04024             //  test  ((pa <= pb)? pa:pb) <= pc
04025             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
04026             "pxor %%mm1, %%mm1           \n\t"
04027             "pand %%mm7, %%mm3           \n\t"
04028             "pandn %%mm0, %%mm7          \n\t"
04029             "paddw %%mm3, %%mm7          \n\t"
04030             "pxor %%mm0, %%mm0           \n\t"
04031             "packuswb %%mm1, %%mm7       \n\t"
04032             "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
04033             "pand _ActiveMask, %%mm7     \n\t"
04034             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
04035             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
04036             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
04037             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
04038             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as Raw(x-bpp)
04039             // do second set of 4 bytes
04040             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
04041             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
04042             // pav = p - a = (a + b - c) - a = b - c
04043             "movq %%mm2, %%mm4           \n\t"
04044             // pbv = p - b = (a + b - c) - b = a - c
04045             "movq %%mm1, %%mm5           \n\t"
04046             "psubw %%mm3, %%mm4          \n\t"
04047             "pxor %%mm7, %%mm7           \n\t"
04048             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
04049             "movq %%mm4, %%mm6           \n\t"
04050             "psubw %%mm3, %%mm5          \n\t"
04051             // pa = abs(p-a) = abs(pav)
04052             // pb = abs(p-b) = abs(pbv)
04053             // pc = abs(p-c) = abs(pcv)
04054             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
04055             "paddw %%mm5, %%mm6          \n\t"
04056             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
04057             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
04058             "psubw %%mm0, %%mm4          \n\t"
04059             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
04060             "psubw %%mm0, %%mm4          \n\t"
04061             "psubw %%mm7, %%mm5          \n\t"
04062             "pxor %%mm0, %%mm0           \n\t"
04063             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
04064             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
04065             "psubw %%mm7, %%mm5          \n\t"
04066             "psubw %%mm0, %%mm6          \n\t"
04067             //  test pa <= pb
04068             "movq %%mm4, %%mm7           \n\t"
04069             "psubw %%mm0, %%mm6          \n\t"
04070             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
04071             "movq %%mm7, %%mm0           \n\t"
04072             // use mm7 mask to merge pa & pb
04073             "pand %%mm7, %%mm5           \n\t"
04074             // use mm0 mask copy to merge a & b
04075             "pand %%mm0, %%mm2           \n\t"
04076             "pandn %%mm4, %%mm7          \n\t"
04077             "pandn %%mm1, %%mm0          \n\t"
04078             "paddw %%mm5, %%mm7          \n\t"
04079             "paddw %%mm2, %%mm0          \n\t"
04080             //  test  ((pa <= pb)? pa:pb) <= pc
04081             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
04082             "pxor %%mm1, %%mm1           \n\t"
04083             "pand %%mm7, %%mm3           \n\t"
04084             "pandn %%mm0, %%mm7          \n\t"
04085             "pxor %%mm1, %%mm1           \n\t"
04086             "paddw %%mm3, %%mm7          \n\t"
04087             "pxor %%mm0, %%mm0           \n\t"
04088             // step ecx to next set of 8 bytes and repeat loop til done
04089             "addl $8, %%ecx              \n\t"
04090             "packuswb %%mm7, %%mm1       \n\t"
04091             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
04092             "cmpl _MMXLength, %%ecx      \n\t"
04093             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
04094                                 // mm1 will be used as Raw(x-bpp) next loop
04095             "jb paeth_4lp                \n\t"
04096 
04097             : "=S" (dummy_value_S),             // output regs (dummy)
04098               "=D" (dummy_value_D)
04099 
04100             : "0" (prev_row),  // esi           // input regs
04101               "1" (row)        // edi
04102 
04103             : "%ecx"                            // clobber list
04104 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
04105             , "%mm0", "%mm1", "%mm2", "%mm3"
04106             , "%mm4", "%mm5", "%mm6", "%mm7"
04107 #endif
04108          );
04109       }
04110       break;  // end 4 bpp
04111 
04112       case 8:                          // bpp == 8
04113       {
04114          _ActiveMask.use  = 0x00000000ffffffffLL;
04115 
04116          __asm__ __volatile__ (
04117             "movl _dif, %%ecx            \n\t"
04118 // preload  "movl row, %%edi             \n\t"
04119 // preload  "movl prev_row, %%esi        \n\t"
04120             "pxor %%mm0, %%mm0           \n\t"
04121             // prime the pump:  load the first Raw(x-bpp) data set
04122             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
04123                                        //  a=Raw(x-bpp) bytes
04124          "paeth_8lp:                     \n\t"
04125             // do first set of 4 bytes
04126             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
04127             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
04128             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
04129             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
04130             // pav = p - a = (a + b - c) - a = b - c
04131             "movq %%mm2, %%mm4           \n\t"
04132             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
04133             // pbv = p - b = (a + b - c) - b = a - c
04134             "movq %%mm1, %%mm5           \n\t"
04135             "psubw %%mm3, %%mm4          \n\t"
04136             "pxor %%mm7, %%mm7           \n\t"
04137             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
04138             "movq %%mm4, %%mm6           \n\t"
04139             "psubw %%mm3, %%mm5          \n\t"
04140             // pa = abs(p-a) = abs(pav)
04141             // pb = abs(p-b) = abs(pbv)
04142             // pc = abs(p-c) = abs(pcv)
04143             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
04144             "paddw %%mm5, %%mm6          \n\t"
04145             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
04146             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
04147             "psubw %%mm0, %%mm4          \n\t"
04148             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
04149             "psubw %%mm0, %%mm4          \n\t"
04150             "psubw %%mm7, %%mm5          \n\t"
04151             "pxor %%mm0, %%mm0           \n\t"
04152             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
04153             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
04154             "psubw %%mm7, %%mm5          \n\t"
04155             "psubw %%mm0, %%mm6          \n\t"
04156             //  test pa <= pb
04157             "movq %%mm4, %%mm7           \n\t"
04158             "psubw %%mm0, %%mm6          \n\t"
04159             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
04160             "movq %%mm7, %%mm0           \n\t"
04161             // use mm7 mask to merge pa & pb
04162             "pand %%mm7, %%mm5           \n\t"
04163             // use mm0 mask copy to merge a & b
04164             "pand %%mm0, %%mm2           \n\t"
04165             "pandn %%mm4, %%mm7          \n\t"
04166             "pandn %%mm1, %%mm0          \n\t"
04167             "paddw %%mm5, %%mm7          \n\t"
04168             "paddw %%mm2, %%mm0          \n\t"
04169             //  test  ((pa <= pb)? pa:pb) <= pc
04170             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
04171             "pxor %%mm1, %%mm1           \n\t"
04172             "pand %%mm7, %%mm3           \n\t"
04173             "pandn %%mm0, %%mm7          \n\t"
04174             "paddw %%mm3, %%mm7          \n\t"
04175             "pxor %%mm0, %%mm0           \n\t"
04176             "packuswb %%mm1, %%mm7       \n\t"
04177             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
04178             "pand _ActiveMask, %%mm7     \n\t"
04179             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
04180             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
04181             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
04182             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
04183             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
04184 
04185             // do second set of 4 bytes
04186             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
04187             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
04188             // pav = p - a = (a + b - c) - a = b - c
04189             "movq %%mm2, %%mm4           \n\t"
04190             // pbv = p - b = (a + b - c) - b = a - c
04191             "movq %%mm1, %%mm5           \n\t"
04192             "psubw %%mm3, %%mm4          \n\t"
04193             "pxor %%mm7, %%mm7           \n\t"
04194             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
04195             "movq %%mm4, %%mm6           \n\t"
04196             "psubw %%mm3, %%mm5          \n\t"
04197             // pa = abs(p-a) = abs(pav)
04198             // pb = abs(p-b) = abs(pbv)
04199             // pc = abs(p-c) = abs(pcv)
04200             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
04201             "paddw %%mm5, %%mm6          \n\t"
04202             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
04203             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
04204             "psubw %%mm0, %%mm4          \n\t"
04205             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
04206             "psubw %%mm0, %%mm4          \n\t"
04207             "psubw %%mm7, %%mm5          \n\t"
04208             "pxor %%mm0, %%mm0           \n\t"
04209             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
04210             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
04211             "psubw %%mm7, %%mm5          \n\t"
04212             "psubw %%mm0, %%mm6          \n\t"
04213             //  test pa <= pb
04214             "movq %%mm4, %%mm7           \n\t"
04215             "psubw %%mm0, %%mm6          \n\t"
04216             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
04217             "movq %%mm7, %%mm0           \n\t"
04218             // use mm7 mask to merge pa & pb
04219             "pand %%mm7, %%mm5           \n\t"
04220             // use mm0 mask copy to merge a & b
04221             "pand %%mm0, %%mm2           \n\t"
04222             "pandn %%mm4, %%mm7          \n\t"
04223             "pandn %%mm1, %%mm0          \n\t"
04224             "paddw %%mm5, %%mm7          \n\t"
04225             "paddw %%mm2, %%mm0          \n\t"
04226             //  test  ((pa <= pb)? pa:pb) <= pc
04227             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
04228             "pxor %%mm1, %%mm1           \n\t"
04229             "pand %%mm7, %%mm3           \n\t"
04230             "pandn %%mm0, %%mm7          \n\t"
04231             "pxor %%mm1, %%mm1           \n\t"
04232             "paddw %%mm3, %%mm7          \n\t"
04233             "pxor %%mm0, %%mm0           \n\t"
04234             // step ecx to next set of 8 bytes and repeat loop til done
04235             "addl $8, %%ecx              \n\t"
04236             "packuswb %%mm7, %%mm1       \n\t"
04237             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
04238             "cmpl _MMXLength, %%ecx      \n\t"
04239             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
04240                             // mm1 will be used as Raw(x-bpp) next loop
04241             "jb paeth_8lp                \n\t"
04242 
04243             : "=S" (dummy_value_S),             // output regs (dummy)
04244               "=D" (dummy_value_D)
04245 
04246             : "0" (prev_row),  // esi           // input regs
04247               "1" (row)        // edi
04248 
04249             : "%ecx"                            // clobber list
04250 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
04251             , "%mm0", "%mm1", "%mm2", "%mm3"
04252             , "%mm4", "%mm5", "%mm6", "%mm7"
04253 #endif
04254          );
04255       }
04256       break;  // end 8 bpp
04257 
04258       case 1:                // bpp = 1
04259       case 2:                // bpp = 2
04260       default:               // bpp > 8
04261       {
04262          __asm__ __volatile__ (
04263 #ifdef __PIC__
04264             "pushl %%ebx                 \n\t" // save Global Offset Table index
04265 #endif
04266             "movl _dif, %%ebx            \n\t"
04267             "cmpl _FullLength, %%ebx     \n\t"
04268             "jnb paeth_dend              \n\t"
04269 
04270 // preload  "movl row, %%edi             \n\t"
04271 // preload  "movl prev_row, %%esi        \n\t"
04272             // do Paeth decode for remaining bytes
04273             "movl %%ebx, %%edx           \n\t"
04274 // preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
04275             "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
04276             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
04277 
04278          "paeth_dlp:                     \n\t"
04279             "xorl %%eax, %%eax           \n\t"
04280             // pav = p - a = (a + b - c) - a = b - c
04281             "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
04282             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
04283             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
04284             "movl %%eax, _patemp         \n\t" // Save pav for later use
04285             "xorl %%eax, %%eax           \n\t"
04286             // pbv = p - b = (a + b - c) - b = a - c
04287             "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
04288             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
04289             "movl %%eax, %%ecx           \n\t"
04290             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
04291             "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
04292             // pc = abs(pcv)
04293             "testl $0x80000000, %%eax    \n\t"
04294             "jz paeth_dpca               \n\t"
04295             "negl %%eax                  \n\t" // reverse sign of neg values
04296 
04297          "paeth_dpca:                    \n\t"
04298             "movl %%eax, _pctemp         \n\t" // save pc for later use
04299             // pb = abs(pbv)
04300             "testl $0x80000000, %%ecx    \n\t"
04301             "jz paeth_dpba               \n\t"
04302             "negl %%ecx                  \n\t" // reverse sign of neg values
04303 
04304          "paeth_dpba:                    \n\t"
04305             "movl %%ecx, _pbtemp         \n\t" // save pb for later use
04306             // pa = abs(pav)
04307             "movl _patemp, %%eax         \n\t"
04308             "testl $0x80000000, %%eax    \n\t"
04309             "jz paeth_dpaa               \n\t"
04310             "negl %%eax                  \n\t" // reverse sign of neg values
04311 
04312          "paeth_dpaa:                    \n\t"
04313             "movl %%eax, _patemp         \n\t" // save pa for later use
04314             // test if pa <= pb
04315             "cmpl %%ecx, %%eax           \n\t"
04316             "jna paeth_dabb              \n\t"
04317             // pa > pb; now test if pb <= pc
04318             "cmpl _pctemp, %%ecx         \n\t"
04319             "jna paeth_dbbc              \n\t"
04320             // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
04321             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
04322             "jmp paeth_dpaeth            \n\t"
04323 
04324          "paeth_dbbc:                    \n\t"
04325             // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
04326             "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
04327             "jmp paeth_dpaeth            \n\t"
04328 
04329          "paeth_dabb:                    \n\t"
04330             // pa <= pb; now test if pa <= pc
04331             "cmpl _pctemp, %%eax         \n\t"
04332             "jna paeth_dabc              \n\t"
04333             // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
04334             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
04335             "jmp paeth_dpaeth            \n\t"
04336 
04337          "paeth_dabc:                    \n\t"
04338             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
04339             "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
04340 
04341          "paeth_dpaeth:                  \n\t"
04342             "incl %%ebx                  \n\t"
04343             "incl %%edx                  \n\t"
04344             // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
04345             "addb %%cl, -1(%%edi,%%ebx,) \n\t"
04346             "cmpl _FullLength, %%ebx     \n\t"
04347             "jb paeth_dlp                \n\t"
04348 
04349          "paeth_dend:                    \n\t"
04350 #ifdef __PIC__
04351             "popl %%ebx                  \n\t" // index to Global Offset Table
04352 #endif
04353 
04354             : "=c" (dummy_value_c),            // output regs (dummy)
04355               "=S" (dummy_value_S),
04356               "=D" (dummy_value_D)
04357 
04358             : "0" (bpp),       // ecx          // input regs
04359               "1" (prev_row),  // esi
04360               "2" (row)        // edi
04361 
04362             : "%eax", "%edx"                   // clobber list
04363 #ifndef __PIC__
04364             , "%ebx"
04365 #endif
04366          );
04367       }
04368       return;                   // No need to go further with this one
04369 
04370    } // end switch (bpp)
04371 
04372    __asm__ __volatile__ (
04373       // MMX acceleration complete; now do clean-up
04374       // check if any remaining bytes left to decode
04375 #ifdef __PIC__
04376       "pushl %%ebx                 \n\t" // save index to Global Offset Table
04377 #endif
04378       "movl _MMXLength, %%ebx      \n\t"
04379       "cmpl _FullLength, %%ebx     \n\t"
04380       "jnb paeth_end               \n\t"
04381 //pre "movl row, %%edi             \n\t"
04382 //pre "movl prev_row, %%esi        \n\t"
04383       // do Paeth decode for remaining bytes
04384       "movl %%ebx, %%edx           \n\t"
04385 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
04386       "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
04387       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
04388 
04389    "paeth_lp2:                     \n\t"
04390       "xorl %%eax, %%eax           \n\t"
04391       // pav = p - a = (a + b - c) - a = b - c
04392       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
04393       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
04394       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
04395       "movl %%eax, _patemp         \n\t" // Save pav for later use
04396       "xorl %%eax, %%eax           \n\t"
04397       // pbv = p - b = (a + b - c) - b = a - c
04398       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
04399       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
04400       "movl %%eax, %%ecx           \n\t"
04401       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
04402       "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
04403       // pc = abs(pcv)
04404       "testl $0x80000000, %%eax    \n\t"
04405       "jz paeth_pca2               \n\t"
04406       "negl %%eax                  \n\t" // reverse sign of neg values
04407 
04408    "paeth_pca2:                    \n\t"
04409       "movl %%eax, _pctemp         \n\t" // save pc for later use
04410       // pb = abs(pbv)
04411       "testl $0x80000000, %%ecx    \n\t"
04412       "jz paeth_pba2               \n\t"
04413       "negl %%ecx                  \n\t" // reverse sign of neg values
04414 
04415    "paeth_pba2:                    \n\t"
04416       "movl %%ecx, _pbtemp         \n\t" // save pb for later use
04417       // pa = abs(pav)
04418       "movl _patemp, %%eax         \n\t"
04419       "testl $0x80000000, %%eax    \n\t"
04420       "jz paeth_paa2               \n\t"
04421       "negl %%eax                  \n\t" // reverse sign of neg values
04422 
04423    "paeth_paa2:                    \n\t"
04424       "movl %%eax, _patemp         \n\t" // save pa for later use
04425       // test if pa <= pb
04426       "cmpl %%ecx, %%eax           \n\t"
04427       "jna paeth_abb2              \n\t"
04428       // pa > pb; now test if pb <= pc
04429       "cmpl _pctemp, %%ecx         \n\t"
04430       "jna paeth_bbc2              \n\t"
04431       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
04432       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
04433       "jmp paeth_paeth2            \n\t"
04434 
04435    "paeth_bbc2:                    \n\t"
04436       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
04437       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
04438       "jmp paeth_paeth2            \n\t"
04439 
04440    "paeth_abb2:                    \n\t"
04441       // pa <= pb; now test if pa <= pc
04442       "cmpl _pctemp, %%eax         \n\t"
04443       "jna paeth_abc2              \n\t"
04444       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
04445       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
04446       "jmp paeth_paeth2            \n\t"
04447 
04448    "paeth_abc2:                    \n\t"
04449       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
04450       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
04451 
04452    "paeth_paeth2:                  \n\t"
04453       "incl %%ebx                  \n\t"
04454       "incl %%edx                  \n\t"
04455       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
04456       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
04457       "cmpl _FullLength, %%ebx     \n\t"
04458       "jb paeth_lp2                \n\t"
04459 
04460    "paeth_end:                     \n\t"
04461       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
04462 #ifdef __PIC__
04463       "popl %%ebx                  \n\t" // restore index to Global Offset Table
04464 #endif
04465 
04466       : "=c" (dummy_value_c),            // output regs (dummy)
04467         "=S" (dummy_value_S),
04468         "=D" (dummy_value_D)
04469 
04470       : "0" (bpp),       // ecx          // input regs
04471         "1" (prev_row),  // esi
04472         "2" (row)        // edi
04473 
04474       : "%eax", "%edx"                   // clobber list (no input regs!)
04475 #ifndef __PIC__
04476       , "%ebx"
04477 #endif
04478    );
04479 
04480 } /* end png_read_filter_row_mmx_paeth() */
04481 #endif
04482 
04483 
04484 
04485 
04486 #ifdef PNG_THREAD_UNSAFE_OK
04487 //===========================================================================//
04488 //                                                                           //
04489 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
04490 //                                                                           //
04491 //===========================================================================//
04492 
04493 // Reverse the PNG Sub filter on one row, in place:  Raw(x) = Sub(x) + Raw(x-bpp).
04494 // Uses file-scope scratch (_FullLength, _dif, _MMXLength, ...), so it is not reentrant.
04495 static void /* PRIVATE */
04496 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
04497 {
04498    int bpp;
04499    int dummy_value_a;                        // sink for eax (asm output operand)
04500    int dummy_value_D;                        // sink for edi (asm output operand)
04501 
04502    bpp = (row_info->pixel_depth + 7) >> 3;   // calc number of bytes per pixel
04503    _FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
04504    // Step 1:  byte loop over the first _dif (8..15) bytes of rp; also sets _MMXLength
04505    __asm__ __volatile__ (
04506 //pre "movl row, %%edi             \n\t"
04507       "movl %%edi, %%esi           \n\t" // lp = row
04508 //pre "movl bpp, %%eax             \n\t"
04509       "addl %%eax, %%edi           \n\t" // rp = row + bpp
04510 //irr "xorl %%eax, %%eax           \n\t"
04511       // get # of bytes to alignment
04512       "movl %%edi, _dif            \n\t" // take start of row
04513       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past
04514                                          //  alignment boundary
04515       "xorl %%ecx, %%ecx           \n\t"
04516       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
04517       "subl %%edi, _dif            \n\t" // subtract from start ==> value
04518       "jz sub_go                   \n\t" //  ecx at alignment
04519 
04520    "sub_lp1:                       \n\t" // fix alignment
04521       "movb (%%esi,%%ecx,), %%al   \n\t"
04522       "addb %%al, (%%edi,%%ecx,)   \n\t"
04523       "incl %%ecx                  \n\t"
04524       "cmpl _dif, %%ecx            \n\t"
04525       "jb sub_lp1                  \n\t"
04526 
04527    "sub_go:                        \n\t"
04528       "movl _FullLength, %%eax     \n\t"
04529       "movl %%eax, %%edx           \n\t"
04530       "subl %%ecx, %%edx           \n\t" // subtract alignment fix
04531       "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
04532       "subl %%edx, %%eax           \n\t" // drop over bytes from length
04533       "movl %%eax, _MMXLength      \n\t"
04534 
04535       : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04536         "=D" (dummy_value_D)    // 1
04537 
04538       : "0" (bpp),              // eax    // input regs
04539         "1" (row)               // edi
04540       // "memory":  the asm touches the row and globals that are not operands
04541       : "%esi", "%ecx", "%edx", "memory"  // clobber list
04542 
04543 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
04544       , "%mm0", "%mm1", "%mm2", "%mm3"
04545       , "%mm4", "%mm5", "%mm6", "%mm7"
04546 #endif
04547    );
04548    // Step 2:  MMX pass over offsets [_dif, _MMXLength) of rp, specialized per bpp.
04549    // NOTE(review): these loops are bottom-tested (one pass even if empty) - confirm callers' size threshold
04550    switch (bpp)
04551    {
04552       case 3:
04553       {
04554          _ActiveMask.use  = 0x0000ffffff000000LL;
04555          _ShiftBpp.use = 24;       // == 3 * 8
04556          _ShiftRem.use  = 40;      // == 64 - 24
04557 
04558          __asm__ __volatile__ (
04559 // preload  "movl row, %%edi              \n\t"
04560             "movq _ActiveMask, %%mm7       \n\t" // load _ActiveMask for 2nd
04561                                                 //  active byte group
04562             "movl %%edi, %%esi            \n\t" // lp = row
04563 // preload  "movl bpp, %%eax              \n\t"
04564             "addl %%eax, %%edi            \n\t" // rp = row + bpp
04565             "movq %%mm7, %%mm6            \n\t"
04566             "movl _dif, %%edx             \n\t"
04567             "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
04568                                                 //  3rd active byte group
04569             // prime the pump:  load the first Raw(x-bpp) data set
04570             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
04571 
04572          "sub_3lp:                        \n\t" // shift data for adding first
04573             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
04574                                                 //  shift clears inactive bytes)
04575             // add 1st active group
04576             "movq (%%edi,%%edx,), %%mm0   \n\t"
04577             "paddb %%mm1, %%mm0           \n\t"
04578 
04579             // add 2nd active group
04580             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
04581             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
04582             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
04583             "paddb %%mm1, %%mm0           \n\t"
04584 
04585             // add 3rd active group
04586             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
04587             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
04588             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
04589             "addl $8, %%edx               \n\t"
04590             "paddb %%mm1, %%mm0           \n\t"
04591 
04592             "cmpl _MMXLength, %%edx       \n\t"
04593             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
04594             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
04595             "jb sub_3lp                   \n\t"
04596 
04597             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04598               "=D" (dummy_value_D)    // 1
04599 
04600             : "0" (bpp),              // eax    // input regs
04601               "1" (row)               // edi
04602 
04603             : "%edx", "%esi", "memory"          // clobber list
04604 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
04605             , "%mm0", "%mm1", "%mm6", "%mm7"
04606 #endif
04607          );
04608       }
04609       break;
04610 
04611       case 1:
04612       {
04613          __asm__ __volatile__ (
04614             "movl _dif, %%edx            \n\t"
04615 // preload  "movl row, %%edi             \n\t"
04616             "cmpl _FullLength, %%edx     \n\t"
04617             "jnb sub_1end                \n\t"
04618             "movl %%edi, %%esi           \n\t" // lp = row
04619 // preload  "movl bpp, %%eax             \n\t"
04620             "addl %%eax, %%edi           \n\t" // rp = row + bpp
04621             "xorl %%eax, %%eax           \n\t" // clear eax only after bpp is used
04622 
04623          "sub_1lp:                       \n\t"
04624             "movb (%%esi,%%edx,), %%al   \n\t"
04625             "addb %%al, (%%edi,%%edx,)   \n\t"
04626             "incl %%edx                  \n\t"
04627             "cmpl _FullLength, %%edx     \n\t"
04628             "jb sub_1lp                  \n\t"
04629 
04630          "sub_1end:                      \n\t"
04631 
04632             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04633               "=D" (dummy_value_D)    // 1
04634 
04635             : "0" (bpp),              // eax    // input regs
04636               "1" (row)               // edi
04637 
04638             : "%edx", "%esi", "memory"          // clobber list
04639          );
04640       }
04641       return;   // byte loop above ran to _FullLength; no MMX instrs were issued
04642 
04643       case 6:
04644       case 4:
04645       //case 7:   // GRR BOGUS
04646       //case 5:   // GRR BOGUS
04647       {
04648          _ShiftBpp.use = bpp << 3;
04649          _ShiftRem.use = 64 - _ShiftBpp.use;
04650 
04651          __asm__ __volatile__ (
04652 // preload  "movl row, %%edi              \n\t"
04653             "movl _dif, %%edx             \n\t"
04654             "movl %%edi, %%esi            \n\t" // lp = row
04655 // preload  "movl bpp, %%eax              \n\t"
04656             "addl %%eax, %%edi            \n\t" // rp = row + bpp
04657 
04658             // prime the pump:  load the first Raw(x-bpp) data set
04659             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
04660 
04661          "sub_4lp:                        \n\t" // shift data for adding first
04662             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
04663                                                 //  shift clears inactive bytes)
04664             "movq (%%edi,%%edx,), %%mm0   \n\t"
04665             "paddb %%mm1, %%mm0           \n\t"
04666 
04667             // add 2nd active group
04668             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
04669             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
04670             "addl $8, %%edx               \n\t"
04671             "paddb %%mm1, %%mm0           \n\t"
04672 
04673             "cmpl _MMXLength, %%edx       \n\t"
04674             "movq %%mm0, -8(%%edi,%%edx,) \n\t"
04675             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
04676             "jb sub_4lp                   \n\t"
04677 
04678             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04679               "=D" (dummy_value_D)    // 1
04680 
04681             : "0" (bpp),              // eax    // input regs
04682               "1" (row)               // edi
04683 
04684             : "%edx", "%esi", "memory"          // clobber list
04685 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
04686             , "%mm0", "%mm1"
04687 #endif
04688          );
04689       }
04690       break;
04691 
04692       case 2:
04693       {
04694          _ActiveMask.use = 0x00000000ffff0000LL;
04695          _ShiftBpp.use = 16;       // == 2 * 8
04696          _ShiftRem.use = 48;       // == 64 - 16
04697 
04698          __asm__ __volatile__ (
04699             "movq _ActiveMask, %%mm7      \n\t" // load _ActiveMask for 2nd
04700                                                 //  active byte group
04701             "movl _dif, %%edx             \n\t"
04702             "movq %%mm7, %%mm6            \n\t"
04703 // preload  "movl row, %%edi              \n\t"
04704             "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
04705                                                 //  3rd active byte group
04706             "movl %%edi, %%esi            \n\t" // lp = row
04707             "movq %%mm6, %%mm5            \n\t"
04708 // preload  "movl bpp, %%eax              \n\t"
04709             "addl %%eax, %%edi            \n\t" // rp = row + bpp
04710             "psllq _ShiftBpp, %%mm5       \n\t" // move mask in mm5 to cover
04711                                                 //  4th active byte group
04712             // prime the pump:  load the first Raw(x-bpp) data set
04713             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
04714 
04715          "sub_2lp:                        \n\t" // shift data for adding first
04716             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
04717                                                 //  shift clears inactive bytes)
04718             // add 1st active group
04719             "movq (%%edi,%%edx,), %%mm0   \n\t"
04720             "paddb %%mm1, %%mm0           \n\t"
04721 
04722             // add 2nd active group
04723             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
04724             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
04725             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
04726             "paddb %%mm1, %%mm0           \n\t"
04727 
04728             // add 3rd active group
04729             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
04730             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
04731             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
04732             "paddb %%mm1, %%mm0           \n\t"
04733 
04734             // add 4th active group
04735             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
04736             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
04737             "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
04738             "addl $8, %%edx               \n\t"
04739             "paddb %%mm1, %%mm0           \n\t"
04740             "cmpl _MMXLength, %%edx       \n\t"
04741             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
04742             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
04743             "jb sub_2lp                   \n\t"
04744 
04745             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04746               "=D" (dummy_value_D)    // 1
04747 
04748             : "0" (bpp),              // eax    // input regs
04749               "1" (row)               // edi
04750 
04751             : "%edx", "%esi", "memory"          // clobber list
04752 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
04753             , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
04754 #endif
04755          );
04756       }
04757       break;
04758 
04759       case 8:
04760       {
04761          __asm__ __volatile__ (
04762 // preload  "movl row, %%edi              \n\t"
04763             "movl _dif, %%edx             \n\t"
04764             "movl %%edi, %%esi            \n\t" // lp = row
04765 // preload  "movl bpp, %%eax              \n\t"
04766             "addl %%eax, %%edi            \n\t" // rp = row + bpp
04767             "movl _MMXLength, %%ecx       \n\t"
04768 
04769             // prime the pump:  load the first Raw(x-bpp) data set
04770             "movq -8(%%edi,%%edx,), %%mm7 \n\t"
04771             "andl $0x0000003f, %%ecx      \n\t" // calc bytes over mult of 64
04772 
04773          "sub_8lp:                        \n\t"
04774             "movq (%%edi,%%edx,), %%mm0   \n\t" // load Sub(x) for 1st 8 bytes
04775             "paddb %%mm7, %%mm0           \n\t"
04776             "movq 8(%%edi,%%edx,), %%mm1  \n\t" // load Sub(x) for 2nd 8 bytes
04777             "movq %%mm0, (%%edi,%%edx,)   \n\t" // write Raw(x) for 1st 8 bytes
04778 
04779             // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
04780             // This will be repeated for each group of 8 bytes with the 8th
04781             // group being used as the Raw(x-bpp) for the 1st group of the
04782             // next loop.
04783 
04784             "paddb %%mm0, %%mm1           \n\t"
04785             "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
04786             "movq %%mm1, 8(%%edi,%%edx,)  \n\t" // write Raw(x) for 2nd 8 bytes
04787             "paddb %%mm1, %%mm2           \n\t"
04788             "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
04789             "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
04790             "paddb %%mm2, %%mm3           \n\t"
04791             "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
04792             "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
04793             "paddb %%mm3, %%mm4           \n\t"
04794             "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
04795             "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
04796             "paddb %%mm4, %%mm5           \n\t"
04797             "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
04798             "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
04799             "paddb %%mm5, %%mm6           \n\t"
04800             "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
04801             "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
04802             "addl $64, %%edx              \n\t"
04803             "paddb %%mm6, %%mm7           \n\t"
04804             "cmpl %%ecx, %%edx            \n\t"
04805             "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
04806             "jb sub_8lp                   \n\t"
04807 
04808             "cmpl _MMXLength, %%edx       \n\t"
04809             "jnb sub_8lt8                 \n\t"
04810 
04811          "sub_8lpA:                       \n\t"
04812             "movq (%%edi,%%edx,), %%mm0   \n\t"
04813             "addl $8, %%edx               \n\t"
04814             "paddb %%mm7, %%mm0           \n\t"
04815             "cmpl _MMXLength, %%edx       \n\t"
04816             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
04817             "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
04818                                                 //  to mm7 to be new Raw(x-bpp)
04819                                                 //  for next loop
04820             "jb sub_8lpA                  \n\t"
04821 
04822          "sub_8lt8:                       \n\t"
04823 
04824             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04825               "=D" (dummy_value_D)    // 1
04826 
04827             : "0" (bpp),              // eax    // input regs
04828               "1" (row)               // edi
04829 
04830             : "%ecx", "%edx", "%esi", "memory"  // clobber list
04831 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
04832             , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
04833 #endif
04834          );
04835       }
04836       break;
04837 
04838       default:                // bpp greater than 8 bytes   GRR BOGUS
04839       {
04840          __asm__ __volatile__ (
04841             "movl _dif, %%edx             \n\t"
04842 // preload  "movl row, %%edi              \n\t"
04843             "movl %%edi, %%esi            \n\t" // lp = row
04844 // preload  "movl bpp, %%eax              \n\t"
04845             "addl %%eax, %%edi            \n\t" // rp = row + bpp
04846 
04847          "sub_Alp:                        \n\t"
04848             "movq (%%edi,%%edx,), %%mm0   \n\t"
04849             "movq (%%esi,%%edx,), %%mm1   \n\t"
04850             "addl $8, %%edx               \n\t"
04851             "paddb %%mm1, %%mm0           \n\t"
04852             "cmpl _MMXLength, %%edx       \n\t"
04853             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
04854                                                 //  -8 to offset addl edx
04855             "jb sub_Alp                   \n\t"
04856 
04857             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04858               "=D" (dummy_value_D)    // 1
04859 
04860             : "0" (bpp),              // eax    // input regs
04861               "1" (row)               // edi
04862 
04863             : "%edx", "%esi", "memory"          // clobber list
04864 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
04865             , "%mm0", "%mm1"
04866 #endif
04867          );
04868       }
04869       break;
04870 
04871    } // end switch (bpp)
04872    // Step 3:  byte loop over the tail [_MMXLength, _FullLength) of rp, then EMMS
04873    __asm__ __volatile__ (
04874       "movl _MMXLength, %%edx       \n\t"
04875 //pre "movl row, %%edi              \n\t"
04876       "cmpl _FullLength, %%edx      \n\t"
04877       "jnb sub_end                  \n\t"
04878 
04879       "movl %%edi, %%esi            \n\t" // lp = row
04880 //pre "movl bpp, %%eax              \n\t"
04881       "addl %%eax, %%edi            \n\t" // rp = row + bpp
04882       "xorl %%eax, %%eax            \n\t"
04883 
04884    "sub_lp2:                        \n\t"
04885       "movb (%%esi,%%edx,), %%al    \n\t"
04886       "addb %%al, (%%edi,%%edx,)    \n\t"
04887       "incl %%edx                   \n\t"
04888       "cmpl _FullLength, %%edx      \n\t"
04889       "jb sub_lp2                   \n\t"
04890 
04891    "sub_end:                        \n\t"
04892       "EMMS                         \n\t" // end MMX instructions
04893 
04894       : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04895         "=D" (dummy_value_D)    // 1
04896 
04897       : "0" (bpp),              // eax    // input regs
04898         "1" (row)               // edi
04899 
04900       : "%edx", "%esi", "memory"          // clobber list
04901    );
04902 
04903 } // end of png_read_filter_row_mmx_sub()
04904 #endif
04905 
04906 
04907 
04908 
04909 //===========================================================================//
04910 //                                                                           //
04911 //            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
04912 //                                                                           //
04913 //===========================================================================//
04914 
04915 // Reverse the PNG Up filter on one row, in place:  row[i] += prev_row[i] (mod 256)
04916 // for i in [0, row_info->rowbytes); scalar head/tail loops with MMX in between.
04917 static void /* PRIVATE */
04918 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
04919                            png_bytep prev_row)
04920 {
04921    png_uint_32 len;
04922    int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
04923    int dummy_value_S;
04924    int dummy_value_D;
04925 
04926    len = row_info->rowbytes;              // number of bytes to filter
04927    // NOTE(review): up_lp1/up_loop never test len before their first pass - confirm callers' size threshold
04928    __asm__ __volatile__ (
04929 //pre "movl row, %%edi              \n\t"
04930       // get # of bytes to alignment
04931 #ifdef __PIC__
04932       "pushl %%ebx                  \n\t" // ebx is not in the clobber list under PIC
04933 #endif
04934       "movl %%edi, %%ecx            \n\t" // ecx = address of row
04935       "xorl %%ebx, %%ebx            \n\t" // ebx = byte index into both rows
04936       "addl $0x7, %%ecx             \n\t"
04937       "xorl %%eax, %%eax            \n\t"
04938       "andl $0xfffffff8, %%ecx      \n\t" // round up to 8-byte boundary
04939 //pre "movl prev_row, %%esi         \n\t"
04940       "subl %%edi, %%ecx            \n\t" // ecx = # of bytes before boundary
04941       "jz up_go                     \n\t" // row already 8-byte aligned
04942 
04943    "up_lp1:                         \n\t" // fix alignment
04944       "movb (%%edi,%%ebx,), %%al    \n\t"
04945       "addb (%%esi,%%ebx,), %%al    \n\t"
04946       "incl %%ebx                   \n\t"
04947       "cmpl %%ecx, %%ebx            \n\t"
04948       "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
04949       "jb up_lp1                    \n\t" //  offset incl ebx
04950 
04951    "up_go:                          \n\t"
04952 //pre "movl len, %%edx              \n\t"
04953       "movl %%edx, %%ecx            \n\t"
04954       "subl %%ebx, %%edx            \n\t" // subtract alignment fix
04955       "andl $0x0000003f, %%edx      \n\t" // calc bytes over mult of 64
04956       "subl %%edx, %%ecx            \n\t" // drop over bytes from length
04957 
04958       // unrolled loop - use all MMX registers and interleave to reduce
04959       // number of branch instructions (loops) and reduce partial stalls
04960    "up_loop:                        \n\t"
04961       "movq (%%esi,%%ebx,), %%mm1   \n\t"
04962       "movq (%%edi,%%ebx,), %%mm0   \n\t"
04963       "movq 8(%%esi,%%ebx,), %%mm3  \n\t"
04964       "paddb %%mm1, %%mm0           \n\t"
04965       "movq 8(%%edi,%%ebx,), %%mm2  \n\t"
04966       "movq %%mm0, (%%edi,%%ebx,)   \n\t"
04967       "paddb %%mm3, %%mm2           \n\t"
04968       "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
04969       "movq %%mm2, 8(%%edi,%%ebx,)  \n\t"
04970       "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
04971       "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
04972       "paddb %%mm5, %%mm4           \n\t"
04973       "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
04974       "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
04975       "paddb %%mm7, %%mm6           \n\t"
04976       "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
04977       "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
04978       "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
04979       "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
04980       "paddb %%mm1, %%mm0           \n\t"
04981       "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
04982       "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
04983       "paddb %%mm3, %%mm2           \n\t"
04984       "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
04985       "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
04986       "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
04987       "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
04988       "paddb %%mm5, %%mm4           \n\t"
04989       "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
04990       "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
04991       "addl $64, %%ebx              \n\t"
04992       "paddb %%mm7, %%mm6           \n\t"
04993       "cmpl %%ecx, %%ebx            \n\t"
04994       "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
04995       "jb up_loop                   \n\t" //  -8 to offset addl ebx
04996 
04997       "cmpl $0, %%edx               \n\t" // test for bytes over mult of 64
04998       "jz up_end                    \n\t"
04999 
05000       "cmpl $8, %%edx               \n\t" // test for less than 8 bytes
05001       "jb up_lt8                    \n\t" //  [added by lcreeve at netins.net]
05002 
05003       "addl %%edx, %%ecx            \n\t"
05004       "andl $0x00000007, %%edx      \n\t" // calc bytes over mult of 8
05005       "subl %%edx, %%ecx            \n\t" // drop over bytes from length
05006       "jz up_lt8                    \n\t"
05007 
05008    "up_lpA:                         \n\t" // use MMX regs to update 8 bytes sim.
05009       "movq (%%esi,%%ebx,), %%mm1   \n\t"
05010       "movq (%%edi,%%ebx,), %%mm0   \n\t"
05011       "addl $8, %%ebx               \n\t"
05012       "paddb %%mm1, %%mm0           \n\t"
05013       "cmpl %%ecx, %%ebx            \n\t"
05014       "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
05015       "jb up_lpA                    \n\t" //  offset add ebx
05016       "cmpl $0, %%edx               \n\t" // test for bytes over mult of 8
05017       "jz up_end                    \n\t"
05018 
05019    "up_lt8:                         \n\t"
05020       "xorl %%eax, %%eax            \n\t"
05021       "addl %%edx, %%ecx            \n\t" // move over byte count into counter
05022 
05023    "up_lp2:                         \n\t" // use x86 regs for remaining bytes
05024       "movb (%%edi,%%ebx,), %%al    \n\t"
05025       "addb (%%esi,%%ebx,), %%al    \n\t"
05026       "incl %%ebx                   \n\t"
05027       "cmpl %%ecx, %%ebx            \n\t"
05028       "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
05029       "jb up_lp2                    \n\t" //  offset inc ebx
05030 
05031    "up_end:                         \n\t"
05032       "EMMS                         \n\t" // conversion of filtered row complete
05033 #ifdef __PIC__
05034       "popl %%ebx                   \n\t"
05035 #endif
05036 
05037       : "=d" (dummy_value_d),   // 0      // output regs (dummy)
05038         "=S" (dummy_value_S),   // 1
05039         "=D" (dummy_value_D)    // 2
05040 
05041       : "0" (len),              // edx    // input regs
05042         "1" (prev_row),         // esi
05043         "2" (row)               // edi
05044       // "memory":  the asm reads *prev_row and writes *row, which are not operands
05045       : "%eax", "%ecx", "memory"  // clobber list (no input regs!)
05046 #ifndef __PIC__
05047       , "%ebx"
05048 #endif
05049 
05050 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
05051       , "%mm0", "%mm1", "%mm2", "%mm3"
05052       , "%mm4", "%mm5", "%mm6", "%mm7"
05053 #endif
05054    );
05055 
05056 } // end of png_read_filter_row_mmx_up()
05057 
05058 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
05059 
05060 
05061 
05062 
05063 /*===========================================================================*/
05064 /*                                                                           */
05065 /*                   P N G _ R E A D _ F I L T E R _ R O W                   */
05066 /*                                                                           */
05067 /*===========================================================================*/
05068 
05069 
05070 /* Optimized png_read_filter_row routines */
05071 
05072 void /* PRIVATE */
05073 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
05074    row, png_bytep prev_row, int filter)
05075 {
05076 #ifdef PNG_DEBUG
05077    char filnm[10];
05078 #endif
05079 
05080 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
05081 /* GRR:  these are superseded by png_ptr->asm_flags: */
05082 #define UseMMX_sub    1   // GRR:  converted 20000730
05083 #define UseMMX_up     1   // GRR:  converted 20000729
05084 #define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
05085 #define UseMMX_paeth  1   // GRR:  converted 20000828
05086 
05087    if (_mmx_supported == 2) {
05088        /* this should have happened in png_init_mmx_flags() already */
05089 #if !defined(PNG_1_0_X)
05090        png_warning(png_ptr, "asm_flags may not have been initialized");
05091 #endif
05092        png_mmx_support();
05093    }
05094 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
05095 
05096 #ifdef PNG_DEBUG
05097    png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
05098    switch (filter)
05099    {
05100       case 0: sprintf(filnm, "none");
05101          break;
05102       case 1: sprintf(filnm, "sub-%s",
05103 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
05104 #if !defined(PNG_1_0_X)
05105         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : 
05106 #endif
05107 #endif
05108 "x86");
05109          break;
05110       case 2: sprintf(filnm, "up-%s",
05111 #ifdef PNG_ASSEMBLER_CODE_SUPPORTED
05112 #if !defined(PNG_1_0_X)
05113         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
05114 #endif
05115 #endif
05116  "x86");
05117          break;
05118       case 3: sprintf(filnm, "avg-%s",
05119 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
05120 #if !defined(PNG_1_0_X)
05121         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
05122 #endif
05123 #endif
05124  "x86");
05125          break;
05126       case 4: sprintf(filnm, "Paeth-%s",
05127 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
05128 #if !defined(PNG_1_0_X)
05129         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
05130 #endif
05131 #endif
05132 "x86");
05133          break;
05134       default: sprintf(filnm, "unknw");
05135          break;
05136    }
05137    png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
05138    png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
05139    png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
05140       (int)((row_info->pixel_depth + 7) >> 3));
05141    png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
05142 #endif /* PNG_DEBUG */
05143 
05144    switch (filter)
05145    {
05146       case PNG_FILTER_VALUE_NONE:
05147          break;
05148 
05149       case PNG_FILTER_VALUE_SUB:
05150 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
05151 #if !defined(PNG_1_0_X)
05152          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
05153              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
05154              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
05155 #else
05156          if (_mmx_supported)
05157 #endif
05158          {
05159             png_read_filter_row_mmx_sub(row_info, row);
05160          }
05161          else
05162 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
05163          {
05164             png_uint_32 i;
05165             png_uint_32 istop = row_info->rowbytes;
05166             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
05167             png_bytep rp = row + bpp;
05168             png_bytep lp = row;
05169 
05170             for (i = bpp; i < istop; i++)
05171             {
05172                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
05173                rp++;
05174             }
05175          }  /* end !UseMMX_sub */
05176          break;
05177 
05178       case PNG_FILTER_VALUE_UP:
05179 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
05180 #if !defined(PNG_1_0_X)
05181          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
05182              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
05183              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
05184 #else
05185          if (_mmx_supported)
05186 #endif
05187          {
05188             png_read_filter_row_mmx_up(row_info, row, prev_row);
05189          }
05190           else
05191 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
05192          {
05193             png_uint_32 i;
05194             png_uint_32 istop = row_info->rowbytes;
05195             png_bytep rp = row;
05196             png_bytep pp = prev_row;
05197 
05198             for (i = 0; i < istop; ++i)
05199             {
05200                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
05201                rp++;
05202             }
05203          }  /* end !UseMMX_up */
05204          break;
05205 
05206       case PNG_FILTER_VALUE_AVG:
05207 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
05208 #if !defined(PNG_1_0_X)
05209          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
05210              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
05211              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
05212 #else
05213          if (_mmx_supported)
05214 #endif
05215          {
05216             png_read_filter_row_mmx_avg(row_info, row, prev_row);
05217          }
05218          else
05219 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
05220          {
05221             png_uint_32 i;
05222             png_bytep rp = row;
05223             png_bytep pp = prev_row;
05224             png_bytep lp = row;
05225             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
05226             png_uint_32 istop = row_info->rowbytes - bpp;
05227 
05228             for (i = 0; i < bpp; i++)
05229             {
05230                *rp = (png_byte)(((int)(*rp) +
05231                   ((int)(*pp++) >> 1)) & 0xff);
05232                rp++;
05233             }
05234 
05235             for (i = 0; i < istop; i++)
05236             {
05237                *rp = (png_byte)(((int)(*rp) +
05238                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
05239                rp++;
05240             }
05241          }  /* end !UseMMX_avg */
05242          break;
05243 
05244       case PNG_FILTER_VALUE_PAETH:
05245 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
05246 #if !defined(PNG_1_0_X)
05247          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
05248              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
05249              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
05250 #else
05251          if (_mmx_supported)
05252 #endif
05253          {
05254             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
05255          }
05256          else
05257 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
05258          {
05259             png_uint_32 i;
05260             png_bytep rp = row;
05261             png_bytep pp = prev_row;
05262             png_bytep lp = row;
05263             png_bytep cp = prev_row;
05264             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
05265             png_uint_32 istop = row_info->rowbytes - bpp;
05266 
05267             for (i = 0; i < bpp; i++)
05268             {
05269                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
05270                rp++;
05271             }
05272 
05273             for (i = 0; i < istop; i++)   /* use leftover rp,pp */
05274             {
05275                int a, b, c, pa, pb, pc, p;
05276 
05277                a = *lp++;
05278                b = *pp++;
05279                c = *cp++;
05280 
05281                p = b - c;
05282                pc = a - c;
05283 
05284 #ifdef PNG_USE_ABS
05285                pa = abs(p);
05286                pb = abs(pc);
05287                pc = abs(p + pc);
05288 #else
05289                pa = p < 0 ? -p : p;
05290                pb = pc < 0 ? -pc : pc;
05291                pc = (p + pc) < 0 ? -(p + pc) : p + pc;
05292 #endif
05293 
05294                /*
05295                   if (pa <= pb && pa <= pc)
05296                      p = a;
05297                   else if (pb <= pc)
05298                      p = b;
05299                   else
05300                      p = c;
05301                 */
05302 
05303                p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
05304 
05305                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
05306                rp++;
05307             }
05308          }  /* end !UseMMX_paeth */
05309          break;
05310 
05311       default:
05312          png_warning(png_ptr, "Ignoring bad row-filter type");
05313          *row=0;
05314          break;
05315    }
05316 }
05317 
05318 #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
05319 
05320 
05321 /*===========================================================================*/
05322 /*                                                                           */
05323 /*                      P N G _ M M X _ S U P P O R T                        */
05324 /*                                                                           */
05325 /*===========================================================================*/
05326 
05327 /* GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
05328  *             (2) all instructions compile with gcc 2.7.2.3 and later
05329  *             (3) the function is moved down here to prevent gcc from
05330  *                  inlining it in multiple places and then barfing be-
05331  *                  cause the ".NOT_SUPPORTED" label is multiply defined
05332  *             [is there a way to signal that a *single* function should
05333  *              not be inlined?  is there a way to modify the label for
05334  *              each inlined instance, e.g., by appending _1, _2, etc.?
05335  *              maybe if don't use leading "." in label name? (nope...sigh)]
05336  */
05337 
05338 int PNGAPI
05339 png_mmx_support(void)
05340 {
05341 #if defined(PNG_MMX_CODE_SUPPORTED)
05342     __asm__ __volatile__ (
05343         "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
05344         "pushl %%ecx          \n\t"  // so does ecx...
05345         "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
05346 //      ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd
05347 //      "pushf                \n\t"  // 16-bit pushf
05348         "pushfl               \n\t"  // save Eflag to stack
05349         "popl %%eax           \n\t"  // get Eflag from stack into eax
05350         "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
05351         "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
05352         "pushl %%eax          \n\t"  // save modified Eflag back to stack
05353 //      ".byte  0x66          \n\t"  // convert 16-bit popf to 32-bit popfd
05354 //      "popf                 \n\t"  // 16-bit popf
05355         "popfl                \n\t"  // restore modified value to Eflag reg
05356         "pushfl               \n\t"  // save Eflag to stack
05357         "popl %%eax           \n\t"  // get Eflag from stack
05358         "pushl %%ecx          \n\t"  // save original Eflag to stack
05359         "popfl                \n\t"  // restore original Eflag
05360         "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
05361         "jz 0f                \n\t"  // if same, CPUID instr. is not supported
05362 
05363         "xorl %%eax, %%eax    \n\t"  // set eax to zero
05364 //      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
05365         "cpuid                \n\t"  // get the CPU identification info
05366         "cmpl $1, %%eax       \n\t"  // make sure eax return non-zero value
05367         "jl 0f                \n\t"  // if eax is zero, MMX is not supported
05368 
05369         "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
05370         "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
05371                                      // faster than the instruction "mov eax, 1"
05372         "cpuid                \n\t"  // get the CPU identification info again
05373         "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
05374         "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
05375         "jz 0f                \n\t"  // non-zero = yes, MMX IS supported
05376 
05377         "movl $1, %%eax       \n\t"  // set return value to 1
05378         "jmp  1f              \n\t"  // DONE:  have MMX support
05379 
05380     "0:                       \n\t"  // .NOT_SUPPORTED: target label for jump instructions
05381         "movl $0, %%eax       \n\t"  // set return value to 0
05382     "1:                       \n\t"  // .RETURN: target label for jump instructions
05383         "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
05384         "popl %%edx           \n\t"  // restore edx
05385         "popl %%ecx           \n\t"  // restore ecx
05386         "popl %%ebx           \n\t"  // restore ebx
05387 
05388 //      "ret                  \n\t"  // DONE:  no MMX support
05389                                      // (fall through to standard C "ret")
05390 
05391         :                            // output list (none)
05392 
05393         :                            // any variables used on input (none)
05394 
05395         : "%eax"                     // clobber list
05396 //      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
05397 //      , "memory"   // if write to a variable gcc thought was in a reg
05398 //      , "cc"       // "condition codes" (flag bits)
05399     );
05400 #else     
05401     _mmx_supported = 0;
05402 #endif /* PNG_MMX_CODE_SUPPORTED */
05403 
05404     return _mmx_supported;
05405 }
05406 
05407 
05408 #endif /* PNG_USE_PNGGCCRD */

Generated on Fri Nov 28 00:06:22 2008 for elphel by  doxygen 1.5.1