00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #define PNG_INTERNAL
00027 #include "png.h"
00028
00029 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
00030
/* Cached result of the MMX probe.  Starts at 2, meaning "not probed yet";
 * png_mmx_support() overwrites it with 0 (no MMX) or 1 (MMX available).
 * The read routines below call png_mmx_support() while it is still 2.
 */
00031 static int mmx_supported=2;
00032
00033
00034 int PNGAPI
00035 png_mmx_support(void)
00036 {
00037 int mmx_supported_local = 0;
00038 _asm {
00039 push ebx
00040 push ecx
00041 push edx
00042
00043 pushfd
00044 pop eax
00045 mov ecx, eax
00046 xor eax, 0x200000
00047 push eax
00048
00049 popfd
00050 pushfd
00051 pop eax
00052 push ecx
00053 popfd
00054 xor eax, ecx
00055 jz NOT_SUPPORTED
00056
00057
00058
00059 xor eax, eax
00060
00061 _asm _emit 0x0f
00062 _asm _emit 0xa2
00063
00064 cmp eax, 1
00065 jl NOT_SUPPORTED
00066
00067 xor eax, eax
00068 inc eax
00069
00070
00071 _asm _emit 0x0f
00072 _asm _emit 0xa2
00073
00074 and edx, 0x00800000
00075 cmp edx, 0
00076 jz NOT_SUPPORTED
00077
00078 mov mmx_supported_local, 1
00079
00080 NOT_SUPPORTED:
00081 mov eax, mmx_supported_local
00082 pop edx
00083 pop ecx
00084 pop ebx
00085 }
00086
00087
00088
00089
00090 mmx_supported = mmx_supported_local;
00091 return mmx_supported_local;
00092 }
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108 void
00109 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
00110 {
00111 #ifdef PNG_USE_LOCAL_ARRAYS
00112 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
00113 #endif
00114
00115 png_debug(1,"in png_combine_row_asm\n");
00116
00117 if (mmx_supported == 2) {
00118 #if !defined(PNG_1_0_X)
00119
00120 png_warning(png_ptr, "asm_flags may not have been initialized");
00121 #endif
00122 png_mmx_support();
00123 }
00124
00125 if (mask == 0xff)
00126 {
00127 png_memcpy(row, png_ptr->row_buf + 1,
00128 (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
00129 png_ptr->width));
00130 }
00131
00132
00133 else
00134 {
00135 switch (png_ptr->row_info.pixel_depth)
00136 {
00137 case 1:
00138 {
00139 png_bytep sp;
00140 png_bytep dp;
00141 int s_inc, s_start, s_end;
00142 int m;
00143 int shift;
00144 png_uint_32 i;
00145
00146 sp = png_ptr->row_buf + 1;
00147 dp = row;
00148 m = 0x80;
00149 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
00150 if (png_ptr->transformations & PNG_PACKSWAP)
00151 {
00152 s_start = 0;
00153 s_end = 7;
00154 s_inc = 1;
00155 }
00156 else
00157 #endif
00158 {
00159 s_start = 7;
00160 s_end = 0;
00161 s_inc = -1;
00162 }
00163
00164 shift = s_start;
00165
00166 for (i = 0; i < png_ptr->width; i++)
00167 {
00168 if (m & mask)
00169 {
00170 int value;
00171
00172 value = (*sp >> shift) & 0x1;
00173 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
00174 *dp |= (png_byte)(value << shift);
00175 }
00176
00177 if (shift == s_end)
00178 {
00179 shift = s_start;
00180 sp++;
00181 dp++;
00182 }
00183 else
00184 shift += s_inc;
00185
00186 if (m == 1)
00187 m = 0x80;
00188 else
00189 m >>= 1;
00190 }
00191 break;
00192 }
00193
00194 case 2:
00195 {
00196 png_bytep sp;
00197 png_bytep dp;
00198 int s_start, s_end, s_inc;
00199 int m;
00200 int shift;
00201 png_uint_32 i;
00202 int value;
00203
00204 sp = png_ptr->row_buf + 1;
00205 dp = row;
00206 m = 0x80;
00207 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
00208 if (png_ptr->transformations & PNG_PACKSWAP)
00209 {
00210 s_start = 0;
00211 s_end = 6;
00212 s_inc = 2;
00213 }
00214 else
00215 #endif
00216 {
00217 s_start = 6;
00218 s_end = 0;
00219 s_inc = -2;
00220 }
00221
00222 shift = s_start;
00223
00224 for (i = 0; i < png_ptr->width; i++)
00225 {
00226 if (m & mask)
00227 {
00228 value = (*sp >> shift) & 0x3;
00229 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
00230 *dp |= (png_byte)(value << shift);
00231 }
00232
00233 if (shift == s_end)
00234 {
00235 shift = s_start;
00236 sp++;
00237 dp++;
00238 }
00239 else
00240 shift += s_inc;
00241 if (m == 1)
00242 m = 0x80;
00243 else
00244 m >>= 1;
00245 }
00246 break;
00247 }
00248
00249 case 4:
00250 {
00251 png_bytep sp;
00252 png_bytep dp;
00253 int s_start, s_end, s_inc;
00254 int m;
00255 int shift;
00256 png_uint_32 i;
00257 int value;
00258
00259 sp = png_ptr->row_buf + 1;
00260 dp = row;
00261 m = 0x80;
00262 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
00263 if (png_ptr->transformations & PNG_PACKSWAP)
00264 {
00265 s_start = 0;
00266 s_end = 4;
00267 s_inc = 4;
00268 }
00269 else
00270 #endif
00271 {
00272 s_start = 4;
00273 s_end = 0;
00274 s_inc = -4;
00275 }
00276 shift = s_start;
00277
00278 for (i = 0; i < png_ptr->width; i++)
00279 {
00280 if (m & mask)
00281 {
00282 value = (*sp >> shift) & 0xf;
00283 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
00284 *dp |= (png_byte)(value << shift);
00285 }
00286
00287 if (shift == s_end)
00288 {
00289 shift = s_start;
00290 sp++;
00291 dp++;
00292 }
00293 else
00294 shift += s_inc;
00295 if (m == 1)
00296 m = 0x80;
00297 else
00298 m >>= 1;
00299 }
00300 break;
00301 }
00302
00303 case 8:
00304 {
00305 png_bytep srcptr;
00306 png_bytep dstptr;
00307 png_uint_32 len;
00308 int m;
00309 int diff, unmask;
00310
00311 __int64 mask0=0x0102040810204080;
00312
00313 #if !defined(PNG_1_0_X)
00314 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00315 )
00316 #else
00317 if (mmx_supported)
00318 #endif
00319 {
00320 srcptr = png_ptr->row_buf + 1;
00321 dstptr = row;
00322 m = 0x80;
00323 unmask = ~mask;
00324 len = png_ptr->width &~7;
00325 diff = png_ptr->width & 7;
00326
00327 _asm
00328 {
00329 movd mm7, unmask
00330 psubb mm6,mm6
00331 punpcklbw mm7,mm7
00332 punpcklwd mm7,mm7
00333 punpckldq mm7,mm7
00334
00335 movq mm0,mask0
00336
00337 pand mm0,mm7
00338 pcmpeqb mm0,mm6
00339
00340 mov ecx,len
00341 mov esi,srcptr
00342 mov ebx,dstptr
00343 cmp ecx,0
00344 je mainloop8end
00345
00346 mainloop8:
00347 movq mm4,[esi]
00348 pand mm4,mm0
00349 movq mm6,mm0
00350 pandn mm6,[ebx]
00351 por mm4,mm6
00352 movq [ebx],mm4
00353
00354 add esi,8
00355 add ebx,8
00356 sub ecx,8
00357
00358 ja mainloop8
00359 mainloop8end:
00360
00361 mov ecx,diff
00362 cmp ecx,0
00363 jz end8
00364
00365 mov edx,mask
00366 sal edx,24
00367
00368 secondloop8:
00369 sal edx,1
00370 jnc skip8
00371 mov al,[esi]
00372 mov [ebx],al
00373 skip8:
00374 inc esi
00375 inc ebx
00376
00377 dec ecx
00378 jnz secondloop8
00379 end8:
00380 emms
00381 }
00382 }
00383 else
00384 {
00385 register unsigned int incr1, initial_val, final_val;
00386 png_size_t pixel_bytes;
00387 png_uint_32 i;
00388 register int disp = png_pass_inc[png_ptr->pass];
00389 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
00390
00391 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
00392 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
00393 pixel_bytes;
00394 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
00395 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
00396 final_val = png_ptr->width*pixel_bytes;
00397 incr1 = (disp)*pixel_bytes;
00398 for (i = initial_val; i < final_val; i += incr1)
00399 {
00400 png_memcpy(dstptr, srcptr, pixel_bytes);
00401 srcptr += incr1;
00402 dstptr += incr1;
00403 }
00404 }
00405
00406 break;
00407 }
00408
00409 case 16:
00410 {
00411 png_bytep srcptr;
00412 png_bytep dstptr;
00413 png_uint_32 len;
00414 int unmask, diff;
00415 __int64 mask1=0x0101020204040808,
00416 mask0=0x1010202040408080;
00417
00418 #if !defined(PNG_1_0_X)
00419 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00420 )
00421 #else
00422 if (mmx_supported)
00423 #endif
00424 {
00425 srcptr = png_ptr->row_buf + 1;
00426 dstptr = row;
00427
00428 unmask = ~mask;
00429 len = (png_ptr->width)&~7;
00430 diff = (png_ptr->width)&7;
00431 _asm
00432 {
00433 movd mm7, unmask
00434 psubb mm6,mm6
00435 punpcklbw mm7,mm7
00436 punpcklwd mm7,mm7
00437 punpckldq mm7,mm7
00438
00439 movq mm0,mask0
00440 movq mm1,mask1
00441
00442 pand mm0,mm7
00443 pand mm1,mm7
00444
00445 pcmpeqb mm0,mm6
00446 pcmpeqb mm1,mm6
00447
00448 mov ecx,len
00449 mov esi,srcptr
00450 mov ebx,dstptr
00451 cmp ecx,0
00452 jz mainloop16end
00453
00454 mainloop16:
00455 movq mm4,[esi]
00456 pand mm4,mm0
00457 movq mm6,mm0
00458 movq mm7,[ebx]
00459 pandn mm6,mm7
00460 por mm4,mm6
00461 movq [ebx],mm4
00462
00463 movq mm5,[esi+8]
00464 pand mm5,mm1
00465 movq mm7,mm1
00466 movq mm6,[ebx+8]
00467 pandn mm7,mm6
00468 por mm5,mm7
00469 movq [ebx+8],mm5
00470
00471 add esi,16
00472 add ebx,16
00473 sub ecx,8
00474
00475 ja mainloop16
00476
00477 mainloop16end:
00478 mov ecx,diff
00479 cmp ecx,0
00480 jz end16
00481
00482 mov edx,mask
00483 sal edx,24
00484 secondloop16:
00485 sal edx,1
00486 jnc skip16
00487 mov ax,[esi]
00488 mov [ebx],ax
00489 skip16:
00490 add esi,2
00491 add ebx,2
00492
00493 dec ecx
00494 jnz secondloop16
00495 end16:
00496 emms
00497 }
00498 }
00499 else
00500 {
00501 register unsigned int incr1, initial_val, final_val;
00502 png_size_t pixel_bytes;
00503 png_uint_32 i;
00504 register int disp = png_pass_inc[png_ptr->pass];
00505 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
00506
00507 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
00508 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
00509 pixel_bytes;
00510 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
00511 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
00512 final_val = png_ptr->width*pixel_bytes;
00513 incr1 = (disp)*pixel_bytes;
00514 for (i = initial_val; i < final_val; i += incr1)
00515 {
00516 png_memcpy(dstptr, srcptr, pixel_bytes);
00517 srcptr += incr1;
00518 dstptr += incr1;
00519 }
00520 }
00521
00522 break;
00523 }
00524
00525 case 24:
00526 {
00527 png_bytep srcptr;
00528 png_bytep dstptr;
00529 png_uint_32 len;
00530 int unmask, diff;
00531
00532 __int64 mask2=0x0101010202020404,
00533 mask1=0x0408080810101020,
00534 mask0=0x2020404040808080;
00535
00536 srcptr = png_ptr->row_buf + 1;
00537 dstptr = row;
00538
00539 unmask = ~mask;
00540 len = (png_ptr->width)&~7;
00541 diff = (png_ptr->width)&7;
00542
00543 #if !defined(PNG_1_0_X)
00544 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00545 )
00546 #else
00547 if (mmx_supported)
00548 #endif
00549 {
00550 _asm
00551 {
00552 movd mm7, unmask
00553 psubb mm6,mm6
00554 punpcklbw mm7,mm7
00555 punpcklwd mm7,mm7
00556 punpckldq mm7,mm7
00557
00558 movq mm0,mask0
00559 movq mm1,mask1
00560 movq mm2,mask2
00561
00562 pand mm0,mm7
00563 pand mm1,mm7
00564 pand mm2,mm7
00565
00566 pcmpeqb mm0,mm6
00567 pcmpeqb mm1,mm6
00568 pcmpeqb mm2,mm6
00569
00570 mov ecx,len
00571 mov esi,srcptr
00572 mov ebx,dstptr
00573 cmp ecx,0
00574 jz mainloop24end
00575
00576 mainloop24:
00577 movq mm4,[esi]
00578 pand mm4,mm0
00579 movq mm6,mm0
00580 movq mm7,[ebx]
00581 pandn mm6,mm7
00582 por mm4,mm6
00583 movq [ebx],mm4
00584
00585
00586 movq mm5,[esi+8]
00587 pand mm5,mm1
00588 movq mm7,mm1
00589 movq mm6,[ebx+8]
00590 pandn mm7,mm6
00591 por mm5,mm7
00592 movq [ebx+8],mm5
00593
00594 movq mm6,[esi+16]
00595 pand mm6,mm2
00596 movq mm4,mm2
00597 movq mm7,[ebx+16]
00598 pandn mm4,mm7
00599 por mm6,mm4
00600 movq [ebx+16],mm6
00601
00602 add esi,24
00603 add ebx,24
00604 sub ecx,8
00605
00606 ja mainloop24
00607
00608 mainloop24end:
00609 mov ecx,diff
00610 cmp ecx,0
00611 jz end24
00612
00613 mov edx,mask
00614 sal edx,24
00615 secondloop24:
00616 sal edx,1
00617 jnc skip24
00618 mov ax,[esi]
00619 mov [ebx],ax
00620 xor eax,eax
00621 mov al,[esi+2]
00622 mov [ebx+2],al
00623 skip24:
00624 add esi,3
00625 add ebx,3
00626
00627 dec ecx
00628 jnz secondloop24
00629
00630 end24:
00631 emms
00632 }
00633 }
00634 else
00635 {
00636 register unsigned int incr1, initial_val, final_val;
00637 png_size_t pixel_bytes;
00638 png_uint_32 i;
00639 register int disp = png_pass_inc[png_ptr->pass];
00640 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
00641
00642 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
00643 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
00644 pixel_bytes;
00645 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
00646 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
00647 final_val = png_ptr->width*pixel_bytes;
00648 incr1 = (disp)*pixel_bytes;
00649 for (i = initial_val; i < final_val; i += incr1)
00650 {
00651 png_memcpy(dstptr, srcptr, pixel_bytes);
00652 srcptr += incr1;
00653 dstptr += incr1;
00654 }
00655 }
00656
00657 break;
00658 }
00659
00660 case 32:
00661 {
00662 png_bytep srcptr;
00663 png_bytep dstptr;
00664 png_uint_32 len;
00665 int unmask, diff;
00666
00667 __int64 mask3=0x0101010102020202,
00668 mask2=0x0404040408080808,
00669 mask1=0x1010101020202020,
00670 mask0=0x4040404080808080;
00671
00672 srcptr = png_ptr->row_buf + 1;
00673 dstptr = row;
00674
00675 unmask = ~mask;
00676 len = (png_ptr->width)&~7;
00677 diff = (png_ptr->width)&7;
00678
00679 #if !defined(PNG_1_0_X)
00680 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00681 )
00682 #else
00683 if (mmx_supported)
00684 #endif
00685 {
00686 _asm
00687 {
00688 movd mm7, unmask
00689 psubb mm6,mm6
00690 punpcklbw mm7,mm7
00691 punpcklwd mm7,mm7
00692 punpckldq mm7,mm7
00693
00694 movq mm0,mask0
00695 movq mm1,mask1
00696 movq mm2,mask2
00697 movq mm3,mask3
00698
00699 pand mm0,mm7
00700 pand mm1,mm7
00701 pand mm2,mm7
00702 pand mm3,mm7
00703
00704 pcmpeqb mm0,mm6
00705 pcmpeqb mm1,mm6
00706 pcmpeqb mm2,mm6
00707 pcmpeqb mm3,mm6
00708
00709 mov ecx,len
00710 mov esi,srcptr
00711 mov ebx,dstptr
00712
00713 cmp ecx,0
00714 jz mainloop32end
00715
00716 mainloop32:
00717 movq mm4,[esi]
00718 pand mm4,mm0
00719 movq mm6,mm0
00720 movq mm7,[ebx]
00721 pandn mm6,mm7
00722 por mm4,mm6
00723 movq [ebx],mm4
00724
00725 movq mm5,[esi+8]
00726 pand mm5,mm1
00727 movq mm7,mm1
00728 movq mm6,[ebx+8]
00729 pandn mm7,mm6
00730 por mm5,mm7
00731 movq [ebx+8],mm5
00732
00733 movq mm6,[esi+16]
00734 pand mm6,mm2
00735 movq mm4,mm2
00736 movq mm7,[ebx+16]
00737 pandn mm4,mm7
00738 por mm6,mm4
00739 movq [ebx+16],mm6
00740
00741 movq mm7,[esi+24]
00742 pand mm7,mm3
00743 movq mm5,mm3
00744 movq mm4,[ebx+24]
00745 pandn mm5,mm4
00746 por mm7,mm5
00747 movq [ebx+24],mm7
00748
00749 add esi,32
00750 add ebx,32
00751 sub ecx,8
00752
00753 ja mainloop32
00754
00755 mainloop32end:
00756 mov ecx,diff
00757 cmp ecx,0
00758 jz end32
00759
00760 mov edx,mask
00761 sal edx,24
00762 secondloop32:
00763 sal edx,1
00764 jnc skip32
00765 mov eax,[esi]
00766 mov [ebx],eax
00767 skip32:
00768 add esi,4
00769 add ebx,4
00770
00771 dec ecx
00772 jnz secondloop32
00773
00774 end32:
00775 emms
00776 }
00777 }
00778 else
00779 {
00780 register unsigned int incr1, initial_val, final_val;
00781 png_size_t pixel_bytes;
00782 png_uint_32 i;
00783 register int disp = png_pass_inc[png_ptr->pass];
00784 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
00785
00786 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
00787 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
00788 pixel_bytes;
00789 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
00790 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
00791 final_val = png_ptr->width*pixel_bytes;
00792 incr1 = (disp)*pixel_bytes;
00793 for (i = initial_val; i < final_val; i += incr1)
00794 {
00795 png_memcpy(dstptr, srcptr, pixel_bytes);
00796 srcptr += incr1;
00797 dstptr += incr1;
00798 }
00799 }
00800
00801 break;
00802 }
00803
00804 case 48:
00805 {
00806 png_bytep srcptr;
00807 png_bytep dstptr;
00808 png_uint_32 len;
00809 int unmask, diff;
00810
00811 __int64 mask5=0x0101010101010202,
00812 mask4=0x0202020204040404,
00813 mask3=0x0404080808080808,
00814 mask2=0x1010101010102020,
00815 mask1=0x2020202040404040,
00816 mask0=0x4040808080808080;
00817
00818 #if !defined(PNG_1_0_X)
00819 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00820 )
00821 #else
00822 if (mmx_supported)
00823 #endif
00824 {
00825 srcptr = png_ptr->row_buf + 1;
00826 dstptr = row;
00827
00828 unmask = ~mask;
00829 len = (png_ptr->width)&~7;
00830 diff = (png_ptr->width)&7;
00831 _asm
00832 {
00833 movd mm7, unmask
00834 psubb mm6,mm6
00835 punpcklbw mm7,mm7
00836 punpcklwd mm7,mm7
00837 punpckldq mm7,mm7
00838
00839 movq mm0,mask0
00840 movq mm1,mask1
00841 movq mm2,mask2
00842 movq mm3,mask3
00843 movq mm4,mask4
00844 movq mm5,mask5
00845
00846 pand mm0,mm7
00847 pand mm1,mm7
00848 pand mm2,mm7
00849 pand mm3,mm7
00850 pand mm4,mm7
00851 pand mm5,mm7
00852
00853 pcmpeqb mm0,mm6
00854 pcmpeqb mm1,mm6
00855 pcmpeqb mm2,mm6
00856 pcmpeqb mm3,mm6
00857 pcmpeqb mm4,mm6
00858 pcmpeqb mm5,mm6
00859
00860 mov ecx,len
00861 mov esi,srcptr
00862 mov ebx,dstptr
00863
00864 cmp ecx,0
00865 jz mainloop48end
00866
00867 mainloop48:
00868 movq mm7,[esi]
00869 pand mm7,mm0
00870 movq mm6,mm0
00871 pandn mm6,[ebx]
00872 por mm7,mm6
00873 movq [ebx],mm7
00874
00875 movq mm6,[esi+8]
00876 pand mm6,mm1
00877 movq mm7,mm1
00878 pandn mm7,[ebx+8]
00879 por mm6,mm7
00880 movq [ebx+8],mm6
00881
00882 movq mm6,[esi+16]
00883 pand mm6,mm2
00884 movq mm7,mm2
00885 pandn mm7,[ebx+16]
00886 por mm6,mm7
00887 movq [ebx+16],mm6
00888
00889 movq mm7,[esi+24]
00890 pand mm7,mm3
00891 movq mm6,mm3
00892 pandn mm6,[ebx+24]
00893 por mm7,mm6
00894 movq [ebx+24],mm7
00895
00896 movq mm6,[esi+32]
00897 pand mm6,mm4
00898 movq mm7,mm4
00899 pandn mm7,[ebx+32]
00900 por mm6,mm7
00901 movq [ebx+32],mm6
00902
00903 movq mm7,[esi+40]
00904 pand mm7,mm5
00905 movq mm6,mm5
00906 pandn mm6,[ebx+40]
00907 por mm7,mm6
00908 movq [ebx+40],mm7
00909
00910 add esi,48
00911 add ebx,48
00912 sub ecx,8
00913
00914 ja mainloop48
00915 mainloop48end:
00916
00917 mov ecx,diff
00918 cmp ecx,0
00919 jz end48
00920
00921 mov edx,mask
00922 sal edx,24
00923
00924 secondloop48:
00925 sal edx,1
00926 jnc skip48
00927 mov eax,[esi]
00928 mov [ebx],eax
00929 skip48:
00930 add esi,4
00931 add ebx,4
00932
00933 dec ecx
00934 jnz secondloop48
00935
00936 end48:
00937 emms
00938 }
00939 }
00940 else
00941 {
00942 register unsigned int incr1, initial_val, final_val;
00943 png_size_t pixel_bytes;
00944 png_uint_32 i;
00945 register int disp = png_pass_inc[png_ptr->pass];
00946 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
00947
00948 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
00949 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
00950 pixel_bytes;
00951 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
00952 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
00953 final_val = png_ptr->width*pixel_bytes;
00954 incr1 = (disp)*pixel_bytes;
00955 for (i = initial_val; i < final_val; i += incr1)
00956 {
00957 png_memcpy(dstptr, srcptr, pixel_bytes);
00958 srcptr += incr1;
00959 dstptr += incr1;
00960 }
00961 }
00962
00963 break;
00964 }
00965
00966 default:
00967 {
00968 png_bytep sptr;
00969 png_bytep dp;
00970 png_size_t pixel_bytes;
00971 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
00972 unsigned int i;
00973 register int disp = png_pass_inc[png_ptr->pass];
00974 register unsigned int incr1, initial_val, final_val;
00975
00976 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
00977 sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
00978 pixel_bytes;
00979 dp = row + offset_table[png_ptr->pass]*pixel_bytes;
00980 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
00981 final_val = png_ptr->width*pixel_bytes;
00982 incr1 = (disp)*pixel_bytes;
00983 for (i = initial_val; i < final_val; i += incr1)
00984 {
00985 png_memcpy(dp, sptr, pixel_bytes);
00986 sptr += incr1;
00987 dp += incr1;
00988 }
00989 break;
00990 }
00991 }
00992 }
00993
00994 }
00995
00996
00997 #if defined(PNG_READ_INTERLACING_SUPPORTED)
00998
00999 void
01000 png_do_read_interlace(png_structp png_ptr)
01001 {
01002 png_row_infop row_info = &(png_ptr->row_info);
01003 png_bytep row = png_ptr->row_buf + 1;
01004 int pass = png_ptr->pass;
01005 png_uint_32 transformations = png_ptr->transformations;
01006 #ifdef PNG_USE_LOCAL_ARRAYS
01007 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
01008 #endif
01009
01010 png_debug(1,"in png_do_read_interlace\n");
01011
01012 if (mmx_supported == 2) {
01013 #if !defined(PNG_1_0_X)
01014
01015 png_warning(png_ptr, "asm_flags may not have been initialized");
01016 #endif
01017 png_mmx_support();
01018 }
01019
01020 if (row != NULL && row_info != NULL)
01021 {
01022 png_uint_32 final_width;
01023
01024 final_width = row_info->width * png_pass_inc[pass];
01025
01026 switch (row_info->pixel_depth)
01027 {
01028 case 1:
01029 {
01030 png_bytep sp, dp;
01031 int sshift, dshift;
01032 int s_start, s_end, s_inc;
01033 png_byte v;
01034 png_uint_32 i;
01035 int j;
01036
01037 sp = row + (png_size_t)((row_info->width - 1) >> 3);
01038 dp = row + (png_size_t)((final_width - 1) >> 3);
01039 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
01040 if (transformations & PNG_PACKSWAP)
01041 {
01042 sshift = (int)((row_info->width + 7) & 7);
01043 dshift = (int)((final_width + 7) & 7);
01044 s_start = 7;
01045 s_end = 0;
01046 s_inc = -1;
01047 }
01048 else
01049 #endif
01050 {
01051 sshift = 7 - (int)((row_info->width + 7) & 7);
01052 dshift = 7 - (int)((final_width + 7) & 7);
01053 s_start = 0;
01054 s_end = 7;
01055 s_inc = 1;
01056 }
01057
01058 for (i = row_info->width; i; i--)
01059 {
01060 v = (png_byte)((*sp >> sshift) & 0x1);
01061 for (j = 0; j < png_pass_inc[pass]; j++)
01062 {
01063 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
01064 *dp |= (png_byte)(v << dshift);
01065 if (dshift == s_end)
01066 {
01067 dshift = s_start;
01068 dp--;
01069 }
01070 else
01071 dshift += s_inc;
01072 }
01073 if (sshift == s_end)
01074 {
01075 sshift = s_start;
01076 sp--;
01077 }
01078 else
01079 sshift += s_inc;
01080 }
01081 break;
01082 }
01083
01084 case 2:
01085 {
01086 png_bytep sp, dp;
01087 int sshift, dshift;
01088 int s_start, s_end, s_inc;
01089 png_uint_32 i;
01090
01091 sp = row + (png_size_t)((row_info->width - 1) >> 2);
01092 dp = row + (png_size_t)((final_width - 1) >> 2);
01093 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
01094 if (transformations & PNG_PACKSWAP)
01095 {
01096 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
01097 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
01098 s_start = 6;
01099 s_end = 0;
01100 s_inc = -2;
01101 }
01102 else
01103 #endif
01104 {
01105 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
01106 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
01107 s_start = 0;
01108 s_end = 6;
01109 s_inc = 2;
01110 }
01111
01112 for (i = row_info->width; i; i--)
01113 {
01114 png_byte v;
01115 int j;
01116
01117 v = (png_byte)((*sp >> sshift) & 0x3);
01118 for (j = 0; j < png_pass_inc[pass]; j++)
01119 {
01120 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
01121 *dp |= (png_byte)(v << dshift);
01122 if (dshift == s_end)
01123 {
01124 dshift = s_start;
01125 dp--;
01126 }
01127 else
01128 dshift += s_inc;
01129 }
01130 if (sshift == s_end)
01131 {
01132 sshift = s_start;
01133 sp--;
01134 }
01135 else
01136 sshift += s_inc;
01137 }
01138 break;
01139 }
01140
01141 case 4:
01142 {
01143 png_bytep sp, dp;
01144 int sshift, dshift;
01145 int s_start, s_end, s_inc;
01146 png_uint_32 i;
01147
01148 sp = row + (png_size_t)((row_info->width - 1) >> 1);
01149 dp = row + (png_size_t)((final_width - 1) >> 1);
01150 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
01151 if (transformations & PNG_PACKSWAP)
01152 {
01153 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
01154 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
01155 s_start = 4;
01156 s_end = 0;
01157 s_inc = -4;
01158 }
01159 else
01160 #endif
01161 {
01162 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
01163 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
01164 s_start = 0;
01165 s_end = 4;
01166 s_inc = 4;
01167 }
01168
01169 for (i = row_info->width; i; i--)
01170 {
01171 png_byte v;
01172 int j;
01173
01174 v = (png_byte)((*sp >> sshift) & 0xf);
01175 for (j = 0; j < png_pass_inc[pass]; j++)
01176 {
01177 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
01178 *dp |= (png_byte)(v << dshift);
01179 if (dshift == s_end)
01180 {
01181 dshift = s_start;
01182 dp--;
01183 }
01184 else
01185 dshift += s_inc;
01186 }
01187 if (sshift == s_end)
01188 {
01189 sshift = s_start;
01190 sp--;
01191 }
01192 else
01193 sshift += s_inc;
01194 }
01195 break;
01196 }
01197
01198 default:
01199 {
01200 __int64 const4 = 0x0000000000FFFFFF;
01201
01202 __int64 const6 = 0x00000000000000FF;
01203 png_bytep sptr, dp;
01204 png_uint_32 i;
01205 png_size_t pixel_bytes;
01206 int width = row_info->width;
01207
01208 pixel_bytes = (row_info->pixel_depth >> 3);
01209
01210 sptr = row + (width - 1) * pixel_bytes;
01211 dp = row + (final_width - 1) * pixel_bytes;
01212
01213
01214
01215
01216
01217 #if !defined(PNG_1_0_X)
01218 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
01219 )
01220 #else
01221 if (mmx_supported)
01222 #endif
01223 {
01224 if (pixel_bytes == 3)
01225 {
01226 if (((pass == 0) || (pass == 1)) && width)
01227 {
01228 _asm
01229 {
01230 mov esi, sptr
01231 mov edi, dp
01232 mov ecx, width
01233 sub edi, 21
01234 loop_pass0:
01235 movd mm0, [esi] ; X X X X X v2 v1 v0
01236 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
01237 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
01238 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
01239 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
01240 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
01241 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
01242 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
01243 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
01244 movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
01245 psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
01246 movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
01247 punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
01248 movq [edi+16] , mm4
01249 psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
01250 movq [edi+8] , mm3
01251 punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
01252 sub esi, 3
01253 movq [edi], mm0
01254 sub edi, 24
01255
01256 dec ecx
01257 jnz loop_pass0
01258 EMMS
01259 }
01260 }
01261 else if (((pass == 2) || (pass == 3)) && width)
01262 {
01263 _asm
01264 {
01265 mov esi, sptr
01266 mov edi, dp
01267 mov ecx, width
01268 sub edi, 9
01269 loop_pass2:
01270 movd mm0, [esi] ; X X X X X v2 v1 v0
01271 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
01272 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
01273 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
01274 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
01275 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
01276 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
01277 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
01278 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
01279 movq [edi+4], mm0 ; move to memory
01280 psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
01281 movd [edi], mm0 ; move to memory
01282 sub esi, 3
01283 sub edi, 12
01284 dec ecx
01285 jnz loop_pass2
01286 EMMS
01287 }
01288 }
01289 else if (width)
01290 {
01291 int width_mmx = ((width >> 1) << 1) - 8;
01292 if (width_mmx < 0)
01293 width_mmx = 0;
01294 width -= width_mmx;
01295 if (width_mmx)
01296 {
01297 _asm
01298 {
01299 mov esi, sptr
01300 mov edi, dp
01301 mov ecx, width_mmx
01302 sub esi, 3
01303 sub edi, 9
01304 loop_pass4:
01305 movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
01306 movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
01307 movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
01308 psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
01309 pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
01310 psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
01311 por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
01312 movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
01313 psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
01314 movq [edi], mm0 ; move quad to memory
01315 psrlq mm5, 16 ; 0 0 0 0 0 X X v2
01316 pand mm5, const6 ; 0 0 0 0 0 0 0 v2
01317 por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
01318 movd [edi+8], mm6 ; move double to memory
01319 sub esi, 6
01320 sub edi, 12
01321 sub ecx, 2
01322 jnz loop_pass4
01323 EMMS
01324 }
01325 }
01326
01327 sptr -= width_mmx*3;
01328 dp -= width_mmx*6;
01329 for (i = width; i; i--)
01330 {
01331 png_byte v[8];
01332 int j;
01333
01334 png_memcpy(v, sptr, 3);
01335 for (j = 0; j < png_pass_inc[pass]; j++)
01336 {
01337 png_memcpy(dp, v, 3);
01338 dp -= 3;
01339 }
01340 sptr -= 3;
01341 }
01342 }
01343 }
01344
01345 else if (pixel_bytes == 1)
01346 {
01347 if (((pass == 0) || (pass == 1)) && width)
01348 {
01349 int width_mmx = ((width >> 2) << 2);
01350 width -= width_mmx;
01351 if (width_mmx)
01352 {
01353 _asm
01354 {
01355 mov esi, sptr
01356 mov edi, dp
01357 mov ecx, width_mmx
01358 sub edi, 31
01359 sub esi, 3
01360 loop1_pass0:
01361 movd mm0, [esi] ; X X X X v0 v1 v2 v3
01362 movq mm1, mm0 ; X X X X v0 v1 v2 v3
01363 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
01364 movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
01365 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
01366 movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
01367 punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
01368 punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
01369 movq [edi], mm0 ; move to memory v3
01370 punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
01371 movq [edi+8], mm3 ; move to memory v2
01372 movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
01373 punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
01374 punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
01375 movq [edi+16], mm2 ; move to memory v1
01376 movq [edi+24], mm4 ; move to memory v0
01377 sub esi, 4
01378 sub edi, 32
01379 sub ecx, 4
01380 jnz loop1_pass0
01381 EMMS
01382 }
01383 }
01384
01385 sptr -= width_mmx;
01386 dp -= width_mmx*8;
01387 for (i = width; i; i--)
01388 {
01389 int j;
01390
01391
01392
01393
01394
01395
01396
01397
01398
01399
01400
01401
01402
01403
01404
01405
01406
01407
01408
01409 for (j = 0; j < png_pass_inc[pass]; j++)
01410 *dp-- = *sptr;
01411 sptr--;
01412 }
01413 }
01414 else if (((pass == 2) || (pass == 3)) && width)
01415 {
01416 int width_mmx = ((width >> 2) << 2);
01417 width -= width_mmx;
01418 if (width_mmx)
01419 {
01420 _asm
01421 {
01422 mov esi, sptr
01423 mov edi, dp
01424 mov ecx, width_mmx
01425 sub edi, 15
01426 sub esi, 3
01427 loop1_pass2:
01428 movd mm0, [esi] ; X X X X v0 v1 v2 v3
01429 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
01430 movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
01431 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
01432 punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
01433 movq [edi], mm0 ; move to memory v2 and v3
01434 sub esi, 4
01435 movq [edi+8], mm1 ; move to memory v1 and v0
01436 sub edi, 16
01437 sub ecx, 4
01438 jnz loop1_pass2
01439 EMMS
01440 }
01441 }
01442
01443 sptr -= width_mmx;
01444 dp -= width_mmx*4;
01445 for (i = width; i; i--)
01446 {
01447 int j;
01448
01449 for (j = 0; j < png_pass_inc[pass]; j++)
01450 {
01451 *dp-- = *sptr;
01452 }
01453 sptr --;
01454 }
01455 }
01456 else if (width)
01457 {
01458 int width_mmx = ((width >> 3) << 3);
01459 width -= width_mmx;
01460 if (width_mmx)
01461 {
01462 _asm
01463 {
01464 mov esi, sptr
01465 mov edi, dp
01466 mov ecx, width_mmx
01467 sub edi, 15
01468 sub esi, 7
01469 loop1_pass4:
01470 movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
01471 movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
01472 punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
01473
01474 punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
01475 movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
01476 sub esi, 8
01477 movq [edi], mm0 ; move to memory v4 v5 v6 and v7
01478
01479 sub edi, 16
01480 sub ecx, 8
01481 jnz loop1_pass4
01482 EMMS
01483 }
01484 }
01485
01486 sptr -= width_mmx;
01487 dp -= width_mmx*2;
01488 for (i = width; i; i--)
01489 {
01490 int j;
01491
01492 for (j = 0; j < png_pass_inc[pass]; j++)
01493 {
01494 *dp-- = *sptr;
01495 }
01496 sptr --;
01497 }
01498 }
01499 }
01500
01501 else if (pixel_bytes == 2)
01502 {
01503 if (((pass == 0) || (pass == 1)) && width)
01504 {
01505 int width_mmx = ((width >> 1) << 1);
01506 width -= width_mmx;
01507 if (width_mmx)
01508 {
01509 _asm
01510 {
01511 mov esi, sptr
01512 mov edi, dp
01513 mov ecx, width_mmx
01514 sub esi, 2
01515 sub edi, 30
01516 loop2_pass0:
01517 movd mm0, [esi] ; X X X X v1 v0 v3 v2
01518 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
01519 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
01520 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
01521 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
01522 movq [edi], mm0
01523 movq [edi + 8], mm0
01524 movq [edi + 16], mm1
01525 movq [edi + 24], mm1
01526 sub esi, 4
01527 sub edi, 32
01528 sub ecx, 2
01529 jnz loop2_pass0
01530 EMMS
01531 }
01532 }
01533
01534 sptr -= (width_mmx*2 - 2);
01535 dp -= (width_mmx*16 - 2);
01536 for (i = width; i; i--)
01537 {
01538 png_byte v[8];
01539 int j;
01540 sptr -= 2;
01541 png_memcpy(v, sptr, 2);
01542 for (j = 0; j < png_pass_inc[pass]; j++)
01543 {
01544 dp -= 2;
01545 png_memcpy(dp, v, 2);
01546 }
01547 }
01548 }
01549 else if (((pass == 2) || (pass == 3)) && width)
01550 {
01551 int width_mmx = ((width >> 1) << 1) ;
01552 width -= width_mmx;
01553 if (width_mmx)
01554 {
01555 _asm
01556 {
01557 mov esi, sptr
01558 mov edi, dp
01559 mov ecx, width_mmx
01560 sub esi, 2
01561 sub edi, 14
01562 loop2_pass2:
01563 movd mm0, [esi] ; X X X X v1 v0 v3 v2
01564 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
01565 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
01566 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
01567 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
01568 movq [edi], mm0
01569 sub esi, 4
01570 movq [edi + 8], mm1
01571
01572 sub edi, 16
01573 sub ecx, 2
01574 jnz loop2_pass2
01575 EMMS
01576 }
01577 }
01578
01579 sptr -= (width_mmx*2 - 2);
01580 dp -= (width_mmx*8 - 2);
01581 for (i = width; i; i--)
01582 {
01583 png_byte v[8];
01584 int j;
01585 sptr -= 2;
01586 png_memcpy(v, sptr, 2);
01587 for (j = 0; j < png_pass_inc[pass]; j++)
01588 {
01589 dp -= 2;
01590 png_memcpy(dp, v, 2);
01591 }
01592 }
01593 }
01594 else if (width)
01595 {
01596 int width_mmx = ((width >> 1) << 1) ;
01597 width -= width_mmx;
01598 if (width_mmx)
01599 {
01600 _asm
01601 {
01602 mov esi, sptr
01603 mov edi, dp
01604 mov ecx, width_mmx
01605 sub esi, 2
01606 sub edi, 6
01607 loop2_pass4:
01608 movd mm0, [esi] ; X X X X v1 v0 v3 v2
01609 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
01610 sub esi, 4
01611 movq [edi], mm0
01612 sub edi, 8
01613 sub ecx, 2
01614 jnz loop2_pass4
01615 EMMS
01616 }
01617 }
01618
01619 sptr -= (width_mmx*2 - 2);
01620 dp -= (width_mmx*4 - 2);
01621 for (i = width; i; i--)
01622 {
01623 png_byte v[8];
01624 int j;
01625 sptr -= 2;
01626 png_memcpy(v, sptr, 2);
01627 for (j = 0; j < png_pass_inc[pass]; j++)
01628 {
01629 dp -= 2;
01630 png_memcpy(dp, v, 2);
01631 }
01632 }
01633 }
01634 }
01635
01636 else if (pixel_bytes == 4)
01637 {
01638 if (((pass == 0) || (pass == 1)) && width)
01639 {
01640 int width_mmx = ((width >> 1) << 1) ;
01641 width -= width_mmx;
01642 if (width_mmx)
01643 {
01644 _asm
01645 {
01646 mov esi, sptr
01647 mov edi, dp
01648 mov ecx, width_mmx
01649 sub esi, 4
01650 sub edi, 60
01651 loop4_pass0:
01652 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
01653 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
01654 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
01655 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
01656 movq [edi], mm0
01657 movq [edi + 8], mm0
01658 movq [edi + 16], mm0
01659 movq [edi + 24], mm0
01660 movq [edi+32], mm1
01661 movq [edi + 40], mm1
01662 movq [edi+ 48], mm1
01663 sub esi, 8
01664 movq [edi + 56], mm1
01665 sub edi, 64
01666 sub ecx, 2
01667 jnz loop4_pass0
01668 EMMS
01669 }
01670 }
01671
01672 sptr -= (width_mmx*4 - 4);
01673 dp -= (width_mmx*32 - 4);
01674 for (i = width; i; i--)
01675 {
01676 png_byte v[8];
01677 int j;
01678 sptr -= 4;
01679 png_memcpy(v, sptr, 4);
01680 for (j = 0; j < png_pass_inc[pass]; j++)
01681 {
01682 dp -= 4;
01683 png_memcpy(dp, v, 4);
01684 }
01685 }
01686 }
01687 else if (((pass == 2) || (pass == 3)) && width)
01688 {
01689 int width_mmx = ((width >> 1) << 1) ;
01690 width -= width_mmx;
01691 if (width_mmx)
01692 {
01693 _asm
01694 {
01695 mov esi, sptr
01696 mov edi, dp
01697 mov ecx, width_mmx
01698 sub esi, 4
01699 sub edi, 28
01700 loop4_pass2:
01701 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
01702 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
01703 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
01704 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
01705 movq [edi], mm0
01706 movq [edi + 8], mm0
01707 movq [edi+16], mm1
01708 movq [edi + 24], mm1
01709 sub esi, 8
01710 sub edi, 32
01711 sub ecx, 2
01712 jnz loop4_pass2
01713 EMMS
01714 }
01715 }
01716
01717 sptr -= (width_mmx*4 - 4);
01718 dp -= (width_mmx*16 - 4);
01719 for (i = width; i; i--)
01720 {
01721 png_byte v[8];
01722 int j;
01723 sptr -= 4;
01724 png_memcpy(v, sptr, 4);
01725 for (j = 0; j < png_pass_inc[pass]; j++)
01726 {
01727 dp -= 4;
01728 png_memcpy(dp, v, 4);
01729 }
01730 }
01731 }
01732 else if (width)
01733 {
01734 int width_mmx = ((width >> 1) << 1) ;
01735 width -= width_mmx;
01736 if (width_mmx)
01737 {
01738 _asm
01739 {
01740 mov esi, sptr
01741 mov edi, dp
01742 mov ecx, width_mmx
01743 sub esi, 4
01744 sub edi, 12
01745 loop4_pass4:
01746 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
01747 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
01748 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
01749 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
01750 movq [edi], mm0
01751 sub esi, 8
01752 movq [edi + 8], mm1
01753 sub edi, 16
01754 sub ecx, 2
01755 jnz loop4_pass4
01756 EMMS
01757 }
01758 }
01759
01760 sptr -= (width_mmx*4 - 4);
01761 dp -= (width_mmx*8 - 4);
01762 for (i = width; i; i--)
01763 {
01764 png_byte v[8];
01765 int j;
01766 sptr -= 4;
01767 png_memcpy(v, sptr, 4);
01768 for (j = 0; j < png_pass_inc[pass]; j++)
01769 {
01770 dp -= 4;
01771 png_memcpy(dp, v, 4);
01772 }
01773 }
01774 }
01775
01776 }
01777
01778 else if (pixel_bytes == 6)
01779 {
01780 for (i = width; i; i--)
01781 {
01782 png_byte v[8];
01783 int j;
01784 png_memcpy(v, sptr, 6);
01785 for (j = 0; j < png_pass_inc[pass]; j++)
01786 {
01787 png_memcpy(dp, v, 6);
01788 dp -= 6;
01789 }
01790 sptr -= 6;
01791 }
01792 }
01793
01794 else
01795 {
01796 for (i = width; i; i--)
01797 {
01798 png_byte v[8];
01799 int j;
01800 png_memcpy(v, sptr, pixel_bytes);
01801 for (j = 0; j < png_pass_inc[pass]; j++)
01802 {
01803 png_memcpy(dp, v, pixel_bytes);
01804 dp -= pixel_bytes;
01805 }
01806 sptr-= pixel_bytes;
01807 }
01808 }
01809 }
01810
01811 else
01812
01813 {
01814 if (pixel_bytes == 1)
01815 {
01816 for (i = width; i; i--)
01817 {
01818 int j;
01819 for (j = 0; j < png_pass_inc[pass]; j++)
01820 *dp-- = *sptr;
01821 sptr--;
01822 }
01823 }
01824 else if (pixel_bytes == 3)
01825 {
01826 for (i = width; i; i--)
01827 {
01828 png_byte v[8];
01829 int j;
01830 png_memcpy(v, sptr, pixel_bytes);
01831 for (j = 0; j < png_pass_inc[pass]; j++)
01832 {
01833 png_memcpy(dp, v, pixel_bytes);
01834 dp -= pixel_bytes;
01835 }
01836 sptr -= pixel_bytes;
01837 }
01838 }
01839 else if (pixel_bytes == 2)
01840 {
01841 for (i = width; i; i--)
01842 {
01843 png_byte v[8];
01844 int j;
01845 png_memcpy(v, sptr, pixel_bytes);
01846 for (j = 0; j < png_pass_inc[pass]; j++)
01847 {
01848 png_memcpy(dp, v, pixel_bytes);
01849 dp -= pixel_bytes;
01850 }
01851 sptr -= pixel_bytes;
01852 }
01853 }
01854 else if (pixel_bytes == 4)
01855 {
01856 for (i = width; i; i--)
01857 {
01858 png_byte v[8];
01859 int j;
01860 png_memcpy(v, sptr, pixel_bytes);
01861 for (j = 0; j < png_pass_inc[pass]; j++)
01862 {
01863 png_memcpy(dp, v, pixel_bytes);
01864 dp -= pixel_bytes;
01865 }
01866 sptr -= pixel_bytes;
01867 }
01868 }
01869 else if (pixel_bytes == 6)
01870 {
01871 for (i = width; i; i--)
01872 {
01873 png_byte v[8];
01874 int j;
01875 png_memcpy(v, sptr, pixel_bytes);
01876 for (j = 0; j < png_pass_inc[pass]; j++)
01877 {
01878 png_memcpy(dp, v, pixel_bytes);
01879 dp -= pixel_bytes;
01880 }
01881 sptr -= pixel_bytes;
01882 }
01883 }
01884 else
01885 {
01886 for (i = width; i; i--)
01887 {
01888 png_byte v[8];
01889 int j;
01890 png_memcpy(v, sptr, pixel_bytes);
01891 for (j = 0; j < png_pass_inc[pass]; j++)
01892 {
01893 png_memcpy(dp, v, pixel_bytes);
01894 dp -= pixel_bytes;
01895 }
01896 sptr -= pixel_bytes;
01897 }
01898 }
01899
01900 }
01901 break;
01902 }
01903 }
01904
01905 row_info->width = final_width;
01906
01907 row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
01908 }
01909
01910 }
01911
01912 #endif
01913
01914 /* 64-bit constants and scratch values that the row-filter routines in this file use as MMX memory operands. */
01915 /* LBCarryMask has the low bit of every byte set; HBClearMask has every bit except the high bit of each byte set. */
01916 /* ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp and ShiftRem are assigned by the filter routines before the loops that read them. */
01917 /* NOTE(review): these are writable file-scope objects shared by all callers, so the routines look non-reentrant - confirm. The double member presumably forces 8-byte alignment - confirm. */
01918 union uAll {
01919 __int64 use;
01920 double align;
01921 } LBCarryMask = {0x0101010101010101},
01922 HBClearMask = {0x7f7f7f7f7f7f7f7f},
01923 ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
01924 /* Undo the PNG Average filter on one row in place, using MMX for the bulk of the row when bpp is not 1. */
01925 /* row_info supplies pixel_depth and rowbytes; row holds the filtered bytes and receives the result; prev_row is only read (presumably the decoded previous scanline - confirm). */
01926 /* Each byte becomes row[i] + ((row[i-bpp] + prev_row[i]) >> 1) modulo 256; for the first bpp bytes the row[i-bpp] term is absent. */
01927 void
01928 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
01929 , png_bytep prev_row)
01930 {
01931 int bpp; /* bytes per pixel, rounded up to a whole byte */
01932 png_uint_32 FullLength; /* number of bytes in the row */
01933 png_uint_32 MMXLength; /* offset at which the 8-byte loops stop */
01934
01935 int diff; /* offset at which the 8-byte loops start */
01936
01937 bpp = (row_info->pixel_depth + 7) >> 3;
01938 FullLength = row_info->rowbytes;
01939 _asm {
01940 // edi = row, esi = prev_row, ebx = byte offset, edx = row - bpp so that [edx + ebx] is the byte one pixel to the left
01941 mov edi, row
01942 xor ebx, ebx
01943 mov edx, edi
01944 mov esi, prev_row
01945 sub edx, bpp
01946
01947 xor eax, eax
01948 // First bpp bytes have no left pixel: row[i] += prev_row[i] >> 1
01949
01950 davgrlp:
01951 mov al, [esi + ebx]
01952 inc ebx
01953 shr al, 1
01954 add al, [edi+ebx-1]
01955 cmp ebx, bpp
01956 mov [edi+ebx-1], al // mov leaves the flags from cmp intact for the jb below
01957
01958 jb davgrlp
01959 // diff = ((row + ebx + 0xf) & ~7) - row: offset of an 8-byte-aligned address at least 8 bytes past the bytes done so far
01960 mov diff, edi
01961 add diff, ebx
01962 add diff, 0xf
01963 and diff, 0xfffffff8
01964 sub diff, edi
01965 jz davggo
01966 // Scalar loop up to offset diff: row[i] += (row[i-bpp] + prev_row[i]) >> 1
01967 // The sum is formed in 16 bits (ax, cx) so the carry out of the byte add survives the shift
01968
01969 xor ecx, ecx
01970 davglp1:
01971 xor eax, eax
01972 mov cl, [esi + ebx]
01973 mov al, [edx + ebx]
01974 add ax, cx
01975 inc ebx
01976 shr ax, 1
01977 add al, [edi+ebx-1]
01978 cmp ebx, diff
01979 mov [edi+ebx-1], al
01980
01981 jb davglp1
01982 davggo: // MMXLength = FullLength - ((FullLength - ebx) & 7), so MMXLength - ebx is a multiple of 8
01983 mov eax, FullLength
01984 mov ecx, eax
01985 sub eax, ebx
01986 and eax, 0x00000007
01987 sub ecx, eax
01988 mov MMXLength, ecx
01989 }
01990 /* Pick the main loop by pixel size. The MMX loops compare ebx with MMXLength only after each 8-byte pass, so each runs at least once. NOTE(review): callers presumably guarantee rows long enough for that first pass - confirm. */
01991 switch ( bpp )
01992 {
01993 case 3:
01994 {
01995 ActiveMask.use = 0x0000000000ffffff;
01996 ShiftBpp.use = 24;
01997 ShiftRem.use = 40;
01998 _asm {
01999 // mm7 = mask for bytes 0-2, mm5 = low-bit mask, mm4 = 0x7f mask, ebx = starting offset
02000 movq mm7, ActiveMask
02001 mov ebx, diff
02002 movq mm5, LBCarryMask
02003 mov edi, row
02004 movq mm4, HBClearMask
02005 mov esi, prev_row
02006 // prime mm2 with the 8 row bytes that precede offset diff
02007 movq mm2, [edi + ebx - 8]
02008
02009 davg3lp:
02010 movq mm0, [edi + ebx]
02011 // prev_row term: mm3 = low bit of each prev_row byte, mm0 += prev_row / 2 per byte
02012 movq mm3, mm5
02013 psrlq mm2, ShiftRem // bring the last 3 bytes of the previous quadword down to bytes 0-2
02014 movq mm1, [esi + ebx]
02015 movq mm6, mm7
02016 pand mm3, mm1
02017 psrlq mm1, 1
02018 pand mm1, mm4 // clear the bit shifted in from the neighbouring byte
02019 paddb mm0, mm1
02020 // bytes 0-2: add left / 2, plus 1 where the low bits of left and prev_row are both set
02021 movq mm1, mm3
02022 pand mm1, mm2
02023
02024 psrlq mm2, 1
02025 pand mm2, mm4
02026 paddb mm2, mm1
02027 pand mm2, mm6
02028 paddb mm0, mm2
02029 // bytes 3-5: the left pixel is bytes 0-2 of the quadword being built in mm0
02030
02031 psllq mm6, ShiftBpp
02032 movq mm2, mm0
02033 psllq mm2, ShiftBpp
02034 movq mm1, mm3
02035 pand mm1, mm2
02036
02037 psrlq mm2, 1
02038 pand mm2, mm4
02039 paddb mm2, mm1
02040 pand mm2, mm6
02041 paddb mm0, mm2
02042 // bytes 6-7: the left pixel is bytes 3-5; the mask shifted twice covers only the top two bytes
02043
02044
02045 psllq mm6, ShiftBpp
02046
02047 movq mm2, mm0
02048 psllq mm2, ShiftBpp
02049
02050
02051 movq mm1, mm3
02052 pand mm1, mm2
02053
02054 psrlq mm2, 1
02055 pand mm2, mm4
02056 paddb mm2, mm1
02057 pand mm2, mm6
02058 add ebx, 8
02059 paddb mm0, mm2
02060 // store the finished quadword and keep it in mm2 as the left source for the next pass
02061
02062
02063 movq [edi + ebx - 8], mm0
02064
02065 cmp ebx, MMXLength
02066 movq mm2, mm0
02067 jb davg3lp
02068 }
02069 }
02070 break;
02071
02072 case 6:
02073 case 4:
02074 case 7:
02075 case 5:
02076 {
02077 ActiveMask.use = 0xffffffffffffffff;
02078 /* shift counts in bits: ShiftBpp is one pixel, ShiftRem is the rest of the quadword */
02079 ShiftBpp.use = bpp << 3;
02080 ShiftRem.use = 64 - ShiftBpp.use;
02081 _asm {
02082 movq mm4, HBClearMask
02083
02084 mov ebx, diff
02085 // mm7 = mask of the low bpp bytes, mm6 = that mask moved up by one pixel
02086 movq mm7, ActiveMask
02087 mov edi, row
02088 psrlq mm7, ShiftRem
02089 mov esi, prev_row
02090 movq mm6, mm7
02091 movq mm5, LBCarryMask
02092 psllq mm6, ShiftBpp
02093 // prime mm2 with the 8 row bytes that precede offset diff
02094 movq mm2, [edi + ebx - 8]
02095
02096 davg4lp:
02097 movq mm0, [edi + ebx]
02098 psrlq mm2, ShiftRem // last pixel of the previous quadword moved down to the low bytes
02099 movq mm1, [esi + ebx]
02100 // prev_row term: mm3 = low bit of each prev_row byte, mm0 += prev_row / 2 per byte
02101 movq mm3, mm5
02102 pand mm3, mm1
02103 psrlq mm1, 1
02104 pand mm1, mm4
02105 paddb mm0, mm1
02106 // first pixel: add left / 2, plus 1 where the low bits of left and prev_row are both set
02107 movq mm1, mm3
02108 pand mm1, mm2
02109
02110 psrlq mm2, 1
02111 pand mm2, mm4
02112 paddb mm2, mm1
02113 pand mm2, mm7
02114 paddb mm0, mm2
02115 // remaining bytes: the left pixel is the one just finished in mm0, moved up by ShiftBpp
02116
02117 movq mm2, mm0
02118 psllq mm2, ShiftBpp
02119 add ebx, 8
02120 movq mm1, mm3
02121 pand mm1, mm2
02122
02123 psrlq mm2, 1
02124 pand mm2, mm4
02125 paddb mm2, mm1
02126 pand mm2, mm6
02127 paddb mm0, mm2
02128
02129 cmp ebx, MMXLength
02130 // movq does not touch the flags, so the jb below still tests the cmp above
02131 movq [edi + ebx - 8], mm0
02132
02133 movq mm2, mm0
02134 jb davg4lp
02135 }
02136 }
02137 break;
02138 case 2:
02139 {
02140 ActiveMask.use = 0x000000000000ffff;
02141 ShiftBpp.use = 16;
02142 ShiftRem.use = 48;
02143 _asm {
02144 // mm7 = mask for bytes 0-1, mm5 = low-bit mask, mm4 = 0x7f mask
02145 movq mm7, ActiveMask
02146
02147 mov ebx, diff
02148 movq mm5, LBCarryMask
02149 mov edi, row
02150 movq mm4, HBClearMask
02151 mov esi, prev_row
02152 // prime mm2 with the 8 row bytes that precede offset diff
02153 movq mm2, [edi + ebx - 8]
02154
02155 davg2lp:
02156 movq mm0, [edi + ebx]
02157 psrlq mm2, ShiftRem // last pixel of the previous quadword moved down to bytes 0-1
02158 movq mm1, [esi + ebx]
02159 // prev_row term: mm3 = low bit of each prev_row byte, mm0 += prev_row / 2 per byte
02160 movq mm3, mm5
02161 pand mm3, mm1
02162 psrlq mm1, 1
02163 pand mm1, mm4
02164 movq mm6, mm7
02165 paddb mm0, mm1
02166 // pixel 1 (bytes 0-1)
02167 movq mm1, mm3
02168 pand mm1, mm2
02169
02170 psrlq mm2, 1
02171 pand mm2, mm4
02172 paddb mm2, mm1
02173 pand mm2, mm6
02174 paddb mm0, mm2
02175 // pixel 2 (bytes 2-3): left pixel is pixel 1 from mm0
02176 psllq mm6, ShiftBpp
02177 movq mm2, mm0
02178 psllq mm2, ShiftBpp
02179 movq mm1, mm3
02180 pand mm1, mm2
02181
02182 psrlq mm2, 1
02183 pand mm2, mm4
02184 paddb mm2, mm1
02185 pand mm2, mm6
02186 paddb mm0, mm2
02187 // pixel 3 (bytes 4-5)
02188
02189 psllq mm6, ShiftBpp
02190 movq mm2, mm0
02191 psllq mm2, ShiftBpp
02192
02193
02194 movq mm1, mm3
02195 pand mm1, mm2
02196
02197 psrlq mm2, 1
02198 pand mm2, mm4
02199 paddb mm2, mm1
02200 pand mm2, mm6
02201 paddb mm0, mm2
02202 // pixel 4 (bytes 6-7)
02203
02204 psllq mm6, ShiftBpp
02205 movq mm2, mm0
02206 psllq mm2, ShiftBpp
02207
02208
02209 add ebx, 8
02210 movq mm1, mm3
02211 pand mm1, mm2
02212
02213 psrlq mm2, 1
02214 pand mm2, mm4
02215 paddb mm2, mm1
02216 pand mm2, mm6
02217 paddb mm0, mm2
02218
02219 cmp ebx, MMXLength
02220 // store the finished quadword and keep it in mm2 as the left source for the next pass
02221 movq [edi + ebx - 8], mm0
02222
02223 movq mm2, mm0
02224 jb davg2lp
02225 }
02226 }
02227 break;
02228
02229 case 1:
02230 {
02231 _asm {
02232 // one byte per pixel: scalar loop from offset diff to FullLength, skipped when diff >= FullLength
02233 mov ebx, diff
02234 mov edi, row
02235 cmp ebx, FullLength
02236 jnb davg1end
02237
02238 mov esi, prev_row
02239 mov edx, edi
02240 xor ecx, ecx
02241 sub edx, bpp
02242 davg1lp:
02243 // row[ebx] += (row[ebx - bpp] + prev_row[ebx]) >> 1, summed in 16 bits
02244 xor eax, eax
02245 mov cl, [esi + ebx]
02246 mov al, [edx + ebx]
02247 add ax, cx
02248 inc ebx
02249 shr ax, 1
02250 add al, [edi+ebx-1]
02251 cmp ebx, FullLength
02252 mov [edi+ebx-1], al
02253
02254 jb davg1lp
02255 davg1end:
02256 }
02257 }
02258 return; /* no MMX instruction is executed on the bpp == 1 path, so the emms at the end is not needed */
02259
02260 case 8:
02261 {
02262 _asm {
02263 // one whole pixel per quadword: the left pixel is the previous 8 finished bytes, held in mm2
02264 mov ebx, diff
02265 movq mm5, LBCarryMask
02266 mov edi, row
02267 movq mm4, HBClearMask
02268 mov esi, prev_row
02269
02270 movq mm2, [edi + ebx - 8]
02271
02272 davg8lp:
02273 movq mm0, [edi + ebx]
02274 movq mm3, mm5
02275 movq mm1, [esi + ebx]
02276 add ebx, 8
02277 pand mm3, mm1
02278 psrlq mm1, 1
02279 pand mm3, mm2 // 1 in each byte where the low bits of left and prev_row are both set
02280
02281 psrlq mm2, 1
02282 pand mm1, mm4
02283 paddb mm0, mm3
02284 pand mm2, mm4
02285 paddb mm0, mm1
02286 paddb mm0, mm2
02287 cmp ebx, MMXLength
02288 movq [edi + ebx - 8], mm0
02289 movq mm2, mm0
02290 jb davg8lp
02291 }
02292 }
02293 break;
02294 default:
02295 {
02296 _asm {
02297 movq mm5, LBCarryMask
02298 // reached only for bpp outside 1..8: the left pixel is read straight from memory at [edx + ebx] = row[ebx - bpp]
02299 mov ebx, diff
02300 mov edi, row
02301 movq mm4, HBClearMask
02302 mov edx, edi
02303 mov esi, prev_row
02304 sub edx, bpp
02305 davgAlp:
02306 movq mm0, [edi + ebx]
02307 movq mm3, mm5
02308 movq mm1, [esi + ebx]
02309 pand mm3, mm1
02310 movq mm2, [edx + ebx]
02311 psrlq mm1, 1
02312 pand mm3, mm2
02313
02314 psrlq mm2, 1
02315 pand mm1, mm4
02316 paddb mm0, mm3
02317 pand mm2, mm4
02318 paddb mm0, mm1
02319 add ebx, 8
02320 paddb mm0, mm2
02321 cmp ebx, MMXLength
02322 movq [edi + ebx - 8], mm0
02323 jb davgAlp
02324 }
02325 }
02326 break;
02327 }
02328
02329 _asm {
02330 // scalar loop for the bytes from MMXLength up to FullLength, skipped when none remain
02331
02332 mov ebx, MMXLength
02333 mov edi, row
02334 cmp ebx, FullLength
02335 jnb davgend
02336
02337 mov esi, prev_row
02338 mov edx, edi
02339 xor ecx, ecx
02340 sub edx, bpp
02341 davglp2:
02342 // row[ebx] += (row[ebx - bpp] + prev_row[ebx]) >> 1, summed in 16 bits
02343 xor eax, eax
02344 mov cl, [esi + ebx]
02345 mov al, [edx + ebx]
02346 add ax, cx
02347 inc ebx
02348 shr ax, 1
02349 add al, [edi+ebx-1]
02350 cmp ebx, FullLength
02351 mov [edi+ebx-1], al
02352
02353 jb davglp2
02354 davgend:
02355 emms // leave MMX state before returning
02356 }
02357 }
02358 /* Undo the PNG Paeth filter on one row in place: MMX loops for bpp 3 to 8, scalar code for every other bpp. row is read and written; prev_row is only read. */
02359 /* With a = row[i-bpp], b = prev_row[i], c = prev_row[i-bpp], the byte added to row[i] is a if |b-c| <= |a-c| and |b-c| <= |a+b-2c|, else b if |a-c| <= |a+b-2c|, else c; the first bpp bytes just add prev_row[i]. */
02360 void
02361 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
02362 png_bytep prev_row)
02363 {
02364 png_uint_32 FullLength; /* number of bytes in the row */
02365 png_uint_32 MMXLength; /* offset at which the 8-byte loops stop */
02366
02367 int bpp; /* bytes per pixel, rounded up to a whole byte */
02368 int diff; /* offset at which the 8-byte loops start */
02369
02370 int patemp, pbtemp, pctemp; /* scratch for the scalar loops; pbtemp is written but never read back in this routine */
02371
02372 bpp = (row_info->pixel_depth + 7) >> 3;
02373 FullLength = row_info->rowbytes;
02374 _asm
02375 {
02376 xor ebx, ebx
02377 mov edi, row
02378 xor edx, edx
02379 mov esi, prev_row
02380 xor eax, eax
02381 // ebx = current byte offset, edx = offset of the byte one pixel to the left, edi = row, esi = prev_row
02382
02383 // First bpp bytes have no left pixel: row[i] += prev_row[i]
02384
02385 dpthrlp:
02386 mov al, [edi + ebx]
02387 add al, [esi + ebx]
02388 inc ebx
02389 cmp ebx, bpp
02390 mov [edi + ebx - 1], al
02391 jb dpthrlp
02392 // diff = ((row + ebx + 0xf) & ~7) - row: offset of an 8-byte-aligned address at least 8 bytes past the bytes done so far
02393 mov diff, edi
02394 add diff, ebx
02395 xor ecx, ecx
02396 add diff, 0xf
02397 and diff, 0xfffffff8
02398 sub diff, edi
02399 jz dpthgo
02400 // Scalar Paeth loop up to offset diff
02401 dpthlp1:
02402 xor eax, eax
02403 // patemp = b - c, with b = prev_row[ebx] and c = prev_row[edx]
02404 mov al, [esi + ebx]
02405 mov cl, [esi + edx]
02406 sub eax, ecx
02407 mov patemp, eax
02408 xor eax, eax
02409 // ecx = a - c, with a = row[edx]
02410 mov al, [edi + edx]
02411 sub eax, ecx
02412 mov ecx, eax
02413 // eax = (a - c) + (b - c)
02414 add eax, patemp
02415 // pctemp = |eax|
02416 test eax, 0x80000000
02417 jz dpthpca
02418 neg eax
02419 dpthpca:
02420 mov pctemp, eax
02421 // ecx = pbtemp = |a - c|
02422 test ecx, 0x80000000
02423 jz dpthpba
02424 neg ecx
02425 dpthpba:
02426 mov pbtemp, ecx
02427 // eax = patemp = |b - c|
02428 mov eax, patemp
02429 test eax, 0x80000000
02430 jz dpthpaa
02431 neg eax
02432 dpthpaa:
02433 mov patemp, eax
02434 // choose the predictor byte into cl; jna is taken on below-or-equal, so ties go to a, then b, then c
02435 cmp eax, ecx
02436 jna dpthabb
02437 // patemp > pbtemp: choose between b and c
02438 cmp ecx, pctemp
02439 jna dpthbbc
02440 // pbtemp > pctemp: use c
02441 mov cl, [esi + edx]
02442 jmp dpthpaeth
02443 dpthbbc:
02444 // pbtemp <= pctemp: use b
02445 mov cl, [esi + ebx]
02446 jmp dpthpaeth
02447 dpthabb:
02448 // patemp <= pbtemp: choose between a and c
02449 cmp eax, pctemp
02450 jna dpthabc
02451 // patemp > pctemp: use c
02452 mov cl, [esi + edx]
02453 jmp dpthpaeth
02454 dpthabc:
02455 // patemp <= pctemp: use a
02456 mov cl, [edi + edx]
02457 dpthpaeth:
02458 inc ebx
02459 inc edx
02460 // add the chosen predictor to the current byte
02461 add [edi + ebx - 1], cl
02462 cmp ebx, diff
02463 jb dpthlp1
02464 dpthgo: // MMXLength = FullLength - ((FullLength - ebx) & 7), so MMXLength - ebx is a multiple of 8
02465 mov ecx, FullLength
02466 mov eax, ecx
02467 sub eax, ebx
02468 and eax, 0x00000007
02469 sub ecx, eax
02470 mov MMXLength, ecx
02471 }
02472 /* Pick the main loop by pixel size. The MMX loops compare ebx with MMXLength only after each 8-byte pass, so each runs at least once. NOTE(review): callers presumably guarantee rows long enough for that first pass - confirm. */
02473 switch ( bpp )
02474 {
02475 case 3:
02476 {
02477 ActiveMask.use = 0x0000000000ffffff;
02478 ActiveMaskEnd.use = 0xffff000000000000;
02479 ShiftBpp.use = 24;
02480 ShiftRem.use = 40;
02481 _asm
02482 {
02483 mov ebx, diff
02484 mov edi, row
02485 mov esi, prev_row
02486 pxor mm0, mm0
02487 // mm0 = 0 for byte-to-word unpacking; mm1 = the 8 row bytes that precede offset diff
02488 movq mm1, [edi+ebx-8]
02489 dpth3lp:
02490 psrlq mm1, ShiftRem // a: last 3 finished bytes moved down to bytes 0-2
02491 movq mm2, [esi + ebx] // b
02492 punpcklbw mm1, mm0
02493 movq mm3, [esi+ebx-8] // c once shifted by ShiftRem below
02494 punpcklbw mm2, mm0
02495 psrlq mm3, ShiftRem
02496 // bytes 0-2: mm1 = a, mm2 = b, mm3 = c as 16-bit lanes
02497 movq mm4, mm2
02498 punpcklbw mm3, mm0
02499
02500 movq mm5, mm1
02501 psubw mm4, mm3
02502 pxor mm7, mm7
02503
02504 movq mm6, mm4
02505 psubw mm5, mm3
02506 // mm4 = b - c (pa before abs), mm5 = a - c (pb before abs), mm6 becomes their sum (pc before abs)
02507 // absolute values: each negative lane is isolated with pcmpgtw/pand and then subtracted twice
02508
02509
02510 pcmpgtw mm0, mm4
02511 paddw mm6, mm5
02512 pand mm0, mm4
02513 pcmpgtw mm7, mm5
02514 psubw mm4, mm0
02515 pand mm7, mm5
02516 psubw mm4, mm0
02517 psubw mm5, mm7
02518 pxor mm0, mm0
02519 pcmpgtw mm0, mm6
02520 pand mm0, mm6
02521 psubw mm5, mm7
02522 psubw mm6, mm0
02523 // mm7 = lanes where pa > pb
02524 movq mm7, mm4
02525 psubw mm6, mm0
02526 pcmpgtw mm7, mm5
02527 movq mm0, mm7
02528 // build mm7 = min(pa, pb) and mm0 = (pa <= pb ? a : b)
02529 pand mm5, mm7
02530
02531 pand mm2, mm0
02532 pandn mm7, mm4
02533 pandn mm0, mm1
02534 paddw mm7, mm5
02535 paddw mm0, mm2
02536 // where that minimum exceeds pc use c, otherwise keep mm0
02537 pcmpgtw mm7, mm6
02538 pxor mm1, mm1
02539 pand mm3, mm7
02540 pandn mm7, mm0
02541 paddw mm7, mm3
02542 pxor mm0, mm0
02543 packuswb mm7, mm1 // predictors back to bytes, in the low half
02544 movq mm3, [esi + ebx]
02545 pand mm7, ActiveMask // keep the predictor for bytes 0-2 only
02546 movq mm2, mm3
02547 paddb mm7, [edi + ebx]
02548 punpcklbw mm3, mm0
02549 movq [edi + ebx], mm7
02550 movq mm1, mm7
02551 // bytes 3-5: a = bytes 0-2 just stored, b = prev_row shifted down one pixel, c = prev_row bytes 0-2
02552 psrlq mm2, ShiftBpp
02553 punpcklbw mm1, mm0
02554 pxor mm7, mm7
02555 punpcklbw mm2, mm0
02556
02557 movq mm5, mm1
02558
02559 movq mm4, mm2
02560 psubw mm5, mm3
02561 psubw mm4, mm3
02562 // mm4 = b - c, mm5 = a - c, mm6 = their sum
02563
02564 movq mm6, mm5
02565 paddw mm6, mm4
02566 // absolute values of mm4, mm5 and mm6
02567
02568
02569
02570 pcmpgtw mm0, mm5
02571 pcmpgtw mm7, mm4
02572 pand mm0, mm5
02573 pand mm7, mm4
02574 psubw mm5, mm0
02575 psubw mm4, mm7
02576 psubw mm5, mm0
02577 psubw mm4, mm7
02578 pxor mm0, mm0
02579 pcmpgtw mm0, mm6
02580 pand mm0, mm6
02581 psubw mm6, mm0
02582 // mm7 = lanes where pa > pb
02583 movq mm7, mm4
02584 psubw mm6, mm0
02585 pcmpgtw mm7, mm5
02586 movq mm0, mm7
02587 // build mm7 = min(pa, pb) and mm0 = (pa <= pb ? a : b)
02588 pand mm5, mm7
02589
02590 pand mm2, mm0
02591 pandn mm7, mm4
02592 pandn mm0, mm1
02593 paddw mm7, mm5
02594 paddw mm0, mm2
02595 // where that minimum exceeds pc use c, otherwise keep mm0
02596 pcmpgtw mm7, mm6
02597 movq mm2, [esi + ebx]
02598 pand mm3, mm7
02599 pandn mm7, mm0
02600 pxor mm1, mm1
02601 paddw mm7, mm3
02602 pxor mm0, mm0
02603 packuswb mm7, mm1
02604 movq mm3, mm2
02605 pand mm7, ActiveMask
02606 punpckhbw mm2, mm0
02607 psllq mm7, ShiftBpp // move the predictor up to bytes 3-5
02608 // bytes 6-7: work in the high halves; shifting row and prev_row up one pixel lines a and c up with b
02609 movq mm4, mm2
02610 paddb mm7, [edi + ebx]
02611 psllq mm3, ShiftBpp
02612 movq [edi + ebx], mm7
02613 movq mm1, mm7
02614 punpckhbw mm3, mm0
02615 psllq mm1, ShiftBpp
02616
02617
02618 pxor mm7, mm7
02619 punpckhbw mm1, mm0
02620 psubw mm4, mm3
02621
02622 movq mm5, mm1
02623
02624 movq mm6, mm4
02625 psubw mm5, mm3
02626 pxor mm0, mm0
02627 paddw mm6, mm5
02628 // mm4 = b - c, mm5 = a - c, mm6 = their sum; absolute values follow
02629
02630
02631
02632 pcmpgtw mm0, mm4
02633 pcmpgtw mm7, mm5
02634 pand mm0, mm4
02635 pand mm7, mm5
02636 psubw mm4, mm0
02637 psubw mm5, mm7
02638 psubw mm4, mm0
02639 psubw mm5, mm7
02640 pxor mm0, mm0
02641 pcmpgtw mm0, mm6
02642 pand mm0, mm6
02643 psubw mm6, mm0
02644 // mm7 = lanes where pa > pb
02645 movq mm7, mm4
02646 psubw mm6, mm0
02647 pcmpgtw mm7, mm5
02648 movq mm0, mm7
02649 // build mm0 = (pa <= pb ? a : b) and mm7 = min(pa, pb)
02650 pand mm2, mm0
02651
02652 pand mm5, mm7
02653 pandn mm0, mm1
02654 pandn mm7, mm4
02655 paddw mm0, mm2
02656 paddw mm7, mm5
02657 // where that minimum exceeds pc use c, otherwise keep mm0
02658 pcmpgtw mm7, mm6
02659 pand mm3, mm7
02660 pandn mm7, mm0
02661 paddw mm7, mm3
02662 pxor mm1, mm1
02663 packuswb mm1, mm7
02664 // packuswb put the four predictors in bytes 4-7 of mm1; ActiveMaskEnd keeps bytes 6-7
02665 add ebx, 8
02666 pand mm1, ActiveMaskEnd
02667 paddb mm1, [edi + ebx - 8]
02668
02669 cmp ebx, MMXLength
02670 pxor mm0, mm0
02671 movq [edi + ebx - 8], mm1
02672 // mm1 now holds the finished quadword and is the source of a for the next pass
02673
02674 jb dpth3lp
02675 }
02676 }
02677 break;
02678
02679 case 6:
02680 case 7:
02681 case 5:
02682 {
02683 ActiveMask.use = 0x00000000ffffffff;
02684 ActiveMask2.use = 0xffffffff00000000; /* assigned here but not referenced by the loop below */
02685 ShiftBpp.use = bpp << 3;
02686 ShiftRem.use = 64 - ShiftBpp.use;
02687 _asm
02688 {
02689 mov ebx, diff
02690 mov edi, row
02691 mov esi, prev_row
02692 // mm1 = the 8 row bytes that precede offset diff; mm0 = 0 for byte-to-word unpacking
02693 movq mm1, [edi+ebx-8]
02694 pxor mm0, mm0
02695 dpth6lp:
02696 // bytes 0-3: a and c are the tails (>> ShiftRem) of the previous row and prev_row quadwords, b = prev_row bytes 0-3
02697 psrlq mm1, ShiftRem
02698
02699 movq mm3, [esi+ebx-8]
02700 punpcklbw mm1, mm0
02701 movq mm2, [esi + ebx]
02702 punpcklbw mm2, mm0
02703
02704 psrlq mm3, ShiftRem
02705
02706 movq mm4, mm2
02707 punpcklbw mm3, mm0
02708
02709 movq mm5, mm1
02710 psubw mm4, mm3
02711 pxor mm7, mm7
02712
02713 movq mm6, mm4
02714 psubw mm5, mm3
02715 // mm4 = b - c, mm5 = a - c, mm6 becomes their sum
02716 // absolute values: each negative lane is isolated with pcmpgtw/pand and then subtracted twice
02717
02718 pcmpgtw mm0, mm4
02719 paddw mm6, mm5
02720 pand mm0, mm4
02721 pcmpgtw mm7, mm5
02722 psubw mm4, mm0
02723 pand mm7, mm5
02724 psubw mm4, mm0
02725 psubw mm5, mm7
02726 pxor mm0, mm0
02727 pcmpgtw mm0, mm6
02728 pand mm0, mm6
02729 psubw mm5, mm7
02730 psubw mm6, mm0
02731 // mm7 = lanes where pa > pb
02732 movq mm7, mm4
02733 psubw mm6, mm0
02734 pcmpgtw mm7, mm5
02735 movq mm0, mm7
02736 // build mm7 = min(pa, pb) and mm0 = (pa <= pb ? a : b)
02737 pand mm5, mm7
02738
02739 pand mm2, mm0
02740 pandn mm7, mm4
02741 pandn mm0, mm1
02742 paddw mm7, mm5
02743 paddw mm0, mm2
02744 // where that minimum exceeds pc use c, otherwise keep mm0
02745 pcmpgtw mm7, mm6
02746 pxor mm1, mm1
02747 pand mm3, mm7
02748 pandn mm7, mm0
02749 paddw mm7, mm3
02750 pxor mm0, mm0
02751 packuswb mm7, mm1
02752 movq mm3, [esi + ebx - 8] // bytes 4-7: rebuild c and a across the quadword boundary
02753 pand mm7, ActiveMask
02754 psrlq mm3, ShiftRem
02755 movq mm2, [esi + ebx]
02756 paddb mm7, [edi + ebx]
02757 movq mm6, mm2
02758 movq [edi + ebx], mm7
02759 movq mm1, [edi+ebx-8]
02760 psllq mm6, ShiftBpp
02761 movq mm5, mm7
02762 psrlq mm1, ShiftRem
02763 por mm3, mm6 // c = tail of the previous prev_row quadword | current prev_row quadword moved up one pixel
02764 psllq mm5, ShiftBpp
02765 punpckhbw mm3, mm0
02766 por mm1, mm5 // a is assembled the same way from row
02767
02768 punpckhbw mm2, mm0
02769 punpckhbw mm1, mm0
02770
02771 movq mm4, mm2
02772
02773 movq mm5, mm1
02774 psubw mm4, mm3
02775 pxor mm7, mm7
02776
02777 movq mm6, mm4
02778 psubw mm5, mm3
02779 // mm4 = b - c, mm5 = a - c, mm6 becomes their sum; absolute values follow
02780
02781
02782 pcmpgtw mm0, mm4
02783 paddw mm6, mm5
02784 pand mm0, mm4
02785 pcmpgtw mm7, mm5
02786 psubw mm4, mm0
02787 pand mm7, mm5
02788 psubw mm4, mm0
02789 psubw mm5, mm7
02790 pxor mm0, mm0
02791 pcmpgtw mm0, mm6
02792 pand mm0, mm6
02793 psubw mm5, mm7
02794 psubw mm6, mm0
02795 // mm7 = lanes where pa > pb
02796 movq mm7, mm4
02797 psubw mm6, mm0
02798 pcmpgtw mm7, mm5
02799 movq mm0, mm7
02800 // build mm7 = min(pa, pb) and mm0 = (pa <= pb ? a : b)
02801 pand mm5, mm7
02802
02803 pand mm2, mm0
02804 pandn mm7, mm4
02805 pandn mm0, mm1
02806 paddw mm7, mm5
02807 paddw mm0, mm2
02808 // where that minimum exceeds pc use c, otherwise keep mm0
02809 pcmpgtw mm7, mm6
02810 pxor mm1, mm1
02811 pand mm3, mm7
02812 pandn mm7, mm0
02813 pxor mm1, mm1
02814 paddw mm7, mm3
02815 pxor mm0, mm0
02816 // mm1 is zero, so packuswb puts the predictors in bytes 4-7; they are then added to the row
02817 add ebx, 8
02818 packuswb mm1, mm7
02819 paddb mm1, [edi + ebx - 8]
02820 cmp ebx, MMXLength
02821 movq [edi + ebx - 8], mm1
02822 // mm1 now holds the finished quadword and is the source of a for the next pass
02823 jb dpth6lp
02824 }
02825 }
02826 break;
02827
02828 case 4:
02829 {
02830 ActiveMask.use = 0x00000000ffffffff;
02831 _asm {
02832 mov ebx, diff
02833 mov edi, row
02834 mov esi, prev_row
02835 pxor mm0, mm0
02836 // mm0 = 0 for byte-to-word unpacking; mm1 = the 8 row bytes that precede offset diff
02837 movq mm1, [edi+ebx-8]
02838
02839 dpth4lp:
02840 // first pixel: a and c are the high 4 bytes of the previous row and prev_row quadwords, b = prev_row bytes 0-3
02841 movq mm3, [esi+ebx-8]
02842 punpckhbw mm1, mm0
02843 movq mm2, [esi + ebx]
02844 punpcklbw mm2, mm0
02845
02846 movq mm4, mm2
02847 punpckhbw mm3, mm0
02848
02849 movq mm5, mm1
02850 psubw mm4, mm3
02851 pxor mm7, mm7
02852
02853 movq mm6, mm4
02854 psubw mm5, mm3
02855 // mm4 = b - c, mm5 = a - c, mm6 becomes their sum
02856 // absolute values: each negative lane is isolated with pcmpgtw/pand and then subtracted twice
02857
02858 pcmpgtw mm0, mm4
02859 paddw mm6, mm5
02860 pand mm0, mm4
02861 pcmpgtw mm7, mm5
02862 psubw mm4, mm0
02863 pand mm7, mm5
02864 psubw mm4, mm0
02865 psubw mm5, mm7
02866 pxor mm0, mm0
02867 pcmpgtw mm0, mm6
02868 pand mm0, mm6
02869 psubw mm5, mm7
02870 psubw mm6, mm0
02871 // mm7 = lanes where pa > pb
02872 movq mm7, mm4
02873 psubw mm6, mm0
02874 pcmpgtw mm7, mm5
02875 movq mm0, mm7
02876 // build mm7 = min(pa, pb) and mm0 = (pa <= pb ? a : b)
02877 pand mm5, mm7
02878
02879 pand mm2, mm0
02880 pandn mm7, mm4
02881 pandn mm0, mm1
02882 paddw mm7, mm5
02883 paddw mm0, mm2
02884 // where that minimum exceeds pc use c, otherwise keep mm0
02885 pcmpgtw mm7, mm6
02886 pxor mm1, mm1
02887 pand mm3, mm7
02888 pandn mm7, mm0
02889 paddw mm7, mm3
02890 pxor mm0, mm0
02891 packuswb mm7, mm1
02892 movq mm3, [esi + ebx]
02893 pand mm7, ActiveMask
02894 movq mm2, mm3
02895 paddb mm7, [edi + ebx]
02896 punpcklbw mm3, mm0
02897 movq [edi + ebx], mm7
02898 movq mm1, mm7
02899 // second pixel: c = prev_row bytes 0-3, b = prev_row bytes 4-7, a = the 4 bytes just stored
02900 punpckhbw mm2, mm0
02901 punpcklbw mm1, mm0
02902
02903 movq mm4, mm2
02904
02905 movq mm5, mm1
02906 psubw mm4, mm3
02907 pxor mm7, mm7
02908
02909 movq mm6, mm4
02910 psubw mm5, mm3
02911 // mm4 = b - c, mm5 = a - c, mm6 becomes their sum; absolute values follow
02912
02913
02914 pcmpgtw mm0, mm4
02915 paddw mm6, mm5
02916 pand mm0, mm4
02917 pcmpgtw mm7, mm5
02918 psubw mm4, mm0
02919 pand mm7, mm5
02920 psubw mm4, mm0
02921 psubw mm5, mm7
02922 pxor mm0, mm0
02923 pcmpgtw mm0, mm6
02924 pand mm0, mm6
02925 psubw mm5, mm7
02926 psubw mm6, mm0
02927 // mm7 = lanes where pa > pb
02928 movq mm7, mm4
02929 psubw mm6, mm0
02930 pcmpgtw mm7, mm5
02931 movq mm0, mm7
02932 // build mm7 = min(pa, pb) and mm0 = (pa <= pb ? a : b)
02933 pand mm5, mm7
02934
02935 pand mm2, mm0
02936 pandn mm7, mm4
02937 pandn mm0, mm1
02938 paddw mm7, mm5
02939 paddw mm0, mm2
02940 // where that minimum exceeds pc use c, otherwise keep mm0
02941 pcmpgtw mm7, mm6
02942 pxor mm1, mm1
02943 pand mm3, mm7
02944 pandn mm7, mm0
02945 pxor mm1, mm1
02946 paddw mm7, mm3
02947 pxor mm0, mm0
02948 // mm1 is zero, so packuswb puts the predictors in bytes 4-7; they are then added to the row
02949 add ebx, 8
02950 packuswb mm1, mm7
02951 paddb mm1, [edi + ebx - 8]
02952 cmp ebx, MMXLength
02953 movq [edi + ebx - 8], mm1
02954 // mm1 now holds the finished quadword; its high half is a for the next pass
02955 jb dpth4lp
02956 }
02957 }
02958 break;
02959 case 8:
02960 {
02961 ActiveMask.use = 0x00000000ffffffff;
02962 _asm {
02963 mov ebx, diff
02964 mov edi, row
02965 mov esi, prev_row
02966 pxor mm0, mm0
02967 // mm0 = 0 for byte-to-word unpacking; mm1 = the 8 row bytes that precede offset diff
02968 movq mm1, [edi+ebx-8]
02969
02970 dpth8lp:
02971 // low 4 bytes of the pixel: a and c are the low 4 bytes of the previous row and prev_row quadwords
02972 movq mm3, [esi+ebx-8]
02973 punpcklbw mm1, mm0
02974 movq mm2, [esi + ebx]
02975 punpcklbw mm2, mm0
02976
02977 movq mm4, mm2
02978 punpcklbw mm3, mm0
02979
02980 movq mm5, mm1
02981 psubw mm4, mm3
02982 pxor mm7, mm7
02983
02984 movq mm6, mm4
02985 psubw mm5, mm3
02986 // mm4 = b - c, mm5 = a - c, mm6 becomes their sum
02987 // absolute values: each negative lane is isolated with pcmpgtw/pand and then subtracted twice
02988
02989 pcmpgtw mm0, mm4
02990 paddw mm6, mm5
02991 pand mm0, mm4
02992 pcmpgtw mm7, mm5
02993 psubw mm4, mm0
02994 pand mm7, mm5
02995 psubw mm4, mm0
02996 psubw mm5, mm7
02997 pxor mm0, mm0
02998 pcmpgtw mm0, mm6
02999 pand mm0, mm6
03000 psubw mm5, mm7
03001 psubw mm6, mm0
03002 // mm7 = lanes where pa > pb
03003 movq mm7, mm4
03004 psubw mm6, mm0
03005 pcmpgtw mm7, mm5
03006 movq mm0, mm7
03007 // build mm7 = min(pa, pb) and mm0 = (pa <= pb ? a : b)
03008 pand mm5, mm7
03009
03010 pand mm2, mm0
03011 pandn mm7, mm4
03012 pandn mm0, mm1
03013 paddw mm7, mm5
03014 paddw mm0, mm2
03015 // where that minimum exceeds pc use c, otherwise keep mm0
03016 pcmpgtw mm7, mm6
03017 pxor mm1, mm1
03018 pand mm3, mm7
03019 pandn mm7, mm0
03020 paddw mm7, mm3
03021 pxor mm0, mm0
03022 packuswb mm7, mm1
03023 movq mm3, [esi+ebx-8]
03024 pand mm7, ActiveMask
03025 movq mm2, [esi + ebx]
03026 paddb mm7, [edi + ebx]
03027 punpckhbw mm3, mm0
03028 movq [edi + ebx], mm7
03029 movq mm1, [edi+ebx-8]
03030 // high 4 bytes of the pixel: a, b and c are the high halves of the same three quadwords
03031
03032 punpckhbw mm2, mm0
03033 punpckhbw mm1, mm0
03034
03035 movq mm4, mm2
03036
03037 movq mm5, mm1
03038 psubw mm4, mm3
03039 pxor mm7, mm7
03040
03041 movq mm6, mm4
03042 psubw mm5, mm3
03043 // mm4 = b - c, mm5 = a - c, mm6 becomes their sum; absolute values follow
03044
03045
03046 pcmpgtw mm0, mm4
03047 paddw mm6, mm5
03048 pand mm0, mm4
03049 pcmpgtw mm7, mm5
03050 psubw mm4, mm0
03051 pand mm7, mm5
03052 psubw mm4, mm0
03053 psubw mm5, mm7
03054 pxor mm0, mm0
03055 pcmpgtw mm0, mm6
03056 pand mm0, mm6
03057 psubw mm5, mm7
03058 psubw mm6, mm0
03059 // mm7 = lanes where pa > pb
03060 movq mm7, mm4
03061 psubw mm6, mm0
03062 pcmpgtw mm7, mm5
03063 movq mm0, mm7
03064 // build mm7 = min(pa, pb) and mm0 = (pa <= pb ? a : b)
03065 pand mm5, mm7
03066
03067 pand mm2, mm0
03068 pandn mm7, mm4
03069 pandn mm0, mm1
03070 paddw mm7, mm5
03071 paddw mm0, mm2
03072 // where that minimum exceeds pc use c, otherwise keep mm0
03073 pcmpgtw mm7, mm6
03074 pxor mm1, mm1
03075 pand mm3, mm7
03076 pandn mm7, mm0
03077 pxor mm1, mm1
03078 paddw mm7, mm3
03079 pxor mm0, mm0
03080 // mm1 is zero, so packuswb puts the predictors in bytes 4-7; they are then added to the row
03081 add ebx, 8
03082 packuswb mm1, mm7
03083 paddb mm1, [edi + ebx - 8]
03084 cmp ebx, MMXLength
03085 movq [edi + ebx - 8], mm1
03086 // mm1 now holds the finished quadword; its low half is a for the next pass
03087 jb dpth8lp
03088 }
03089 }
03090 break;
03091
03092 case 1:
03093 case 2:
03094 default:
03095 {
03096 _asm {
03097 mov ebx, diff
03098 cmp ebx, FullLength
03099 jnb dpthdend
03100 mov edi, row
03101 mov esi, prev_row
03102 // scalar Paeth loop from offset diff to FullLength; edx = ebx - bpp indexes the pixel to the left
03103 mov edx, ebx
03104 xor ecx, ecx
03105 sub edx, bpp
03106 dpthdlp:
03107 xor eax, eax
03108 // patemp = b - c, with b = prev_row[ebx] and c = prev_row[edx]
03109 mov al, [esi + ebx]
03110 mov cl, [esi + edx]
03111 sub eax, ecx
03112 mov patemp, eax
03113 xor eax, eax
03114 // ecx = a - c, with a = row[edx]
03115 mov al, [edi + edx]
03116 sub eax, ecx
03117 mov ecx, eax
03118 // eax = (a - c) + (b - c)
03119 add eax, patemp
03120 // pctemp = |eax|
03121 test eax, 0x80000000
03122 jz dpthdpca
03123 neg eax
03124 dpthdpca:
03125 mov pctemp, eax
03126 // ecx = pbtemp = |a - c|
03127 test ecx, 0x80000000
03128 jz dpthdpba
03129 neg ecx
03130 dpthdpba:
03131 mov pbtemp, ecx
03132 // eax = patemp = |b - c|
03133 mov eax, patemp
03134 test eax, 0x80000000
03135 jz dpthdpaa
03136 neg eax
03137 dpthdpaa:
03138 mov patemp, eax
03139 // choose the predictor byte into cl; ties go to a, then b, then c
03140 cmp eax, ecx
03141 jna dpthdabb
03142 // patemp > pbtemp: choose between b and c
03143 cmp ecx, pctemp
03144 jna dpthdbbc
03145 // use c
03146 mov cl, [esi + edx]
03147 jmp dpthdpaeth
03148 dpthdbbc:
03149 // use b
03150 mov cl, [esi + ebx]
03151 jmp dpthdpaeth
03152 dpthdabb:
03153 // patemp <= pbtemp: choose between a and c
03154 cmp eax, pctemp
03155 jna dpthdabc
03156 // use c
03157 mov cl, [esi + edx]
03158 jmp dpthdpaeth
03159 dpthdabc:
03160 // use a
03161 mov cl, [edi + edx]
03162 dpthdpaeth:
03163 inc ebx
03164 inc edx
03165 // add the chosen predictor to the current byte
03166 add [edi + ebx - 1], cl
03167 cmp ebx, FullLength
03168 jb dpthdlp
03169 dpthdend:
03170 }
03171 }
03172 return; /* no MMX instruction is executed on this path, so the emms at the end is not needed */
03173 }
03174 _asm
03175 {
03176 // scalar Paeth loop for the bytes from MMXLength up to FullLength, skipped when none remain
03177
03178 mov ebx, MMXLength
03179 cmp ebx, FullLength
03180 jnb dpthend
03181 mov edi, row
03182 mov esi, prev_row
03183 // edx = ebx - bpp indexes the pixel to the left
03184 mov edx, ebx
03185 xor ecx, ecx
03186 sub edx, bpp
03187 dpthlp2:
03188 xor eax, eax
03189 // patemp = b - c, with b = prev_row[ebx] and c = prev_row[edx]
03190 mov al, [esi + ebx]
03191 mov cl, [esi + edx]
03192 sub eax, ecx
03193 mov patemp, eax
03194 xor eax, eax
03195 // ecx = a - c, with a = row[edx]
03196 mov al, [edi + edx]
03197 sub eax, ecx
03198 mov ecx, eax
03199 // eax = (a - c) + (b - c)
03200 add eax, patemp
03201 // pctemp = |eax|
03202 test eax, 0x80000000
03203 jz dpthpca2
03204 neg eax
03205 dpthpca2:
03206 mov pctemp, eax
03207 // ecx = pbtemp = |a - c|
03208 test ecx, 0x80000000
03209 jz dpthpba2
03210 neg ecx
03211 dpthpba2:
03212 mov pbtemp, ecx
03213 // eax = patemp = |b - c|
03214 mov eax, patemp
03215 test eax, 0x80000000
03216 jz dpthpaa2
03217 neg eax
03218 dpthpaa2:
03219 mov patemp, eax
03220 // choose the predictor byte into cl; ties go to a, then b, then c
03221 cmp eax, ecx
03222 jna dpthabb2
03223 // patemp > pbtemp: choose between b and c
03224 cmp ecx, pctemp
03225 jna dpthbbc2
03226 // use c
03227 mov cl, [esi + edx]
03228 jmp dpthpaeth2
03229 dpthbbc2:
03230 // use b
03231 mov cl, [esi + ebx]
03232 jmp dpthpaeth2
03233 dpthabb2:
03234 // patemp <= pbtemp: choose between a and c
03235 cmp eax, pctemp
03236 jna dpthabc2
03237 // use c
03238 mov cl, [esi + edx]
03239 jmp dpthpaeth2
03240 dpthabc2:
03241 // use a
03242 mov cl, [edi + edx]
03243 dpthpaeth2:
03244 inc ebx
03245 inc edx
03246 // add the chosen predictor to the current byte
03247 add [edi + ebx - 1], cl
03248 cmp ebx, FullLength
03249 jb dpthlp2
03250 dpthend:
03251 emms // leave MMX state before returning
03252 }
03253 }
03254
03255
/* Read-side "Sub" filter for one row, performed in place with MSVC inline
 * assembly and MMX: every byte at index i >= bpp has the byte at index
 * i - bpp of the same row added to it (modulo 256).
 *
 * row_info - supplies pixel_depth and rowbytes for the row
 * row      - the row bytes; modified in place
 *
 * The work is done in three stages: a byte loop that advances to an 8-byte
 * boundary, an MMX loop selected by bpp (bytes per pixel), and a byte loop
 * for whatever is left.  ActiveMask, ShiftBpp and ShiftRem are declared
 * outside this function.
 *
 * NOTE(review): the alignment loop and the MMX loops are bottom-tested and
 * the alignment loop is not bounded by FullLength, so very short rows look
 * like they would be over-processed -- confirm what the callers guarantee.
 */
03256 void
03257 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
03258 {
03259
03260 int bpp;
03261 png_uint_32 FullLength;
03262 png_uint_32 MMXLength;
03263 int diff;
03264
03265 bpp = (row_info->pixel_depth + 7) >> 3; /* bytes per pixel, rounded up */
03266 FullLength = row_info->rowbytes - bpp; /* number of bytes that get a left neighbour added */
03267 _asm {
03268 mov edi, row
03269 mov esi, edi // esi = lp = row (left neighbour)
03270 add edi, bpp // edi = rp = row + bpp (byte being rebuilt)
03271 xor eax, eax
03272 // diff = ((rp + 0xf) & ~7) - rp, i.e. 8..15 bytes up to an 8-byte boundary
03273 mov diff, edi
03274 add diff, 0xf
03275
03276 xor ebx, ebx // ebx = byte index relative to lp / rp
03277 and diff, 0xfffffff8
03278 sub diff, edi
03279
03280 jz dsubgo // taken only if diff came out as 0
03281 // byte-at-a-time loop for indices 0 .. diff-1
03282 dsublp1:
03283 mov al, [esi+ebx]
03284 add [edi+ebx], al // rp[ebx] += lp[ebx]
03285 inc ebx
03286 cmp ebx, diff
03287 jb dsublp1
03288 dsubgo:
03289 mov ecx, FullLength
03290 mov edx, ecx
03291 sub edx, ebx
03292 and edx, 0x00000007 // edx = (FullLength - ebx) & 7: bytes left over after whole quadwords
03293 sub ecx, edx
03294 mov MMXLength, ecx // index at which the 8-byte MMX loops stop
03295 }
03296
03297 /* Pick the MMX loop that matches the pixel size in bytes. */
03298 switch ( bpp )
03299 {
03300 case 3:
03301 {
03302 ActiveMask.use = 0x0000ffffff000000; /* selects bytes 3..5 of a quadword */
03303 ShiftBpp.use = 24; /* bpp * 8 bits */
03304 ShiftRem.use = 40; /* 64 - 24 bits */
03305 _asm {
03306 mov edi, row
03307 movq mm7, ActiveMask
03308 mov esi, edi
03309 add edi, bpp
03310 movq mm6, mm7
03311 mov ebx, diff
03312 psllq mm6, ShiftBpp // mm6 = mask shifted left 24 bits: bytes 6..7
03313
03314 // prime mm1 with the 8 bytes that precede the first quadword
03315 movq mm1, [edi+ebx-8]
03316 dsub3lp:
03317 psrlq mm1, ShiftRem // bring bytes 5..7 of the previous quadword down to bytes 0..2
03318
03319
03320 movq mm0, [edi+ebx]
03321 paddb mm0, mm1 // bytes 0..2 += last pixel of previous quadword
03322
03323 movq mm1, mm0
03324 psllq mm1, ShiftBpp
03325 pand mm1, mm7
03326 paddb mm0, mm1 // bytes 3..5 += updated bytes 0..2
03327
03328 movq mm1, mm0
03329 psllq mm1, ShiftBpp
03330 pand mm1, mm6
03331 add ebx, 8
03332 paddb mm0, mm1 // bytes 6..7 += updated bytes 3..4
03333 cmp ebx, MMXLength
03334 movq [edi+ebx-8], mm0 // store result; -8 compensates for the add above
03335
03336 movq mm1, mm0 // this result is the "previous" quadword next time round
03337 jb dsub3lp // branches on the flags from the cmp above
03338 }
03339 }
03340 break;
03341
03342 case 1:
03343 {
03344
03345
03346
03347
03348
03349
03350
03351
03352
03353
03354
03355 /* One byte per pixel: plain byte loop from diff up to FullLength. */
03356 _asm {
03357 mov ebx, diff
03358 mov edi, row
03359 cmp ebx, FullLength
03360 jnb dsub1end // nothing left after the alignment loop
03361 mov esi, edi
03362 xor eax, eax
03363 add edi, bpp
03364 dsub1lp:
03365 mov al, [esi+ebx]
03366 add [edi+ebx], al
03367 inc ebx
03368 cmp ebx, FullLength
03369 jb dsub1lp
03370 dsub1end:
03371 }
03372 }
03373 return; /* skips the tail loop and emms below; no MMX instruction ran on this path */
03374
03375 case 6:
03376 case 7:
03377 case 4:
03378 case 5:
03379 {
03380 ShiftBpp.use = bpp << 3; /* pixel size in bits */
03381 ShiftRem.use = 64 - ShiftBpp.use;
03382 _asm {
03383 mov edi, row
03384 mov ebx, diff
03385 mov esi, edi
03386 add edi, bpp
03387 // prime mm1 with the 8 bytes that precede the first quadword
03388 movq mm1, [edi+ebx-8]
03389 dsub4lp:
03390 psrlq mm1, ShiftRem // keep the last bpp bytes of the previous quadword, moved to the low end
03391
03392 movq mm0, [edi+ebx]
03393 paddb mm0, mm1 // low bpp bytes += previous pixel
03394
03395 movq mm1, mm0
03396 psllq mm1, ShiftBpp
03397
03398
03399 add ebx, 8
03400 paddb mm0, mm1 // remaining bytes += the updated low bytes
03401 cmp ebx, MMXLength
03402 movq [edi+ebx-8], mm0
03403 movq mm1, mm0
03404 jb dsub4lp // branches on the flags from the cmp above
03405 }
03406 }
03407 break;
03408
03409 case 2:
03410 {
03411 ActiveMask.use = 0x00000000ffff0000; /* selects bytes 2..3 of a quadword */
03412 ShiftBpp.use = 16;
03413 ShiftRem.use = 48;
03414 _asm {
03415 movq mm7, ActiveMask
03416 mov ebx, diff
03417 movq mm6, mm7
03418 mov edi, row
03419 psllq mm6, ShiftBpp // mm6 = mask for bytes 4..5
03420
03421 mov esi, edi
03422 movq mm5, mm6
03423 add edi, bpp
03424 psllq mm5, ShiftBpp // mm5 = mask for bytes 6..7
03425
03426 // prime mm1 with the 8 bytes that precede the first quadword
03427 movq mm1, [edi+ebx-8]
03428 dsub2lp:
03429
03430 psrlq mm1, ShiftRem // bring bytes 6..7 of the previous quadword down to bytes 0..1
03431
03432
03433 movq mm0, [edi+ebx]
03434 paddb mm0, mm1 // bytes 0..1 += previous pixel
03435
03436 movq mm1, mm0
03437 psllq mm1, ShiftBpp
03438 pand mm1, mm7
03439 paddb mm0, mm1 // bytes 2..3 += updated bytes 0..1
03440
03441 movq mm1, mm0
03442 psllq mm1, ShiftBpp
03443 pand mm1, mm6
03444 paddb mm0, mm1 // bytes 4..5 += updated bytes 2..3
03445
03446 movq mm1, mm0
03447 psllq mm1, ShiftBpp
03448 pand mm1, mm5
03449 add ebx, 8
03450 paddb mm0, mm1 // bytes 6..7 += updated bytes 4..5
03451 cmp ebx, MMXLength
03452 movq [edi+ebx-8], mm0
03453 movq mm1, mm0
03454 jb dsub2lp // branches on the flags from the cmp above
03455 }
03456 }
03457 break;
03458 case 8:
03459 {
03460 _asm {
03461 mov edi, row
03462 mov ebx, diff
03463 mov esi, edi
03464 add edi, bpp
03465 mov ecx, MMXLength
03466 movq mm7, [edi+ebx-8] // prime mm7 with the previous 8-byte pixel
03467 // NOTE(review): ecx becomes the low six bits of MMXLength, and the 64-byte body below runs once before ebx is compared with it -- confirm this bound is what was intended
03468 and ecx, 0x0000003f
03469 dsub8lp:
03470 movq mm0, [edi+ebx]
03471 paddb mm0, mm7
03472 movq mm1, [edi+ebx+8]
03473 movq [edi+ebx], mm0
03474
03475 // unrolled: each of the eight quadwords has the previous result added to it
03476
03477
03478
03479 paddb mm1, mm0
03480 movq mm2, [edi+ebx+16]
03481 movq [edi+ebx+8], mm1
03482 paddb mm2, mm1
03483 movq mm3, [edi+ebx+24]
03484 movq [edi+ebx+16], mm2
03485 paddb mm3, mm2
03486 movq mm4, [edi+ebx+32]
03487 movq [edi+ebx+24], mm3
03488 paddb mm4, mm3
03489 movq mm5, [edi+ebx+40]
03490 movq [edi+ebx+32], mm4
03491 paddb mm5, mm4
03492 movq mm6, [edi+ebx+48]
03493 movq [edi+ebx+40], mm5
03494 paddb mm6, mm5
03495 movq mm7, [edi+ebx+56]
03496 movq [edi+ebx+48], mm6
03497 add ebx, 64
03498 paddb mm7, mm6
03499 cmp ebx, ecx
03500 movq [edi+ebx-8], mm7 // eighth quadword; -8 compensates for the add above
03501 jb dsub8lp
03502 cmp ebx, MMXLength
03503 jnb dsub8lt8
03504 dsub8lpA: // one quadword per pass until ebx reaches MMXLength
03505 movq mm0, [edi+ebx]
03506 add ebx, 8
03507 paddb mm0, mm7
03508 cmp ebx, MMXLength
03509 movq [edi+ebx-8], mm0
03510 movq mm7, mm0 // result becomes the previous pixel for the next pass
03511
03512 jb dsub8lpA
03513 dsub8lt8:
03514 }
03515 }
03516 break;
03517
03518 default: /* every bpp value without its own case above */
03519 {
03520 _asm {
03521 mov ebx, diff
03522 mov edi, row
03523 mov esi, edi
03524 add edi, bpp
03525 dsubAlp:
03526 movq mm0, [edi+ebx]
03527 movq mm1, [esi+ebx]
03528 add ebx, 8
03529 paddb mm0, mm1 // quadword at rp += quadword at lp
03530 cmp ebx, MMXLength
03531 movq [edi+ebx-8], mm0
03532
03533 jb dsubAlp // branches on the flags from the cmp above
03534 }
03535 }
03536 break;
03537
03538 }
03539 /* Tail: byte loop for indices MMXLength .. FullLength-1, then leave MMX state. */
03540 _asm {
03541 mov ebx, MMXLength
03542 mov edi, row
03543 cmp ebx, FullLength
03544 jnb dsubend // no leftover bytes
03545 mov esi, edi
03546 xor eax, eax
03547 add edi, bpp
03548 dsublp2:
03549 mov al, [esi+ebx]
03550 add [edi+ebx], al
03551 inc ebx
03552 cmp ebx, FullLength
03553 jb dsublp2
03554 dsubend:
03555 emms // executed on every path that reaches this block
03556 }
03557 }
03558
03559
/* Read-side "Up" filter for one row, performed in place with MSVC inline
 * assembly and MMX: every byte of row has the byte at the same index of
 * prev_row added to it (modulo 256).
 *
 * row_info - supplies rowbytes, the number of bytes to process
 * row      - the row bytes; modified in place
 * prev_row - the row above; only read
 *
 * Stages: byte loop until row is 8-byte aligned, 64-byte unrolled MMX loop,
 * 8-byte MMX loop, byte loop for the final 0..7 bytes.
 *
 * NOTE(review): the alignment loop is not bounded by len and the 64-byte
 * loop is bottom-tested, so rows shorter than the alignment prefix plus 64
 * bytes look like they would be processed past their end -- confirm the
 * callers only pass sufficiently long (or padded) rows.
 */
03560 void
03561 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
03562 png_bytep prev_row)
03563 {
03564 png_uint_32 len;
03565 len = row_info->rowbytes; /* number of bytes in the row */
03566 _asm {
03567 mov edi, row
03568 // ecx = ((row + 7) & ~7) - row: bytes (0..7) until row is 8-byte aligned
03569 mov ecx, edi
03570 xor ebx, ebx // ebx = byte index into both rows
03571 add ecx, 0x7
03572 xor eax, eax
03573 and ecx, 0xfffffff8
03574 mov esi, prev_row
03575 sub ecx, edi
03576 jz dupgo // already aligned
03577 // byte loop for indices 0 .. ecx-1
03578 duplp1:
03579 mov al, [edi+ebx]
03580 add al, [esi+ebx]
03581 inc ebx
03582 cmp ebx, ecx
03583 mov [edi + ebx-1], al // row[i] += prev_row[i]; -1 compensates for the inc above
03584 jb duplp1 // branches on the flags from the cmp above
03585 dupgo:
03586 mov ecx, len
03587 mov edx, ecx
03588 sub edx, ebx
03589 and edx, 0x0000003f // edx = (len - ebx) & 0x3f: bytes left over after whole 64-byte groups
03590 sub ecx, edx // ecx = index at which the 64-byte loop stops
03591
03592 // unrolled loop: eight quadwords per pass, loads and stores interleaved
03593 duploop:
03594 movq mm1, [esi+ebx]
03595 movq mm0, [edi+ebx]
03596 movq mm3, [esi+ebx+8]
03597 paddb mm0, mm1
03598 movq mm2, [edi+ebx+8]
03599 movq [edi+ebx], mm0
03600 paddb mm2, mm3
03601 movq mm5, [esi+ebx+16]
03602 movq [edi+ebx+8], mm2
03603 movq mm4, [edi+ebx+16]
03604 movq mm7, [esi+ebx+24]
03605 paddb mm4, mm5
03606 movq mm6, [edi+ebx+24]
03607 movq [edi+ebx+16], mm4
03608 paddb mm6, mm7
03609 movq mm1, [esi+ebx+32]
03610 movq [edi+ebx+24], mm6
03611 movq mm0, [edi+ebx+32]
03612 movq mm3, [esi+ebx+40]
03613 paddb mm0, mm1
03614 movq mm2, [edi+ebx+40]
03615 movq [edi+ebx+32], mm0
03616 paddb mm2, mm3
03617 movq mm5, [esi+ebx+48]
03618 movq [edi+ebx+40], mm2
03619 movq mm4, [edi+ebx+48]
03620 movq mm7, [esi+ebx+56]
03621 paddb mm4, mm5
03622 movq mm6, [edi+ebx+56]
03623 movq [edi+ebx+48], mm4
03624 add ebx, 64
03625 paddb mm6, mm7
03626 cmp ebx, ecx
03627 movq [edi+ebx-8], mm6 // eighth quadword; -8 compensates for the add above
03628
03629 jb duploop // branches on the flags from the cmp above
03630
03631 cmp edx, 0
03632 jz dupend // no leftover bytes at all
03633
03634
03635
03636
03637 cmp edx, 8
03638 jb duplt8 // fewer than 8 left: straight to the byte loop
03639
03640 // ecx = len - (leftover & 7): index at which the 8-byte loop stops; edx = final 0..7 bytes
03641 add ecx, edx
03642 and edx, 0x00000007
03643 sub ecx, edx
03644 jz duplt8
03645
03646 duplpA: // one quadword per pass
03647 movq mm1, [esi+ebx]
03648 movq mm0, [edi+ebx]
03649 add ebx, 8
03650 paddb mm0, mm1
03651 cmp ebx, ecx
03652 movq [edi+ebx-8], mm0
03653 jb duplpA
03654 cmp edx, 0
03655 jz dupend
03656 duplt8:
03657 xor eax, eax
03658 add ecx, edx // ecx = len again: end index for the byte loop
03659
03660 duplp2: // byte loop for the final bytes
03661 mov al, [edi + ebx]
03662 add al, [esi + ebx]
03663 inc ebx
03664 cmp ebx, ecx
03665 mov [edi + ebx-1], al
03666 jb duplp2
03667 dupend:
03668
03669 emms // leave MMX state on every path
03670 }
03671 }
03672
03673
03674
03675 void
03676 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
03677 row, png_bytep prev_row, int filter)
03678 {
03679 #ifdef PNG_DEBUG
03680 char filnm[10];
03681 #endif
03682
03683 if (mmx_supported == 2) {
03684 #if !defined(PNG_1_0_X)
03685
03686 png_warning(png_ptr, "asm_flags may not have been initialized");
03687 #endif
03688 png_mmx_support();
03689 }
03690
03691 #ifdef PNG_DEBUG
03692 png_debug(1, "in png_read_filter_row\n");
03693 switch (filter)
03694 {
03695 case 0: sprintf(filnm, "none");
03696 break;
03697 #if !defined(PNG_1_0_X)
03698 case 1: sprintf(filnm, "sub-%s",
03699 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
03700 break;
03701 case 2: sprintf(filnm, "up-%s",
03702 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
03703 break;
03704 case 3: sprintf(filnm, "avg-%s",
03705 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
03706 break;
03707 case 4: sprintf(filnm, "Paeth-%s",
03708 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
03709 break;
03710 #else
03711 case 1: sprintf(filnm, "sub");
03712 break;
03713 case 2: sprintf(filnm, "up");
03714 break;
03715 case 3: sprintf(filnm, "avg");
03716 break;
03717 case 4: sprintf(filnm, "Paeth");
03718 break;
03719 #endif
03720 default: sprintf(filnm, "unknw");
03721 break;
03722 }
03723 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
03724 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
03725 (int)((row_info->pixel_depth + 7) >> 3));
03726 png_debug1(0,"len=%8d, ", row_info->rowbytes);
03727 #endif
03728
03729 switch (filter)
03730 {
03731 case PNG_FILTER_VALUE_NONE:
03732 break;
03733
03734 case PNG_FILTER_VALUE_SUB:
03735 {
03736 #if !defined(PNG_1_0_X)
03737 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
03738 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
03739 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
03740 #else
03741 if (mmx_supported)
03742 #endif
03743 {
03744 png_read_filter_row_mmx_sub(row_info, row);
03745 }
03746 else
03747 {
03748 png_uint_32 i;
03749 png_uint_32 istop = row_info->rowbytes;
03750 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
03751 png_bytep rp = row + bpp;
03752 png_bytep lp = row;
03753
03754 for (i = bpp; i < istop; i++)
03755 {
03756 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
03757 rp++;
03758 }
03759 }
03760 break;
03761 }
03762
03763 case PNG_FILTER_VALUE_UP:
03764 {
03765 #if !defined(PNG_1_0_X)
03766 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
03767 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
03768 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
03769 #else
03770 if (mmx_supported)
03771 #endif
03772 {
03773 png_read_filter_row_mmx_up(row_info, row, prev_row);
03774 }
03775 else
03776 {
03777 png_uint_32 i;
03778 png_uint_32 istop = row_info->rowbytes;
03779 png_bytep rp = row;
03780 png_bytep pp = prev_row;
03781
03782 for (i = 0; i < istop; ++i)
03783 {
03784 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
03785 rp++;
03786 }
03787 }
03788 break;
03789 }
03790
03791 case PNG_FILTER_VALUE_AVG:
03792 {
03793 #if !defined(PNG_1_0_X)
03794 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
03795 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
03796 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
03797 #else
03798 if (mmx_supported)
03799 #endif
03800 {
03801 png_read_filter_row_mmx_avg(row_info, row, prev_row);
03802 }
03803 else
03804 {
03805 png_uint_32 i;
03806 png_bytep rp = row;
03807 png_bytep pp = prev_row;
03808 png_bytep lp = row;
03809 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
03810 png_uint_32 istop = row_info->rowbytes - bpp;
03811
03812 for (i = 0; i < bpp; i++)
03813 {
03814 *rp = (png_byte)(((int)(*rp) +
03815 ((int)(*pp++) >> 1)) & 0xff);
03816 rp++;
03817 }
03818
03819 for (i = 0; i < istop; i++)
03820 {
03821 *rp = (png_byte)(((int)(*rp) +
03822 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
03823 rp++;
03824 }
03825 }
03826 break;
03827 }
03828
03829 case PNG_FILTER_VALUE_PAETH:
03830 {
03831 #if !defined(PNG_1_0_X)
03832 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
03833 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
03834 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
03835 #else
03836 if (mmx_supported)
03837 #endif
03838 {
03839 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
03840 }
03841 else
03842 {
03843 png_uint_32 i;
03844 png_bytep rp = row;
03845 png_bytep pp = prev_row;
03846 png_bytep lp = row;
03847 png_bytep cp = prev_row;
03848 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
03849 png_uint_32 istop=row_info->rowbytes - bpp;
03850
03851 for (i = 0; i < bpp; i++)
03852 {
03853 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
03854 rp++;
03855 }
03856
03857 for (i = 0; i < istop; i++)
03858 {
03859 int a, b, c, pa, pb, pc, p;
03860
03861 a = *lp++;
03862 b = *pp++;
03863 c = *cp++;
03864
03865 p = b - c;
03866 pc = a - c;
03867
03868 #ifdef PNG_USE_ABS
03869 pa = abs(p);
03870 pb = abs(pc);
03871 pc = abs(p + pc);
03872 #else
03873 pa = p < 0 ? -p : p;
03874 pb = pc < 0 ? -pc : pc;
03875 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
03876 #endif
03877
03878
03879
03880
03881
03882
03883
03884
03885
03886
03887 p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
03888
03889 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
03890 rp++;
03891 }
03892 }
03893 break;
03894 }
03895
03896 default:
03897 png_warning(png_ptr, "Ignoring bad row filter type");
03898 *row=0;
03899 break;
03900 }
03901 }
03902
03903 #endif