; match.asm -- Pentium-Pro optimized version of longest_match() ; ; Updated for zlib 1.1.3 and converted to MASM 6.1x ; Copyright (C) 2000 Dan Higdon ; and Chuck Walbourn ; Corrections by Cosmin Truta ; ; This is free software; you can redistribute it and/or modify it ; under the terms of the GNU General Public License. ; Based on match.S ; Written for zlib 1.1.2 ; Copyright (C) 1998 Brian Raiter ; ; Modified by Gilles Vollant (2005) for add gzhead and gzindex .686P .MODEL FLAT ;=========================================================================== ; EQUATES ;=========================================================================== MAX_MATCH EQU 258 MIN_MATCH EQU 3 MIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1) MAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7)) ;=========================================================================== ; STRUCTURES ;=========================================================================== ; This STRUCT assumes a 4-byte alignment DEFLATE_STATE STRUCT ds_strm dd ? ds_status dd ? ds_pending_buf dd ? ds_pending_buf_size dd ? ds_pending_out dd ? ds_pending dd ? ds_wrap dd ? ; gzhead and gzindex are added in zlib 1.2.2.2 (see deflate.h) ds_gzhead dd ? ds_gzindex dd ? ds_data_type db ? ds_method db ? db ? ; padding db ? ; padding ds_last_flush dd ? ds_w_size dd ? ; used ds_w_bits dd ? ds_w_mask dd ? ; used ds_window dd ? ; used ds_window_size dd ? ds_prev dd ? ; used ds_head dd ? ds_ins_h dd ? ds_hash_size dd ? ds_hash_bits dd ? ds_hash_mask dd ? ds_hash_shift dd ? ds_block_start dd ? ds_match_length dd ? ; used ds_prev_match dd ? ; used ds_match_available dd ? ds_strstart dd ? ; used ds_match_start dd ? ; used ds_lookahead dd ? ; used ds_prev_length dd ? ; used ds_max_chain_length dd ? ; used ds_max_laxy_match dd ? ds_level dd ? ds_strategy dd ? ds_good_match dd ? ; used ds_nice_match dd ? ; used ; Don't need anymore of the struct for match DEFLATE_STATE ENDS ;=========================================================================== ; CODE ;=========================================================================== _TEXT SEGMENT ;--------------------------------------------------------------------------- ; match_init ;--------------------------------------------------------------------------- ALIGN 4 PUBLIC _match_init _match_init PROC ; no initialization needed ret _match_init ENDP ;--------------------------------------------------------------------------- ; uInt longest_match(deflate_state *deflatestate, IPos curmatch) ;--------------------------------------------------------------------------- ALIGN 4 PUBLIC _longest_match _longest_match PROC ; Since this code uses EBP for a scratch register, the stack frame must ; be manually constructed and referenced relative to the ESP register. ; Stack image ; Variables chainlenwmask = 0 ; high word: current chain len ; low word: s->wmask window = 4 ; local copy of s->window windowbestlen = 8 ; s->window + bestlen scanend = 12 ; last two bytes of string scanstart = 16 ; first two bytes of string scanalign = 20 ; dword-misalignment of string nicematch = 24 ; a good enough match size bestlen = 28 ; size of best match so far scan = 32 ; ptr to string wanting match varsize = 36 ; number of bytes (also offset to last saved register) ; Saved Registers (actually pushed into place) ebx_save = 36 edi_save = 40 esi_save = 44 ebp_save = 48 ; Parameters retaddr = 52 deflatestate = 56 curmatch = 60 ; Save registers that the compiler may be using push ebp push edi push esi push ebx ; Allocate local variable space sub esp,varsize ; Retrieve the function arguments. ecx will hold cur_match ; throughout the entire function. edx will hold the pointer to the ; deflate_state structure during the function's setup (before ; entering the main loop). mov edx, [esp+deflatestate] ASSUME edx:PTR DEFLATE_STATE mov ecx, [esp+curmatch] ; uInt wmask = s->w_mask; ; unsigned chain_length = s->max_chain_length; ; if (s->prev_length >= s->good_match) { ; chain_length >>= 2; ; } mov eax, [edx].ds_prev_length mov ebx, [edx].ds_good_match cmp eax, ebx mov eax, [edx].ds_w_mask mov ebx, [edx].ds_max_chain_length jl SHORT LastMatchGood shr ebx, 2 LastMatchGood: ; chainlen is decremented once beforehand so that the function can ; use the sign flag instead of the zero flag for the exit test. ; It is then shifted into the high word, to make room for the wmask ; value, which it will always accompany. dec ebx shl ebx, 16 or ebx, eax mov [esp+chainlenwmask], ebx ; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; mov eax, [edx].ds_nice_match mov ebx, [edx].ds_lookahead cmp ebx, eax jl SHORT LookaheadLess mov ebx, eax LookaheadLess: mov [esp+nicematch], ebx ;/* register Bytef *scan = s->window + s->strstart; */ mov esi, [edx].ds_window mov [esp+window], esi mov ebp, [edx].ds_strstart lea edi, [esi+ebp] mov [esp+scan],edi ;/* Determine how many bytes the scan ptr is off from being */ ;/* dword-aligned. */ mov eax, edi neg eax and eax, 3 mov [esp+scanalign], eax ;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ ;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ mov eax, [edx].ds_w_size sub eax, MIN_LOOKAHEAD sub ebp, eax jg SHORT LimitPositive xor ebp, ebp LimitPositive: ;/* int best_len = s->prev_length; */ mov eax, [edx].ds_prev_length mov [esp+bestlen], eax ;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ add esi, eax mov [esp+windowbestlen], esi ;/* register ush scan_start = *(ushf*)scan; */ ;/* register ush scan_end = *(ushf*)(scan+best_len-1); */ ;/* Posf *prev = s->prev; */ movzx ebx, WORD PTR[edi] mov [esp+scanstart], ebx movzx ebx, WORD PTR[eax+edi-1] mov [esp+scanend], ebx mov edi, [edx].ds_prev ;/* Jump into the main loop. */ mov edx, [esp+chainlenwmask] jmp SHORT LoopEntry ;/* do { ; * match = s->window + cur_match; ; * if (*(ushf*)(match+best_len-1) != scan_end || ; * *(ushf*)match != scan_start) continue; ; * [...] ; * } while ((cur_match = prev[cur_match & wmask]) > limit ; * && --chain_length != 0); ; * ; * Here is the inner loop of the function. The function will spend the ; * majority of its time in this loop, and majority of that time will ; * be spent in the first ten instructions. ; * ; * Within this loop: ; * %ebx = scanend ; * %ecx = curmatch ; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) ; * %esi = windowbestlen - i.e., (window + bestlen) ; * %edi = prev ; * %ebp = limit ; */ ALIGN 4 LookupLoop: and ecx, edx movzx ecx, WORD PTR[edi+ecx*2] cmp ecx, ebp jbe LeaveNow sub edx, 000010000H js LeaveNow LoopEntry: movzx eax, WORD PTR[esi+ecx-1] cmp eax, ebx jnz SHORT LookupLoop mov eax, [esp+window] movzx eax, WORD PTR[eax+ecx] cmp eax, [esp+scanstart] jnz SHORT LookupLoop ;/* Store the current value of chainlen. */ mov [esp+chainlenwmask], edx ;/* Point %edi to the string under scrutiny, and %esi to the string we */ ;/* are hoping to match it up with. In actuality, %esi and %edi are */ ;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ ;/* initialized to -(MAX_MATCH_8 - scanalign). */ mov esi, [esp+window] mov edi, [esp+scan] add esi, ecx mov eax, [esp+scanalign] mov edx, -MAX_MATCH_8 lea edi, [edi+eax+MAX_MATCH_8] lea esi, [esi+eax+MAX_MATCH_8] ;/* Test the strings for equality, 8 bytes at a time. At the end, ; * adjust %edx so that it is offset to the exact byte that mismatched. ; * ; * We already know at this point that the first three bytes of the ; * strings match each other, and they can be safely passed over before ; * starting the compare loop. So what this code does is skip over 0-3 ; * bytes, as much as necessary in order to dword-align the %edi ; * pointer. (%esi will still be misaligned three times out of four.) ; * ; * It should be confessed that this loop usually does not represent ; * much of the total running time. Replacing it with a more ; * straightforward "rep cmpsb" would not drastically degrade ; * performance. ; */ LoopCmps: mov eax, DWORD PTR[esi+edx] xor eax, DWORD PTR[edi+edx] jnz SHORT LeaveLoopCmps mov eax, DWORD PTR[esi+edx+4] xor eax, DWORD PTR[edi+edx+4] jnz SHORT LeaveLoopCmps4 add edx, 8 jnz SHORT LoopCmps jmp LenMaximum ALIGN 4 LeaveLoopCmps4: add edx, 4 LeaveLoopCmps: test eax, 00000FFFFH jnz SHORT LenLower add edx, 2 shr eax, 16 LenLower: sub al, 1 adc edx, 0 ;/* Calculate the length of the match. If it is longer than MAX_MATCH, */ ;/* then automatically accept it as the best possible match and leave. */ lea eax, [edi+edx] mov edi, [esp+scan] sub eax, edi cmp eax, MAX_MATCH jge SHORT LenMaximum ;/* If the length of the match is not longer than the best match we */ ;/* have so far, then forget it and return to the lookup loop. */ mov edx, [esp+deflatestate] mov ebx, [esp+bestlen] cmp eax, ebx jg SHORT LongerMatch mov esi, [esp+windowbestlen] mov edi, [edx].ds_prev mov ebx, [esp+scanend] mov edx, [esp+chainlenwmask] jmp LookupLoop ALIGN 4 ;/* s->match_start = cur_match; */ ;/* best_len = len; */ ;/* if (len >= nice_match) break; */ ;/* scan_end = *(ushf*)(scan+best_len-1); */ LongerMatch: mov ebx, [esp+nicematch] mov [esp+bestlen], eax mov [edx].ds_match_start, ecx cmp eax, ebx jge SHORT LeaveNow mov esi, [esp+window] add esi, eax mov [esp+windowbestlen], esi movzx ebx, WORD PTR[edi+eax-1] mov edi, [edx].ds_prev mov [esp+scanend], ebx mov edx, [esp+chainlenwmask] jmp LookupLoop ALIGN 4 ;/* Accept the current string, with the maximum possible length. */ LenMaximum: mov edx, [esp+deflatestate] mov DWORD PTR[esp+bestlen], MAX_MATCH mov [edx].ds_match_start, ecx ;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ ;/* return s->lookahead; */ LeaveNow: mov edx, [esp+deflatestate] mov ebx, [esp+bestlen] mov eax, [edx].ds_lookahead cmp ebx, eax jg SHORT LookaheadRet mov eax, ebx LookaheadRet: ; Restore the stack and return from whence we came. add esp, varsize pop ebx pop esi pop edi pop ebp ret _longest_match ENDP _TEXT ENDS END