
    #i&                     h   d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZ  ej                  e      Z ej"                  d      Z ej"                  d      ZdZdZd	Z ej"                  d
      Z ej"                  d      Zdede
e   fdZd"dedededefdZde
e   de	eef   fdZde
e   de	eef   fdZeefde
e   dedede	eef   fdZde	eef   de	eef   fdZ dZ!dede	eef   defdZ"dede	eef   defdZ#de	eef   deddfdZ$dede	eef   fd Z%	 d#de	ee&f   fd!Z'y)$a  Dictionary-based compression using auto-learned codebooks.

Scans workspace memory files, learns high-frequency n-grams, builds a
codebook mapping long phrases to short `$XX` codes, and applies/reverses
substitutions for lossless compression.

Part of claw-compactor. License: MIT.
    N)Counter)Path)DictListOptionalSetTuplez\$[A-Z]{2,3}         z(\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\bz(/[A-Za-z0-9_.~-]+){3,}nreturnc           
         g }t        d      D ]U  }t        d      D ]E  }|j                  dt        d|z         z   t        d|z         z          t        |      | k\  sA|c c S  W t        d      D ]v  }t        d      D ]f  }t        d      D ]V  }|j                  dt        d|z         z   t        d|z         z   t        d|z         z          t        |      | k\  sP|c c c S  h x |S )zAGenerate *n* unique short codes: $AA..$ZZ, then $AAA.. if needed.   $A   )rangeappendchrlen)r   codesijks        :/home/crogers2287/claw-compactor/scripts/lib/dictionary.py_generate_codesr   %   s    E2Y r 	ALLs26{*Sa[895zQ	 2Y !r 	!A2Y !S3rAv;.R!V<s26{JKu:? L!	!! L    textmin_nmax_nc                    t               }| s|S | j                         }t        ||dz         D ]X  }t        t        |      |z
  dz         D ]9  }dj	                  ||||z          }t        |      t
        k\  s-||xx   dz  cc<   ; Z |S )z>Extract word n-grams from *text*, filtering by minimum length.    )r   splitr   r   joinMIN_PHRASE_LEN)r   r   r    counterwordsr   r   grams           r   _tokenize_ngramsr*   8   s    yGJJLE5%!)$ #s5zA~)* 	#A88E!AEN+D4yN*"	##
 Nr   textsc                 "   t               }| D ]Q  }t        j                  |      D ]7  }|j                  d      }dj	                  |dd       dz   }||xx   dz  cc<   9 S |j                         D ci c]  \  }}|dk\  s|| c}}S c c}}w )z?Find frequently occurring IP prefixes (3-octet) across *texts*..Nr
   r"      )r   _IP_REfindallr$   r%   items)r+   r'   r   ippartsprefixcounts          r   _extract_ip_prefixesr6   F   s    yG !..& 	!BHHSMEXXeBQi(3.FFOq O	!!
 07}}Mmfe%1*FEMMMMs   4BBc                    g }| D ];  }t         j                  |      D ]!  }|j                  |j                                # = t	        |      dk  ri S t               }|D ]N  }|j                  d      }t        dt	        |            D ]#  }dj                  |d|       }||xx   dz  cc<   % P |j                         D 	ci c]  \  }}	|	dk\  s||	 c}	}S c c}	}w )zNFind frequently occurring path prefixes (directory components) across *texts*.r.   /r
   Nr"   )
_PATH_REfinditerr   groupr   r   r$   r   r%   r1   )
r+   	all_pathsr   mr'   pathr3   depthr4   r5   s
             r   _extract_path_prefixesr@   Q   s    I (""4( 	(AQWWY'	(( 9~	 yG !

31c%j) 	!EXXeFUm,FFOq O	!! 07}}Mmfe%1*FEMMMMs   CCmin_freqmax_entriesc                 b   | si S t               }| D ]  }|j                  t        |              t        |       }|j	                         D ]7  \  }}t        |      t        k\  st        |j                  |d      |      ||<   9 t        |       }|j	                         D ]7  \  }	}t        |	      t        k\  st        |j                  |	d      |      ||	<   9 |j	                         D 
cg c]   \  }
}||k\  rt        |
      t        k\  r|
|f" }}
}|j                  d d       t        t        t        |      |            }i }t               }t        ||      D ]G  \  \  }
}}d}|D ]  }|
|v s||
v sd} n |r!|
||<   |j                  |
       t        |      |k\  sF |S  |S c c}}
w )zBuild a codebook from a list of text documents.

    Scans for high-frequency n-grams, IPs, and paths. Returns a dict
    mapping short codes ($XX) to the phrases they replace.
    r   c                 *    | d   t        | d         z  S )Nr"   r   r   xs    r   <lambda>z build_codebook.<locals>.<lambda>   s    !A$QqT"2 r   T)keyreverseF)r   updater*   r6   r1   r   r&   maxgetr@   sortr   minsetzipadd)r+   rA   rB   combinedr   ip_freqsr2   r5   
path_freqsr>   phrase
candidatesr   codebookused_phrases_countcodeskipexistings                      r   build_codebookr^   g   s    	 yH 0(./0 $E*H^^% ;	Er7n$x||B2E:HRL; (.J!'') ?et9& dA!6>HTN? &^^-FEHV!> 
J 
 OO2DOA CJ=>E!H UL"%j%"8 $$ 	H!X%7	  x=K'O O5s   -%F+rX   c                     | si S t        t        |             }|j                  d      r| S | j                         D ci c]  \  }}||
 c}}S c c}}w )zNormalize codebook to {code: phrase} format.
    
    Accepts either {code: phrase} or {phrase: code} format.
    Detects format by checking if keys start with '$'.
    r   )nextiter
startswithr1   )rX   	first_keyrV   r[   s       r   _normalize_codebookrd      sR     	T(^$IC  2:1ABfBBBs   Az DLR c                     | r|s| S t        |      }| j                  dt              }t        |j	                         d       D ]-  \  }}|j                  dt              }|j                  ||      }/ |S )zApply codebook substitutions to *text*. Lossless.
    
    Accepts codebook in either {code: phrase} or {phrase: code} format.
    Pre-existing '$' characters are escaped so they survive roundtrip.
    r   c                      t        | d          S )Nr"   rE   rF   s    r   rH   zcompress_text.<locals>.<lambda>       #ad) r   rI   )rd   replace_DOLLAR_ESCAPEsortedr1   )r   rX   
normalizedresultr[   rV   escaped_phrases          r   compress_textro      ss     x$X.J\\#~.Fz//17KL 6f^<56 Mr   c                     | r|s| S t        |      }| }t        |j                         d       D ]  \  }}|j                  ||      } |j                  t        d      }|S )z{Reverse codebook substitutions. Lossless.
    
    Accepts codebook in either {code: phrase} or {phrase: code} format.
    c                      t        | d          S )Nr   rE   rF   s    r   rH   z!decompress_text.<locals>.<lambda>   rg   r   rh   r   )rd   rk   r1   ri   rj   )r   rX   rl   rm   r[   rV   s         r   decompress_textrr      sh    
 x$X.JFz//17KL .ff-. ^^NC0FMr   r>   c                     t        |      }|j                  j                  dd       d| d}|j                  t	        j
                  |dd      d	       y
)zSave codebook to a JSON file.T)parentsexist_okr"   )versionentriesr.   F)indentensure_asciiutf-8encodingN)r   parentmkdir
write_textjsondumps)rX   r>   datas      r   save_codebookr      sI    :DKKdT2X.DOODJJtAEBWOUr   c                     t        |       } | j                         st        d|        t        j                  | j                  d            }t        |t              rd|vrt        d|        |d   S )zLoad codebook from a JSON file.zCodebook not found: rz   r{   rw   zInvalid codebook format: )	r   existsFileNotFoundErrorr   loads	read_text
isinstancedict
ValueError)r>   r   s     r   load_codebookr      sn    :D;;="6tf =>>::dnngn67DdD!Yd%:4TF;<<	?r   c                 *   || }|t        |      }t              }nnt        | t              rVt        |t              rF|}dj                  | j	                               }|}t        ||      t        |      }t              }nddddddS |r||z
  |z  dz  nd}t        |      }t        fd|D              }	t        d |j                         D              }
||z
  |
z
  }|r||z  dz  nd}||t        |d      t        |d      t        |      |	d	S )
u   Calculate compression statistics.
    
    Can be called as:
      compression_stats(texts_dict, codebook) — where texts_dict maps filenames to content
      compression_stats(original_str, compressed_str, codebook)
    
r   g        )original_charscompressed_charsgross_reduction_pctcodebook_entries
codes_usedd   c              3   ,   K   | ]  }|v sd   yw)r"   N ).0r[   
compresseds     r   	<genexpr>z$compression_stats.<locals>.<genexpr>  s     D41CQDs   	c              3   V   K   | ]!  \  }}t        |      t        |      z   d z    # yw)r.   NrE   )r   r   vs      r   r   z$compression_stats.<locals>.<genexpr>  s%     ODAqCFSVOa/Os   ')r.   )r   r   r   net_reduction_pctr   r   )
r   r   r   r%   valuesro   rd   sumr1   round)texts_or_originalcodebook_or_compressedrX   originalorig_lencomp_lenall_text	reductionrl   r   codebook_overhead	net_savednet_reductionr   s                @r   compression_statsr      s:    $+
x=z?	%t	,<RTX1Y)99.5578"8X6
x=z?"#SV$%Q8 	8 =E(X%1C7#I %X.JD:DDJ OJ<L<L<NOO8#&77I4<Y)C/#M #$$Y2"=!4M  r   )r.      )NN)(__doc__r   loggingrecollectionsr   pathlibr   typingr   r   r   r   r	   	getLogger__name__loggercompile_CODE_RE_RESERVED_REMIN_FREQr&   MAX_CODEBOOKr/   r9   intstrr   r*   r6   r@   r^   rd   rj   ro   rr   r   r   objectr   r   r   r   <module>r      s     	   3 3			8	$ 2::o&rzz/*  
?	@2::01s tCy &3 s s 7 NS	 Nd38n NN$s) NS#X N0 #8988 8 
#s(^	8vC$sCx. CT#s(^ C"  tCH~ # $# c3h C "VDcN V$ V4 V c3h  >B-	#v+-r   