
    #i                     8   d Z ddlZddlZddlmZ  ej
                  e      ZdZdZ		 ddl
Z
 e
j                  d      ZdZ	ej                  d       d	Zd
Z ej"                  d      ZdedefdZdedefdZdefdZy# eef$ r ej                  d       Y Gw xY w)a   Token estimation utilities.

Uses tiktoken when available, falls back to a CJK-aware heuristic.

For the heuristic:
- ASCII/Latin text: ~4 chars per token
- CJK characters: ~1.5 chars per token (tiktoken cl100k_base)

Part of claw-compactor. License: MIT.
    N)OptionalFzgpt-4Tz.tiktoken available, using cl100k_base encodingz/tiktoken unavailable, using CJK-aware heuristic   g      ?z6[\u4e00-\u9fff\u3400-\u4dbf\u3000-\u303f\uff00-\uffef]textreturnc                     | syt        t        j                  |             }t        |       |z
  }|t        z  }|t        z  }t        dt        ||z               S )z{Estimate tokens using CJK-aware heuristic.

    CJK characters are counted at ~1.5 chars/token, everything else at ~4.
    r      )len_CJK_REfindallCJK_CHARS_PER_TOKENCHARS_PER_TOKENmaxint)r   	cjk_charsother_chars
cjk_tokensother_tokenss        6/home/crogers2287/claw-compactor/scripts/lib/tokens.py_heuristic_tokensr   $   sW    
 GOOD)*Id)i'K00J0Lq#j</011    c                     | t        d      | syt        r$t        t        t        j	                  |             S t        |       S )zEstimate the number of tokens in *text*.

    Uses tiktoken (cl100k_base) when available, otherwise a CJK-aware
    heuristic.  Returns 0 for empty strings.
    Raises TypeError if *text* is None.
    z-estimate_tokens() requires a string, got Noner   )	TypeError_tiktoken_available_encoderr	   encoder   )r   s    r   estimate_tokensr   2   sB     |GHHx38??4())T""r   c                      t         S )z5Return True if tiktoken is being used for estimation.)r    r   r   using_tiktokenr   B   s    r   )__doc__reloggingtypingr   	getLogger__name__loggerr   r   tiktokenencoding_for_modeldebugImportError	Exceptionr   r   compiler
   strr   r   r   boolr   r   r   r   <module>r/      s   	 
  			8	$ D*x**73H
LLAB   "**N
O2C 2C 2## ## #  Q 	Y D
LLBCDs   )A> >BB