
    #i(              	          U d Z ddlZddlZddlmZ ddlmZmZmZm	Z	  ej                  e      Zi ddddd	d
ddddddddddddddddddddddddd d!d"d#d$d%iZeeef   ed&<    ej                   d'j#                  d( eD                    Z ej                   d)ej&                        Z ej                   d*ej*                        Z ej                   d+      Zd,ed-eeeeef      fd.Zd,ed-efd/Zd,ed-efd0Zd,ed-efd1Zd,ed-efd2Zd,ed-efd3Zd,ed-efd4Zd:d,ed5e d-efd6Z!d;d,ed7ed8ed-efd9Z"y)<zTMarkdown parsing and manipulation utilities.

Part of claw-compactor. License: MIT.
    N)SequenceMatcher)ListTupleDictOptionalu   ，,u   。.u   ；;u   ：:u   ！!u   ？?u   “"u   ”u   ‘'u   ’u   （(u   ）)u   【[u   】]u   、u   …z...   ——--u   ～~_ZH_PUNCT_MAP|c              #   F   K   | ]  }t        j                  |        y wN)reescape).0ks     8/home/crogers2287/claw-compactor/scripts/lib/markdown.py	<genexpr>r       s     "GA299Q<"Gs   !uP   [😀-🙏🌀-🗿🚀-🛿🇠-🇿✂-➰🤀-🧿🨀-🩯🩰-🫿☀-⛿]+z^(#{1,6})\s+(.*)z^[\s|:\-]+$textreturnc                    | sg S g }| j                  d      }d}d}g }|D ]  }t        j                  |      }|rsdj                  |      j	                         }|s|r|j                  |||f       |j                  d      j	                         }t        |j                  d            }g }|j                  |        dj                  |      j	                         }|s|r|j                  |||f       |S )zParse *text* into sections delimited by markdown headers.

    Returns a list of (header, body, level) tuples.
    A preamble (text before the first header) is returned with header=''.
    
 r         )split
_HEADER_REmatchjoinstripappendgrouplen)	r!   sectionslinescurrent_headercurrent_levelcurrent_body_lineslinembodys	            r   parse_sectionsr8   +   s     	+-HJJtENM$& ,T"99/0668D} EFWWQZ--/N
OM!#%%d+, 99'(..0D}=>O    c                     | syt        j                  dd|       } | j                  d      D cg c]  }|j                          }}dj	                  |      j                         S c c}w )z5Remove excessive blank lines and trailing whitespace.r%   z\n{3,}z

r$   )r   subr(   rstripr+   r,   )r!   r5   r1   s      r   strip_markdown_redundancyr=   O   sY    66)VT*D'+zz$'78tT[[]8E899U!!## 9s   A&c                    | syt               }g }| j                  d      D ]M  }|j                         }|s|j                  |       '||v r,|j	                  |       |j                  |       O dj                  |      S )z9Remove exact duplicate non-blank lines, preserving order.r%   r$   )setr(   r,   r-   addr+   )r!   seenresultr5   strippeds        r   remove_duplicate_linesrD   Z   s}    5DF

4  	::<MM$td	 99Vr9   c                 Z    | sy| j                  dd      } t        j                  d |       S )z=Replace Chinese fullwidth punctuation with ASCII equivalents.r%   r   r   c                 f    t         j                  | j                         | j                               S r   )r   getr.   )r6   s    r   <lambda>z/normalize_chinese_punctuation.<locals>.<lambda>s   s    m&7&7	1779&M r9   )replace_ZH_PUNCT_REr;   )r!   s    r   normalize_chinese_punctuationrK   m   s,    <<-DMtTTr9   c                 f    | syt         j                  d|       }t        j                  dd|      }|S )z$Remove emoji characters from *text*.r%   z  + )	_EMOJI_REr;   r   )r!   rB   s     r   strip_emojirO   v   s/    ]]2t$FVVFC(FMr9   c                 $   | syt        |       }|s| S dgt        |      z  }t        |      D ]?  \  }\  }}}|dkD  st        |dz
  dd      D ]  }||   \  }}}	|	dkD  s|	|k  sd||<    ? A g }
t        |      D ]t  \  }\  }}}|s|s|r|j	                         s||   s'|r|
j                  d|z  dz   |z          |j	                         r|
j                  |       |
j                  d       v d	j                  |
      j	                         S )
z>Remove markdown sections that have no meaningful body content.r%   Fr   r'   T#rM   r$   )r8   r/   	enumerateranger,   r-   r+   )r!   r0   	has_childidxheaderr7   levelpidx_plevelresult_liness              r   remove_empty_sectionsr]      s8   d#H #h-'I&/&9 ""fdE19cAgr2. '~1fA:&5.&*IdO	 !L&/&9 	 ""fdEd$**,y~ec 1F :;::<%B	  99\"((**r9   c                    | sy| j                  d      }g }d}|t        |      k  r||   }d|v rd|dz   t        |      k  rRt        j                  ||dz      j	                               r(|j	                         j	                  d      j                  d      D cg c]  }|j	                          }}|dz  }g }|t        |      k  rd||   v r||   j	                         r||   j	                         j	                  d      j                  d      D cg c]  }|j	                          }}|j                  |       |dz  }|t        |      k  rd||   v r||   j	                         rt        |      dk\  r.|D ](  }	|j                  dd	j                  |	      z   d
z          * nt        |      dk(  rN|D ]H  }	t        |	      dkD  r|	d   nd}
t        |	      dkD  r|	d   nd}|
s|s2|j                  d|
 d|        J n|D ]  }	g }t        |	      D ]U  \  }}|dk(  r|j                  |       |t        |      k  r|j                  ||    d|        E|j                  |       W |j                  dj                  |              n|j                  |       |dz  }|t        |      k  rdj                  |      S c c}w c c}w )zConvert markdown tables to compact key:value notation.

    A 2-column table becomes ``Key: Value`` lines.
    Multi-column tables become ``Col1 | Col2 | ...`` lines (no header row / separator).
    r%   r$   r   r   r'   r&      z| z | z |- z: =, )r(   r/   _TABLE_SEP_REr*   r,   r-   r+   rS   )r!   r1   rB   ir5   cheadersrowscellsrowr   vpartscicells                  r   compress_markdown_tablern      s    JJtEF	A
c%j.Qx$;1q53u:--2E2EeAPQElFXFXFZ2[*.**,*<*<S*A*G*G*LMQqwwyMGMFA$&Dc%j.SE!H_q9I,1!HNN,<,B,B3,G,M,Mc,RSqSSE"Q c%j.SE!H_q9I
 7|q  ACMM$C"84"?@AW" 5C"%c(Q,ABA"%c(Q,ABAA1#Rsm4	5   	4CE$-cN /D7!LL.#g,.!LLGBK=$)@A!LL./ MM$))E"23	4 MM$FAM c%j.P 99VG N Ts   KK	thresholdc                 j   | sy| j                  d      }t        j                  d      }g g fd}|D ]`  }|j                  |      }|r3j	                  |j                  d      |j                  d      |f       I |        j	                  |       b  |        dj                        S )zMerge bullet lines with high similarity.

    Uses SequenceMatcher ratio. When two bullets exceed *threshold*,
    keep the longer one.
    r%   r$   ^(\s*[-*+]\s+)(.*)c                     	sy t        	      } dgt        |       z  }t        t        |             D ]  }||   r	t        |dz   t        |             D ]d  }||   r	t        d | |   d   | |   d         j	                         }|k\  s6t        | |   d         t        | |   d         kD  rd||<    d||<   f  t        |       D ]   \  }\  }}}||   r
j                  |       " 	j                          y )NFr'   T)listr/   rT   r   ratiorS   r-   clear)kept
merged_outrd   jrt   rV   prefixcontent	full_linebulletsrB   ro   s            r   flush_bulletsz,merge_similar_bullets.<locals>.flush_bullets   s   G}"'3t9!4
s4y! 	-A!}1q5#d), 
-a='d1gaj$q'!*EKKMI%471:T!WQZ8(,
1(,
1
-	- 2;4 	)-C-&'9c?i(	) 	r9   r'   r&   )r(   r   compiler*   r-   r.   r+   )	r!   ro   r1   	bullet_rer}   r5   r6   r|   rB   s	    `     @@r   merge_similar_bulletsr      s     JJtE

01IF*,G0   OOD!NNAGGAJ
D9:OMM$  O99Vr9   	max_words	max_mergec                 "  	
 | syt        j                  d      }| j                  d      }g 
g d		
fd}|D ]  }|j                  |      }|r|j	                  d      j                         }|j	                  d      	t        |j                               |k  r(j                  |       t              |k\  s |         |        
j                  |        |        
j                  |         |        dj                  
      S )zCombine consecutive short bullet points into comma-separated form.

    Bullets with <= *max_words* words are candidates. Up to *max_merge*
    consecutive short bullets are joined into one line.
    r%   rq   r$   r`   c                      sy t              dk  rD ]  } j                  | z           n#j                  dj                        z          j                          y )Nr&   rb   )r/   r-   r+   ru   )sbry   rB   short_bulletss    r   flush_shortz(merge_short_bullets.<locals>.flush_short  s]    }"# +frk*+ MM&499]#;;<r9   r&   r'   )	r   r~   r(   r*   r.   r,   r/   r-   r+   )r!   r   r   r   r1   r   r5   r6   rz   ry   rB   r   s            @@@r   merge_short_bulletsr     s     

01IJJtEF!MF
   OOD!ggaj&&(GWWQZF7==?#y0$$W-}%2Md#MMM$   M99Vr9   )g?)   
   )#__doc__r   loggingdifflibr   typingr   r   r   r   	getLogger__name__loggerr   str__annotations__r~   r+   rJ   UNICODErN   	MULTILINEr)   rc   intr8   r=   rD   rK   rO   r]   rn   floatr   r    r9   r   <module>r      sL  
 
  # . .			8	$!c!S!"*C!193!@H#!OWY\!c!S!"*C!193! c! S! #+C! 2:3! c	! U	! %3D	! ;CC	!tCH~  rzz#(("G"GGH BJJ		 **	 RZZ+R\\:
 

>*! !eCcM&:!; !H$C $C $  &U U Uc c + + +D5# 5# 5p/ / / /d,c ,c ,# ,s ,r9   