
    #i6                        d Z ddlZddlZddlmZmZmZmZmZ  ej                  e
      ZdZdZefdededee   fd	Zd
ee   dee   defdZeefdee   dededeeeef      fdZdee   deeeef      dee   fdZy)a  Deduplication engine using shingle hashing.

Uses n-gram (shingle) fingerprinting for efficient near-duplicate detection
without O(n^2) pairwise comparison. Groups entries by section, then compares
shingle sets using Jaccard similarity.

Part of claw-compactor. License: MIT.
    N)ListDictAnySetTuple   g333333?textkreturnc                 H   | j                         }|st        d      hS t        |      |k  rt        dj                  |            hS t	               }t        t        |      |z
  dz         D ]3  }dj                  ||||z          }|j                  t        |             5 |S )zGenerate a set of k-word shingle hashes from *text*.

    Each shingle is a hash of *k* consecutive words.
    Returns a set of integer hashes.
         )splithashlenjoinsetrangeadd)r	   r
   wordsresultishingles         5/home/crogers2287/claw-compactor/scripts/lib/dedup.py	_shinglesr      s     JJLERz
5zA~SXXe_%&&uF3u:>A%& "((51q5>*

4=!" M    abc                 `    | s|sy| r|syt        | |z        }t        | |z        }|r||z  S dS )zCompute Jaccard similarity between two shingle sets.

    Returns 1.0 for identical sets, 0.0 for disjoint.
    If both are empty, returns 1.0.
    g      ?        r   )r   r   intersectionunions       r   jaccardr%   '   s@     QAq1u:LAJE#(<%1c1r   entries	thresholdc                    t        |       dk  rg S | D cg c]  }t        ||       }}t               }g }t        t        |             D ]  }||v r|g}d}	d}
t        |dz   t        |             D ]:  }||v rt	        ||   ||         }||k\  s |j                  |       |	|z  }	|
dz  }
< t        |      dkD  ss|
r|	|
z  n|}|j                  |t        |d      d       |j                  |        |S c c}w )a  Find near-duplicate groups among *entries*.

    Returns a list of dicts, each with:
        - indices: list of indices that are near-duplicates
        - similarity: average Jaccard similarity within the group

    Uses O(n^2) pairwise comparison with shingle hashing.
       r!   r   r      )indices
similarity)r   r   r   r   r%   appendroundupdate)r&   r'   r
   eshingle_setsusedgroupsr   group_indices	total_simcountjsimavg_sims                 r   find_duplicatesr:   6   s'    7|a	-45IaO5L5UD#%F3w<  '9	q1uc'l+ 	ADy,q/<?;Ci$$Q'S 	
	 }!+0i%'iGMM(#GQ/  KK&)', M5 6s   C9r3   c                      |st               S t               }|D ]5  }|d   }t        | fd      }|D ]  }||k7  s	|j                  |        7 t	               D cg c]  \  }}||vs| c}}S c c}}w )zMerge duplicate groups, keeping the longest entry in each group.

    Entries not in any group are passed through unchanged.
    r+   c                      t        |          S )Nr"   )idxr&   s    r   <lambda>z"merge_duplicates.<locals>.<lambda>r   s    C,= r   )key)listr   maxr   	enumerate)	r&   r3   removedgr+   bestr=   r   r0   s	   `        r   merge_duplicatesrF   c   s     G}G !I,7 => 	!Cd{C 	!	! $G,A$!Q0@AAAAs   !A6.A6)__doc__hashlibloggingtypingr   r   r   r   r   	getLogger__name__loggerSHINGLE_SIZESIMILARITY_THRESHOLDstrintr   floatr%   r:   rF    r   r   <module>rT      s      . .			8	$   #/ C C 3s8 $2s3x 2CH 2 2" ,*#Y** * 
$sCx.	*ZB#YBc3h B 
#YBr   