#!/usr/bin/env python3
"""
Entity deduplication script using LLM to identify and merge duplicate entities.
Processes all JSON files from ./results/ and creates a dedupe.json mapping file.
"""

import argparse
import json
import os
import re
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set

from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Entity categories read from the result files and written to dedupe.json.
ENTITY_TYPES = ("people", "organizations", "locations")

# Patterns used to sanity-check the canonical names the model returns for people.
_NUMBERED_RANGE_RE = re.compile(
    r'(accuser|victim|witness|jane doe|john doe)s?\s*\d+\s*-\s*\d+',
    re.IGNORECASE,
)
_RELATIONSHIP_RE = re.compile(
    r"'s\s+(brother|sister|friend|attorney|lawyer|associate|mother|father|son|daughter)"
)
_ROLE_RE = re.compile(r"^(the|a)\s+(defendant|plaintiff|witness|victim|judge|president)")
_ARTICLE_RE = re.compile(r"^(the|a)\s+")
_BARE_TITLE_RE = re.compile(r"^(mr|ms|mrs|dr|judge|president)\.?\s*$")
# ASCII and typographic apostrophes (the original code tested the ASCII form twice).
_POSSESSIVE_SUFFIXES = ("'s", "\u2019s")

# NOTE: the example blocks are plain strings that are substituted into the prompt
# template, so they must use single braces; doubled braces would be shown to the
# model literally and are not valid JSON.
_PEOPLE_EXAMPLES = """Examples:
{
  "Jeffrey Epstein": ["Jeffrey Epstein", "JEFFREY EPSTEIN", "Epstein", "EPSTEIN", "J. Epstein", "Jeffrey E. Epstein", "J Epstein", "Jeffery Epstein", "Mr. Epstein", "Jeffrey E.", "Epstein's"],
  "Ghislaine Maxwell": ["Ghislaine Maxwell", "GHISLAINE MAXWELL", "Maxwell", "G. Maxwell", "Ghislane Maxwell", "Ghislain Maxwell", "Ms. Maxwell"],
  "Bill Clinton": ["Bill Clinton", "BILL CLINTON", "Clinton", "William Clinton", "William J. Clinton", "President Clinton", "William Jefferson Clinton"],
  "Prince Andrew": ["Prince Andrew", "PRINCE ANDREW", "Andrew", "Duke of York", "HRH Prince Andrew", "Prince Andrew, Duke of York"]
}

CORRECT handling of numbered identifiers:
{
  "Accuser 1": ["Accuser 1", "Accuser-1", "Accuser 01", "ACCUSER 1"],
  "Accuser 2": ["Accuser 2", "Accuser-2", "Accuser 02", "ACCUSER 2"],
  "Accuser 3": ["Accuser 3", "Accuser-3", "Accuser 03"],
  "Jane Doe 1": ["Jane Doe 1", "Jane Doe-1", "JANE DOE 1"],
  "Jane Doe 2": ["Jane Doe 2", "Jane Doe-2"]
}

WRONG EXAMPLES (DO NOT DO THIS):
{
  "Accusers 1-3": ["Accuser 1", "Accuser 2", "Accuser 3"] // WRONG - these are different people!
  "Victims": ["Victim 1", "Victim 2", "Victim 3"] // WRONG - keep them separate
  "Mr. Epstein's brother": ["Jeffrey Epstein", "Epstein"] // WRONG - use actual name
  "The President": ["Bill Clinton"] // WRONG - use actual name
  "Plaintiff's attorney": ["John Smith"] // WRONG - use actual name
}"""

_ORGANIZATION_EXAMPLES = """Examples:
{
  "Federal Bureau of Investigation": ["Federal Bureau of Investigation", "FBI", "F.B.I.", "FEDERAL BUREAU OF INVESTIGATION", "Federal Bureau Of Investigation"],
  "United States District Court": ["United States District Court", "U.S. District Court", "USDC", "District Court"],
  "Victoria's Secret": ["Victoria's Secret", "VICTORIA'S SECRET", "Victorias Secret", "Victoria Secret"]
}"""

_LOCATION_EXAMPLES = """Examples:
{
  "New York City": ["New York City", "NEW YORK CITY", "NYC", "New York", "New York, NY", "New York City, NY", "Manhattan"],
  "Little Saint James": ["Little Saint James", "LITTLE SAINT JAMES", "Little St. James", "Little St James", "LSJ"],
  "Palm Beach": ["Palm Beach", "PALM BEACH", "Palm Beach, Florida", "Palm Beach, FL"]
}"""

# Filled in with str.format(); literal braces in the template are therefore doubled.
_PROMPT_TEMPLATE = """You are an expert at identifying and merging duplicate entities in legal documents.

Given a list of {entity_type}, identify which names refer to the same entity and group them under their canonical name.

⚠️⚠️⚠️ CRITICAL WARNING ⚠️⚠️⚠️
The canonical name MUST be an actual person's PROPER NAME (First + Last).
NEVER use descriptive phrases like "Mr. X's brother" or "The defendant".
If you see "Jeffrey Epstein" in the list, that MUST be the canonical name, NOT "Mr. Epstein's brother".

CRITICAL RULES FOR CANONICAL NAMES:

**What makes a GOOD canonical name:**
- Actual proper names (e.g., "Jeffrey Epstein", not "Mr. Epstein's brother")
- Full first name + last name (e.g., "Jeffrey Epstein", not just "Epstein")
- Include middle initial if commonly used (e.g., "William J. Clinton")
- Use the most formal/complete version of the actual name

**What is a BAD canonical name (NEVER USE THESE):**
- Descriptive phrases (e.g., "Mr. Epstein's brother", "The defendant", "Plaintiff's attorney")
- Titles alone (e.g., "The President", "The Judge")
- Possessive forms (e.g., "Epstein's", "Maxwell's")
- Roles or relationships (e.g., "co-conspirator", "witness", "victim")
- Generic references (e.g., "he", "she", "defendant")

**CRITICAL: Do NOT merge numbered identifiers:**
- "Accuser 1", "Accuser 2", "Accuser 3" are DIFFERENT people - keep them separate
- "Victim 1", "Victim 2", "Victim 3" are DIFFERENT people - keep them separate
- "Witness 1", "Witness 2", "Witness 3" are DIFFERENT people - keep them separate
- "Jane Doe 1", "Jane Doe 2" are DIFFERENT people - keep them separate
- ONLY merge if the NUMBER is the same (e.g., "Accuser 1" = "Accuser-1" = "Accuser-01")

**Deduplication Rules:**
1. **Use Proper Names Only**: The canonical name MUST be an actual person's name
2. **Case Insensitive**: "EPSTEIN", "Epstein", "epstein" are all the same
3. **Prefer Full Names**: "Jeffrey Epstein" not "Epstein" or "J. Epstein"
4. **Merge Variants**:
   - Last name only → Full name (e.g., "Epstein" → "Jeffrey Epstein")
   - Initials → Full name (e.g., "J. Epstein" → "Jeffrey Epstein")
   - Titles with same person (e.g., "Mr. Epstein" → "Jeffrey Epstein")
   - Honorifics (Dr., Mr., Ms., President, Judge, etc.) → actual name
5. **OCR Errors**: Merge spelling variations (e.g., "Jeffery" = "Jeffrey")
6. **Whitespace/Punctuation**: Ignore differences in spacing, periods, commas

For PEOPLE specifically:
- The canonical name should be First Name + Last Name (or First + Middle Initial + Last)
- Merge all variants: full name, last name only, initials, titles, nicknames
- NEVER use descriptive phrases like "Mr. X's brother" as canonical

For ORGANIZATIONS:
- Merge: Full name with abbreviations (FBI = Federal Bureau of Investigation)
- Merge: Different legal forms (Inc., LLC, Corp., etc.)
- Merge: With/without "The" prefix

For LOCATIONS:
- Merge: City abbreviations (NYC = New York City)
- Merge: With/without state (Palm Beach = Palm Beach, FL)
- Merge: Common neighborhood/borough names with city

{examples}

IMPORTANT:
- Every entity must appear in exactly one group
- The canonical name MUST be a proper name (First + Last), NOT a description
- Use the most complete PROPER NAME as canonical (e.g., "Jeffrey Epstein" not "Mr. Epstein's brother")
- When in doubt between a descriptive phrase and a name, ALWAYS choose the actual name
- Merge aggressively - group all variants of the same person together
- Include all variations in the variants array, including the canonical name itself

VALIDATION:
- Ask yourself: "Is this canonical name an actual person's name?" If no, find the actual name from the variants
- Examples of GOOD canonical names: "Jeffrey Epstein", "Bill Clinton", "John Smith"
- Examples of BAD canonical names: "Mr. Epstein's brother", "The defendant", "Plaintiff"

STEP-BY-STEP PROCESS:
1. Look at the list of variants
2. Find the FULL PROPER NAME (e.g., "Jeffrey Epstein")
3. Use that as the canonical name
4. Add all other variants to the array
5. NEVER use descriptive phrases as canonical names

EXAMPLE THOUGHT PROCESS:
Variants: ["Jeffrey Epstein", "Epstein", "Mr. Epstein", "Mr. Epstein's brother", "J. Epstein"]
Question: Which is the actual person's full name?
Answer: "Jeffrey Epstein" ✓
NOT "Mr. Epstein's brother" ✗ (this is a description, not a name)
Result: {{"Jeffrey Epstein": ["Jeffrey Epstein", "Epstein", "Mr. Epstein", "Mr. Epstein's brother", "J. Epstein"]}}

Return ONLY valid JSON with NO extra text, markdown, or explanations."""


class EntityDeduplicator:
    """Deduplicate entities using LLM assistance"""

    def __init__(self, api_url: str, api_key: str, model: str = "gpt-4o"):
        """Create the API client and fix the input/output locations.

        Args:
            api_url: Base URL of the OpenAI-compatible API.
            api_key: API key passed to the client.
            model: Chat model used for the deduplication requests.
        """
        self.client = OpenAI(api_key=api_key, base_url=api_url)
        self.model = model
        self.results_dir = Path("./results")
        self.dedupe_file = Path("./dedupe.json")

    def load_all_entities(self) -> Dict[str, List[str]]:
        """Load all unique entities from all JSON files.

        Every ``*.json`` file below ``self.results_dir`` is read; files that
        cannot be read or parsed are reported and skipped.

        Returns:
            A dict with one sorted list of unique names per entity type.
        """
        entities: Dict[str, Set[str]] = {entity_type: set() for entity_type in ENTITY_TYPES}

        json_files = list(self.results_dir.glob("**/*.json"))
        print(f"Found {len(json_files)} JSON files to process")

        for json_file in tqdm(json_files, desc="Loading entities"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers both malformed JSON and undecodable bytes.
                print(f"Warning: Could not load {json_file}: {e}")
                continue

            section = data.get("entities") if isinstance(data, dict) else None
            if not isinstance(section, dict):
                continue

            for entity_type in ENTITY_TYPES:
                names = section.get(entity_type)
                # Only accept real lists: set.update() on a bare string would
                # add its individual characters as "entities".
                if isinstance(names, (list, tuple)):
                    entities[entity_type].update(
                        name for name in names if isinstance(name, str)
                    )

        return {entity_type: sorted(names) for entity_type, names in entities.items()}

    def get_deduplication_prompt(self, entity_type: str) -> str:
        """Get the system prompt for deduplication.

        Args:
            entity_type: "people" or "organizations"; any other value gets the
                location examples.

        Returns:
            The full system prompt including type-specific examples.
        """
        if entity_type == "people":
            examples = _PEOPLE_EXAMPLES
        elif entity_type == "organizations":
            examples = _ORGANIZATION_EXAMPLES
        else:  # locations
            examples = _LOCATION_EXAMPLES
        return _PROMPT_TEMPLATE.format(entity_type=entity_type, examples=examples)

    @staticmethod
    def _extract_json_text(content: str) -> str:
        """Return the part of a model reply that most likely holds the JSON."""
        # 1. JSON between markdown code fences
        fenced = re.search(r'```(?:json)?\s*\n(.*?)\n```', content, re.DOTALL)
        if fenced:
            return fenced.group(1).strip()

        # 2. Everything from the first "{" to the last "}"
        braced = re.search(r'\{.*\}', content, re.DOTALL)
        if braced:
            return braced.group(0).strip()

        # 3. Strip markdown manually
        if content.startswith('```json'):
            content = content[7:]
        elif content.startswith('```'):
            content = content[3:]
        if content.endswith('```'):
            content = content[:-3]
        return content.strip()

    @staticmethod
    def _first_balanced_object(content: str) -> str:
        """Return the first brace-balanced ``{...}`` span in *content*.

        Braces are counted textually (braces inside JSON strings are not
        special-cased).

        Raises:
            ValueError: If there is no opening brace or it is never closed.
        """
        start = content.find('{')
        if start == -1:
            raise ValueError("No JSON object found")

        depth = 0
        for index in range(start, len(content)):
            char = content[index]
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth == 0:
                    return content[start:index + 1]
        raise ValueError("Could not find complete JSON object")

    def _parse_groups(self, content: str, batch_idx: int) -> Dict[str, object]:
        """Parse the model's JSON reply into ``{canonical: variants}``.

        If the text is not valid JSON, the first brace-balanced object is
        tried before giving up.

        Raises:
            json.JSONDecodeError: If nothing parseable could be recovered.
            ValueError: If the parsed JSON is not an object.
        """
        try:
            groups = json.loads(content)
        except json.JSONDecodeError as e:
            print(f"\nJSON parsing error in batch {batch_idx}:")
            print(f"Error: {e}")
            print(f"Content preview: {content[:500]}")
            try:
                groups = json.loads(self._first_balanced_object(content))
                print("✓ Recovered JSON from malformed response")
            except Exception as salvage_error:
                print(f"Could not salvage JSON: {salvage_error}")
                raise e

        if not isinstance(groups, dict):
            raise ValueError(
                f"Expected a JSON object of groups, got {type(groups).__name__}"
            )
        return groups

    @staticmethod
    def _coerce_variants(variants: object) -> List[str]:
        """Normalise a group's value to a list of strings.

        A bare string is treated as a single variant (iterating it would
        yield characters); non-string items and other types are dropped.
        """
        if isinstance(variants, str):
            return [variants]
        if isinstance(variants, (list, tuple)):
            return [v for v in variants if isinstance(v, str)]
        return []

    @staticmethod
    def _is_possessive(name: str) -> bool:
        """Return True if *name* ends with a possessive "'s"."""
        return name.lower().endswith(_POSSESSIVE_SUFFIXES)

    def _fix_people_canonical(self, canonical: str, variants: List[str]) -> str:
        """Replace a descriptive canonical name with a real name from *variants*.

        Handles relationship phrases ("X's brother"), role phrases
        ("The defendant"), possessives ("Epstein's") and bare titles ("Mr.").
        The longest acceptable variant is chosen; if none qualifies the
        canonical name is returned unchanged.
        """
        canonical_lower = canonical.lower()
        candidates: List[str] = []

        if _RELATIONSHIP_RE.search(canonical_lower):
            # Prefer names with first and last name
            candidates = [
                v for v in variants
                if not _RELATIONSHIP_RE.search(v.lower()) and len(v.split()) >= 2
            ]
        elif _ROLE_RE.search(canonical_lower):
            candidates = [
                v for v in variants
                if not _ARTICLE_RE.search(v.lower()) and len(v.split()) >= 2
            ]
        elif self._is_possessive(canonical):
            candidates = [v for v in variants if not self._is_possessive(v)]
        elif _BARE_TITLE_RE.match(canonical_lower):
            candidates = [v for v in variants if len(v.split()) >= 2]

        if not candidates:
            return canonical

        # Pick the longest/most complete
        better_name = max(candidates, key=len)
        print(f" ⚠️ Fixed bad canonical: '{canonical}' → '{better_name}'")
        return better_name

    def deduplicate_entities(self, entities: List[str], entity_type: str, batch_size: int = 30) -> Dict[str, str]:
        """Use LLM to deduplicate entities, processing in batches.

        Args:
            entities: Names to deduplicate.
            entity_type: "people", "organizations" or "locations".
            batch_size: Number of names sent per request.

        Returns:
            A mapping from every name in *entities* to its canonical name.
            Names the model did not place in a group, and all names of a
            batch that failed, map to themselves.
        """
        if not entities:
            return {}

        print(f"\nDeduplicating {len(entities)} {entity_type}...")

        # Process in batches
        all_mappings: Dict[str, str] = {}
        batches = [entities[i:i + batch_size] for i in range(0, len(entities), batch_size)]
        # The system prompt only depends on the entity type, so build it once.
        system_prompt = self.get_deduplication_prompt(entity_type)

        for batch_idx, batch in enumerate(tqdm(batches, desc=f"Processing {entity_type} batches")):
            batch_members = set(batch)
            try:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {
                            "role": "user",
                            "content": f"Identify duplicates in this list of {entity_type}:\n\n"
                            + "\n".join(f"- {e}" for e in batch)
                            + "\n\nRemember: Use FULL PROPER NAMES as canonical (e.g., 'Jeffrey Epstein'), NOT descriptions (e.g., 'Mr. Epstein's brother')."
                        },
                    ],
                    temperature=0.0,  # Make it deterministic
                    max_tokens=4096
                )

                # The API may return no text at all; treat that as empty.
                content = (response.choices[0].message.content or "").strip()
                groups = self._parse_groups(self._extract_json_text(content), batch_idx)

                # Validate and convert groups to individual mappings
                for canonical, raw_variants in groups.items():
                    variants = self._coerce_variants(raw_variants)

                    if entity_type == "people":
                        # "Accusers 1-3" style groups merge different people:
                        # split them back into individuals.
                        if _NUMBERED_RANGE_RE.search(canonical):
                            print(f" ⚠️ Incorrectly merged group: '{canonical}' - splitting back into individuals")
                            for variant in variants:
                                if variant in batch_members:
                                    all_mappings[variant] = variant
                            continue
                        canonical = self._fix_people_canonical(canonical, variants)

                    for variant in variants:
                        # Ignore names the model made up or pulled in from
                        # elsewhere so they cannot clobber other batches.
                        if variant in batch_members:
                            all_mappings[variant] = canonical

            except Exception as e:
                # Deliberately broad: one bad batch must not abort the run.
                print(f"Warning: Error processing batch {batch_idx}: {e}")

            # Any entity without a mapping (failed batch, or omitted by the
            # model) maps to itself so nothing is silently dropped.
            for entity in batch:
                all_mappings.setdefault(entity, entity)

        return all_mappings

    def merge_batches(self, mappings: Dict[str, str]) -> Dict[str, str]:
        """Merge mappings from multiple batches to ensure consistency.

        The canonical names already chosen are kept (they are *not* replaced
        by the longest variant, which could promote a descriptive phrase).
        Two cross-batch inconsistencies are repaired:

        * chains: if a canonical name is itself mapped to another name, the
          chain is followed to its end;
        * spelling: canonical names that differ only in letter case or
          whitespace are unified to the spelling most variants point to
          (ties: longer, then lexicographically larger spelling).

        Args:
            mappings: ``variant -> canonical`` as produced per batch.

        Returns:
            A new ``variant -> canonical`` mapping with the same keys.
        """
        def resolve(name: str) -> str:
            # Follow name -> canonical links; stop at a fixed point or when a
            # cycle is detected.
            seen = {name}
            while True:
                target = mappings.get(name, name)
                if target == name or target in seen:
                    return name
                seen.add(target)
                name = target

        def normalize(name: str) -> str:
            return " ".join(name.split()).casefold()

        resolved = {variant: resolve(canonical) for variant, canonical in mappings.items()}

        # Count how many variants point to each spelling of a canonical name.
        votes: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
        for canonical in resolved.values():
            votes[normalize(canonical)][canonical] += 1

        preferred = {
            key: max(spellings, key=lambda s: (spellings[s], len(s), s))
            for key, spellings in votes.items()
        }

        return {
            variant: preferred[normalize(canonical)]
            for variant, canonical in resolved.items()
        }

    def process_all(self, batch_size: int = 30) -> Dict[str, Dict[str, str]]:
        """Process all entity types.

        Args:
            batch_size: Number of names sent per request.

        Returns:
            ``{entity_type: {variant: canonical}}`` for every entity type.
        """
        print("=" * 60)
        print("ENTITY DEDUPLICATION")
        print("=" * 60)

        # Load all entities
        all_entities = self.load_all_entities()

        print("\nEntity counts:")
        for entity_type, entity_list in all_entities.items():
            print(f" {entity_type}: {len(entity_list)}")

        # Deduplicate each type
        dedupe_mappings: Dict[str, Dict[str, str]] = {}
        for entity_type in ENTITY_TYPES:
            mappings = self.deduplicate_entities(
                all_entities[entity_type],
                entity_type,
                batch_size=batch_size
            )
            dedupe_mappings[entity_type] = self.merge_batches(mappings)

            # Show stats
            unique_after = len(set(dedupe_mappings[entity_type].values()))
            print(f" {entity_type}: {len(all_entities[entity_type])} → {unique_after} unique entities")

        return dedupe_mappings

    def save_dedupe_file(self, mappings: Dict[str, Dict[str, str]]):
        """Save deduplication mappings to JSON file"""
        with open(self.dedupe_file, 'w', encoding='utf-8') as f:
            json.dump(mappings, f, indent=2, ensure_ascii=False)
        print(f"\n✅ Deduplication mappings saved to {self.dedupe_file}")

    def load_existing_dedupe(self) -> Dict[str, Dict[str, str]]:
        """Load existing dedupe file if it exists, else return empty mappings"""
        if self.dedupe_file.exists():
            with open(self.dedupe_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        return {"people": {}, "organizations": {}, "locations": {}}


def main():
    """Command-line entry point: show stats or run the full deduplication.

    Command-line options take precedence over the OPENAI_API_URL,
    OPENAI_API_KEY and OPENAI_MODEL environment variables (a .env file is
    loaded first).
    """
    load_dotenv()

    parser = argparse.ArgumentParser(description="Deduplicate entities using LLM")
    parser.add_argument("--api-url", help="OpenAI-compatible API base URL")
    parser.add_argument("--api-key", help="API key")
    parser.add_argument("--model", help="Model name")
    parser.add_argument("--batch-size", type=int, default=30,
                        help="Entities per batch (default: 30)")
    parser.add_argument("--show-stats", action="store_true",
                        help="Show current deduplication stats and exit")
    args = parser.parse_args()

    api_url = args.api_url or os.getenv("OPENAI_API_URL")
    api_key = args.api_key or os.getenv("OPENAI_API_KEY")
    model = args.model or os.getenv("OPENAI_MODEL", "gpt-4o")

    deduplicator = EntityDeduplicator(api_url, api_key, model)

    if args.show_stats:
        # Just show stats
        existing = deduplicator.load_existing_dedupe()
        all_entities = deduplicator.load_all_entities()

        print("\nCurrent deduplication status:")
        for entity_type in ENTITY_TYPES:
            raw_count = len(all_entities[entity_type])
            if existing.get(entity_type):
                unique_count = len(set(existing[entity_type].values()))
                print(f" {entity_type}: {raw_count} raw → {unique_count} unique")
            else:
                print(f" {entity_type}: {raw_count} (not deduplicated)")
        return

    # Process and save
    mappings = deduplicator.process_all(batch_size=args.batch_size)
    deduplicator.save_dedupe_file(mappings)


if __name__ == "__main__":
    main()