deduplicate.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  1. #!/usr/bin/env python3
  2. """
  3. Entity deduplication script using LLM to identify and merge duplicate entities.
  4. Processes all JSON files from ./results/ and creates a dedupe.json mapping file.
  5. """
  6. import os
  7. import json
  8. import re
  9. from pathlib import Path
  10. from typing import Dict, List, Set
  11. from collections import defaultdict
  12. from openai import OpenAI
  13. from tqdm import tqdm
  14. from dotenv import load_dotenv
  15. class EntityDeduplicator:
  16. """Deduplicate entities using LLM assistance"""
  17. def __init__(self, api_url: str, api_key: str, model: str = "gpt-4o"):
  18. self.client = OpenAI(api_key=api_key, base_url=api_url)
  19. self.model = model
  20. self.results_dir = Path("./results")
  21. self.dedupe_file = Path("./dedupe.json")
  22. def load_all_entities(self) -> Dict[str, Set[str]]:
  23. """Load all unique entities from all JSON files"""
  24. entities = {
  25. "people": set(),
  26. "organizations": set(),
  27. "locations": set()
  28. }
  29. json_files = list(self.results_dir.glob("**/*.json"))
  30. print(f"Found {len(json_files)} JSON files to process")
  31. for json_file in tqdm(json_files, desc="Loading entities"):
  32. try:
  33. with open(json_file, 'r', encoding='utf-8') as f:
  34. data = json.load(f)
  35. if "entities" in data:
  36. for entity_type in ["people", "organizations", "locations"]:
  37. if entity_type in data["entities"]:
  38. entities[entity_type].update(data["entities"][entity_type])
  39. except Exception as e:
  40. print(f"Warning: Could not load {json_file}: {e}")
  41. return {k: sorted(list(v)) for k, v in entities.items()}
  42. def get_deduplication_prompt(self, entity_type: str) -> str:
  43. """Get the system prompt for deduplication"""
  44. if entity_type == "people":
  45. examples = """Examples:
  46. {
  47. "Jeffrey Epstein": ["Jeffrey Epstein", "JEFFREY EPSTEIN", "Epstein", "EPSTEIN", "J. Epstein", "Jeffrey E. Epstein", "J Epstein", "Jeffery Epstein", "Mr. Epstein", "Jeffrey E.", "Epstein's"],
  48. "Ghislaine Maxwell": ["Ghislaine Maxwell", "GHISLAINE MAXWELL", "Maxwell", "G. Maxwell", "Ghislane Maxwell", "Ghislain Maxwell", "Ms. Maxwell"],
  49. "Bill Clinton": ["Bill Clinton", "BILL CLINTON", "Clinton", "William Clinton", "William J. Clinton", "President Clinton", "William Jefferson Clinton"],
  50. "Prince Andrew": ["Prince Andrew", "PRINCE ANDREW", "Andrew", "Duke of York", "HRH Prince Andrew", "Prince Andrew, Duke of York"]
  51. }
  52. WRONG EXAMPLES (DO NOT DO THIS):
  53. {
  54. "Mr. Epstein's brother": ["Jeffrey Epstein", "Epstein"] // WRONG - use actual name
  55. "The President": ["Bill Clinton"] // WRONG - use actual name
  56. "Plaintiff's attorney": ["John Smith"] // WRONG - use actual name
  57. }"""
  58. elif entity_type == "organizations":
  59. examples = """Examples:
  60. {
  61. "Federal Bureau of Investigation": ["Federal Bureau of Investigation", "FBI", "F.B.I.", "FEDERAL BUREAU OF INVESTIGATION", "Federal Bureau Of Investigation"],
  62. "United States District Court": ["United States District Court", "U.S. District Court", "USDC", "District Court"],
  63. "Victoria's Secret": ["Victoria's Secret", "VICTORIA'S SECRET", "Victorias Secret", "Victoria Secret"]
  64. }"""
  65. else: # locations
  66. examples = """Examples:
  67. {
  68. "New York City": ["New York City", "NEW YORK CITY", "NYC", "New York", "New York, NY", "New York City, NY", "Manhattan"],
  69. "Little Saint James": ["Little Saint James", "LITTLE SAINT JAMES", "Little St. James", "Little St James", "LSJ"],
  70. "Palm Beach": ["Palm Beach", "PALM BEACH", "Palm Beach, Florida", "Palm Beach, FL"]
  71. }"""
  72. return f"""You are an expert at identifying and merging duplicate entities in legal documents.
  73. Given a list of {entity_type}, identify which names refer to the same entity and group them under their canonical name.
  74. CRITICAL RULES FOR CANONICAL NAMES:
  75. **What makes a GOOD canonical name:**
  76. - Actual proper names (e.g., "Jeffrey Epstein", not "Mr. Epstein's brother")
  77. - Full first name + last name (e.g., "Jeffrey Epstein", not just "Epstein")
  78. - Include middle initial if commonly used (e.g., "William J. Clinton")
  79. - Use the most formal/complete version of the actual name
  80. **What is a BAD canonical name (NEVER USE THESE):**
  81. - Descriptive phrases (e.g., "Mr. Epstein's brother", "The defendant", "Plaintiff's attorney")
  82. - Titles alone (e.g., "The President", "The Judge")
  83. - Possessive forms (e.g., "Epstein's", "Maxwell's")
  84. - Roles or relationships (e.g., "co-conspirator", "witness", "victim")
  85. - Generic references (e.g., "he", "she", "defendant")
  86. **Deduplication Rules:**
  87. 1. **Use Proper Names Only**: The canonical name MUST be an actual person's name
  88. 2. **Case Insensitive**: "EPSTEIN", "Epstein", "epstein" are all the same
  89. 3. **Prefer Full Names**: "Jeffrey Epstein" not "Epstein" or "J. Epstein"
  90. 4. **Merge Variants**:
  91. - Last name only → Full name (e.g., "Epstein" → "Jeffrey Epstein")
  92. - Initials → Full name (e.g., "J. Epstein" → "Jeffrey Epstein")
  93. - Titles with same person (e.g., "Mr. Epstein" → "Jeffrey Epstein")
  94. - Honorifics (Dr., Mr., Ms., President, Judge, etc.) → actual name
  95. 5. **OCR Errors**: Merge spelling variations (e.g., "Jeffery" = "Jeffrey")
  96. 6. **Whitespace/Punctuation**: Ignore differences in spacing, periods, commas
  97. For PEOPLE specifically:
  98. - The canonical name should be First Name + Last Name (or First + Middle Initial + Last)
  99. - Merge all variants: full name, last name only, initials, titles, nicknames
  100. - NEVER use descriptive phrases like "Mr. X's brother" as canonical
  101. For ORGANIZATIONS:
  102. - Merge: Full name with abbreviations (FBI = Federal Bureau of Investigation)
  103. - Merge: Different legal forms (Inc., LLC, Corp., etc.)
  104. - Merge: With/without "The" prefix
  105. For LOCATIONS:
  106. - Merge: City abbreviations (NYC = New York City)
  107. - Merge: With/without state (Palm Beach = Palm Beach, FL)
  108. - Merge: Common neighborhood/borough names with city
  109. {examples}
  110. IMPORTANT:
  111. - Every entity must appear in exactly one group
  112. - The canonical name MUST be a proper name (First + Last), NOT a description
  113. - Use the most complete PROPER NAME as canonical (e.g., "Jeffrey Epstein" not "Mr. Epstein's brother")
  114. - When in doubt between a descriptive phrase and a name, ALWAYS choose the actual name
  115. - Merge aggressively - group all variants of the same person together
  116. - Include all variations in the variants array, including the canonical name itself
  117. VALIDATION:
  118. - Ask yourself: "Is this canonical name an actual person's name?" If no, find the actual name from the variants
  119. - Examples of GOOD canonical names: "Jeffrey Epstein", "Bill Clinton", "John Smith"
  120. - Examples of BAD canonical names: "Mr. Epstein's brother", "The defendant", "Plaintiff"
  121. Return ONLY valid JSON with NO extra text, markdown, or explanations."""
  122. def deduplicate_entities(self, entities: List[str], entity_type: str, batch_size: int = 50) -> Dict[str, str]:
  123. """Use LLM to deduplicate entities, processing in batches"""
  124. if not entities:
  125. return {}
  126. print(f"\nDeduplicating {len(entities)} {entity_type}...")
  127. # Process in batches
  128. all_mappings = {}
  129. batches = [entities[i:i + batch_size] for i in range(0, len(entities), batch_size)]
  130. for batch_idx, batch in enumerate(tqdm(batches, desc=f"Processing {entity_type} batches")):
  131. try:
  132. response = self.client.chat.completions.create(
  133. model=self.model,
  134. messages=[
  135. {
  136. "role": "system",
  137. "content": self.get_deduplication_prompt(entity_type)
  138. },
  139. {
  140. "role": "user",
  141. "content": f"Identify duplicates in this list of {entity_type}:\n\n" + "\n".join(f"- {e}" for e in batch)
  142. }
  143. ],
  144. temperature=0.1,
  145. max_tokens=4096
  146. )
  147. content = response.choices[0].message.content.strip()
  148. # Robust JSON extraction
  149. # 1. Try to find JSON between markdown code fences
  150. json_match = re.search(r'```(?:json)?\s*\n(.*?)\n```', content, re.DOTALL)
  151. if json_match:
  152. content = json_match.group(1).strip()
  153. else:
  154. # 2. Try to find JSON between curly braces
  155. json_match = re.search(r'\{.*\}', content, re.DOTALL)
  156. if json_match:
  157. content = json_match.group(0).strip()
  158. else:
  159. # 3. Strip markdown manually
  160. if content.startswith('```json'):
  161. content = content[7:]
  162. elif content.startswith('```'):
  163. content = content[3:]
  164. if content.endswith('```'):
  165. content = content[:-3]
  166. content = content.strip()
  167. # Try to parse JSON
  168. try:
  169. groups = json.loads(content)
  170. except json.JSONDecodeError as e:
  171. print(f"\nJSON parsing error in batch {batch_idx}:")
  172. print(f"Error: {e}")
  173. print(f"Content preview: {content[:500]}")
  174. # Try to salvage by finding the first complete JSON object
  175. try:
  176. # Find first { and matching }
  177. start = content.find('{')
  178. if start == -1:
  179. raise ValueError("No JSON object found")
  180. brace_count = 0
  181. end = start
  182. for i in range(start, len(content)):
  183. if content[i] == '{':
  184. brace_count += 1
  185. elif content[i] == '}':
  186. brace_count -= 1
  187. if brace_count == 0:
  188. end = i + 1
  189. break
  190. if end > start:
  191. content = content[start:end]
  192. groups = json.loads(content)
  193. print(f"✓ Recovered JSON from malformed response")
  194. else:
  195. raise ValueError("Could not find complete JSON object")
  196. except Exception as salvage_error:
  197. print(f"Could not salvage JSON: {salvage_error}")
  198. raise e
  199. # Validate and convert groups to individual mappings
  200. for canonical, variants in groups.items():
  201. # Validate canonical name for people
  202. if entity_type == "people":
  203. # Check for bad canonical names
  204. bad_patterns = [
  205. r"'s\s+(brother|sister|friend|attorney|lawyer|associate)",
  206. r"^(the|a)\s+(defendant|plaintiff|witness|victim|judge|president)",
  207. r"^mr\.|^ms\.|^mrs\.|^dr\.\s*$",
  208. r"co-conspirator|witness\s+\d+|victim\s+\d+",
  209. r"'s$" # ends with possessive
  210. ]
  211. canonical_lower = canonical.lower()
  212. for pattern in bad_patterns:
  213. if re.search(pattern, canonical_lower):
  214. # This is a bad canonical name - try to find a better one
  215. # Look for the longest actual name in the variants
  216. better_name = max(
  217. (v for v in variants if len(v.split()) >= 2 and not re.search(pattern, v.lower())),
  218. key=len,
  219. default=canonical
  220. )
  221. if better_name != canonical:
  222. print(f" ⚠️ Fixed bad canonical: '{canonical}' → '{better_name}'")
  223. canonical = better_name
  224. break
  225. for variant in variants:
  226. all_mappings[variant] = canonical
  227. except Exception as e:
  228. print(f"Warning: Error processing batch {batch_idx}: {e}")
  229. # If batch fails, map each entity to itself
  230. for entity in batch:
  231. if entity not in all_mappings:
  232. all_mappings[entity] = entity
  233. return all_mappings
  234. def merge_batches(self, mappings: Dict[str, str]) -> Dict[str, str]:
  235. """Merge mappings from multiple batches to ensure consistency"""
  236. # Group by canonical names
  237. groups = defaultdict(set)
  238. for variant, canonical in mappings.items():
  239. groups[canonical].add(variant)
  240. # Pick the most common canonical name for each group
  241. final_mappings = {}
  242. for canonical, variants in groups.items():
  243. # Use the longest name as canonical (usually most complete)
  244. true_canonical = max(variants, key=len)
  245. for variant in variants:
  246. final_mappings[variant] = true_canonical
  247. return final_mappings
  248. def process_all(self, batch_size: int = 50) -> Dict[str, Dict[str, str]]:
  249. """Process all entity types"""
  250. print("=" * 60)
  251. print("ENTITY DEDUPLICATION")
  252. print("=" * 60)
  253. # Load all entities
  254. all_entities = self.load_all_entities()
  255. print(f"\nEntity counts:")
  256. for entity_type, entity_list in all_entities.items():
  257. print(f" {entity_type}: {len(entity_list)}")
  258. # Deduplicate each type
  259. dedupe_mappings = {}
  260. for entity_type in ["people", "organizations", "locations"]:
  261. mappings = self.deduplicate_entities(
  262. all_entities[entity_type],
  263. entity_type,
  264. batch_size=batch_size
  265. )
  266. dedupe_mappings[entity_type] = self.merge_batches(mappings)
  267. # Show stats
  268. unique_after = len(set(dedupe_mappings[entity_type].values()))
  269. print(f" {entity_type}: {len(all_entities[entity_type])} → {unique_after} unique entities")
  270. return dedupe_mappings
  271. def save_dedupe_file(self, mappings: Dict[str, Dict[str, str]]):
  272. """Save deduplication mappings to JSON file"""
  273. with open(self.dedupe_file, 'w', encoding='utf-8') as f:
  274. json.dump(mappings, f, indent=2, ensure_ascii=False)
  275. print(f"\n✅ Deduplication mappings saved to {self.dedupe_file}")
  276. def load_existing_dedupe(self) -> Dict[str, Dict[str, str]]:
  277. """Load existing dedupe file if it exists"""
  278. if self.dedupe_file.exists():
  279. with open(self.dedupe_file, 'r', encoding='utf-8') as f:
  280. return json.load(f)
  281. return {"people": {}, "organizations": {}, "locations": {}}
  282. def main():
  283. load_dotenv()
  284. import argparse
  285. parser = argparse.ArgumentParser(description="Deduplicate entities using LLM")
  286. parser.add_argument("--api-url", help="OpenAI-compatible API base URL")
  287. parser.add_argument("--api-key", help="API key")
  288. parser.add_argument("--model", help="Model name")
  289. parser.add_argument("--batch-size", type=int, default=50, help="Entities per batch (default: 50)")
  290. parser.add_argument("--show-stats", action="store_true", help="Show current deduplication stats and exit")
  291. args = parser.parse_args()
  292. api_url = args.api_url or os.getenv("OPENAI_API_URL")
  293. api_key = args.api_key or os.getenv("OPENAI_API_KEY")
  294. model = args.model or os.getenv("OPENAI_MODEL", "gpt-4o")
  295. deduplicator = EntityDeduplicator(api_url, api_key, model)
  296. if args.show_stats:
  297. # Just show stats
  298. existing = deduplicator.load_existing_dedupe()
  299. all_entities = deduplicator.load_all_entities()
  300. print("\nCurrent deduplication status:")
  301. for entity_type in ["people", "organizations", "locations"]:
  302. raw_count = len(all_entities[entity_type])
  303. if existing.get(entity_type):
  304. unique_count = len(set(existing[entity_type].values()))
  305. print(f" {entity_type}: {raw_count} raw → {unique_count} unique")
  306. else:
  307. print(f" {entity_type}: {raw_count} (not deduplicated)")
  308. return
  309. # Process and save
  310. mappings = deduplicator.process_all(batch_size=args.batch_size)
  311. deduplicator.save_dedupe_file(mappings)
  312. if __name__ == "__main__":
  313. main()