#!/usr/bin/env python3
"""
Document type deduplication script that uses an LLM to merge similar types.
Groups document type variations (e.g., "Deposition", "deposition", "Deposition Transcript")
into canonical types.
"""

import os
import sys
import json
import re
import argparse
from pathlib import Path
from typing import Dict, List
from collections import Counter

from openai import OpenAI
from dotenv import load_dotenv
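
# Third-party dependencies: the openai and python-dotenv packages
# (pip install openai python-dotenv).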


class DocumentTypeDeduplicator:
    """Deduplicate document types using an LLM."""

    def __init__(self, api_url: str, api_key: str, model: str = "gpt-4o"):
        self.client = OpenAI(api_key=api_key, base_url=api_url)
        self.model = model
        self.results_dir = Path("./results")
        self.output_file = Path("./dedupe_types.json")

    def collect_document_types(self) -> Counter:
        """Collect all document types from JSON files."""
        types = []
        for json_file in self.results_dir.glob("**/*.json"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                doc_type = data.get('document_metadata', {}).get('document_type')
                if doc_type:
                    types.append(str(doc_type).strip())
            except Exception as e:
                print(f"Warning: Could not read {json_file}: {e}")
        return Counter(types)
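
    # Each result file is expected to carry the type at
    # data["document_metadata"]["document_type"]; a minimal example of the
    # shape read above (all other fields are assumptions) would be:
    #   {"document_metadata": {"document_type": "Deposition"}}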

    def _deduplicate_in_batches(self, unique_types: List[str], type_counts: Counter) -> Dict[str, str]:
        """Deduplicate types in batches to handle large numbers of types."""
        batch_size = 100
        all_mappings = {}
        canonical_to_variants = {}
        # First pass: deduplicate each batch independently
        for i in range(0, len(unique_types), batch_size):
            batch = unique_types[i:i + batch_size]
            total_batches = (len(unique_types) + batch_size - 1) // batch_size
            print(f"  Processing batch {i // batch_size + 1}/{total_batches} ({len(batch)} types)...")
            try:
                batch_mappings = self._deduplicate_single_batch(batch)
                # Collect mappings and track which originals map to each canonical type
                for original, canonical in batch_mappings.items():
                    all_mappings[original] = canonical
                    if canonical not in canonical_to_variants:
                        canonical_to_variants[canonical] = []
                    canonical_to_variants[canonical].append(original)
            except Exception as e:
                print(f"  Warning: Failed to process batch, using original names: {e}")
                for t in batch:
                    all_mappings[t] = t
                    if t not in canonical_to_variants:
                        canonical_to_variants[t] = []
                    canonical_to_variants[t].append(t)
        # Second pass: deduplicate the canonical types themselves
        # (in case different batches created similar canonical types)
        print(f"\n📋 Batch processing created {len(canonical_to_variants)} unique canonical types")
        print("Running final deduplication pass to merge any duplicates across batches...")
        try:
            canonical_types = list(canonical_to_variants.keys())
            canonical_mappings = self._deduplicate_final_pass(canonical_types)
            # Remap every original type through the final canonical mapping
            for original, first_canonical in all_mappings.items():
                final_canonical = canonical_mappings.get(first_canonical, first_canonical)
                all_mappings[original] = final_canonical
            # Count the final canonical types
            final_canonicals = set(all_mappings.values())
            print(f"✅ Final deduplication reduced {len(canonical_to_variants)} → {len(final_canonicals)} canonical types")
        except Exception as e:
            print(f"  Warning: Failed to deduplicate canonical types: {e}")
        return all_mappings
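
    # Illustrative two-pass flow (labels invented for the example): batch 1 maps
    # "dep" → "Deposition" and batch 2 maps "DEPOSITION" → "Deposition Transcript";
    # the final pass then merges "Deposition Transcript" → "Deposition", so every
    # original label lands on a single canonical type.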

    def _deduplicate_final_pass(self, canonical_types: List[str]) -> Dict[str, str]:
        """Final deduplication pass for canonical types from different batches."""
        if len(canonical_types) <= 1:
            return {t: t for t in canonical_types}
        prompt = f"""You are a legal document classifier performing a FINAL CLEANUP pass on canonical document types.

Your task: Merge any remaining duplicate or very similar canonical types.

⚠️⚠️⚠️ CRITICAL RULES ⚠️⚠️⚠️
1. These are ALREADY canonical types, so be conservative
2. ONLY merge if types are truly the same thing with different names:
   - "Deposition" and "Deposition Transcript" → "Deposition"
   - "Court Filing" and "Court Document" → "Court Filing"
   - "Email" and "E-mail" → "Email"
3. DO NOT merge types that are legitimately different:
   - "Letter" and "Email" are DIFFERENT (keep separate)
   - "Affidavit" and "Declaration" are DIFFERENT (keep separate)
   - "Motion" and "Memorandum" are DIFFERENT (keep separate)
4. Prefer the SHORTER, simpler canonical name when merging
5. Use these standard canonical types when possible:
   - Deposition
   - Court Filing
   - Letter
   - Email
   - Affidavit
   - Motion
   - Subpoena
   - Flight Log
   - Financial Record
   - Contract
   - Memorandum
   - Transcript
   - Exhibit
   - Declaration
   - Report

Here are the canonical types to review (sorted alphabetically):
{json.dumps(sorted(canonical_types), indent=2)}

Return ONLY valid JSON mapping each type to its final canonical form:
{{
  "Type 1": "Final Canonical Type",
  "Type 2": "Final Canonical Type",
  ...
}}

If a type is already perfect, map it to itself."""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
            max_tokens=4000
        )
        content = self._extract_json(response.choices[0].message.content.strip())
        try:
            mappings = json.loads(content)
        except json.JSONDecodeError:
            print("Failed to parse JSON response in final pass. First 500 chars:")
            print(content[:500])
            raise
        # Validate mappings: fall back to the original name on empty values
        validated_mappings = {}
        for original, canonical in mappings.items():
            canonical = str(canonical).strip()
            if not canonical:
                canonical = original
            validated_mappings[original] = canonical
        return validated_mappings

    @staticmethod
    def _extract_json(content: str) -> str:
        """Pull a JSON object out of an LLM response (shared by both passes)."""
        # Prefer a fenced ```json ... ``` block if one is present
        json_match = re.search(r'```(?:json)?\s*\n(.*?)\n```', content, re.DOTALL)
        if json_match:
            return json_match.group(1).strip()
        # Otherwise grab the widest {...} span
        json_match = re.search(r'\{.*\}', content, re.DOTALL)
        if json_match:
            return json_match.group(0).strip()
        # Brace-counting fallback: take the first balanced object
        start = content.find('{')
        if start >= 0:
            brace_count = 0
            for i in range(start, len(content)):
                if content[i] == '{':
                    brace_count += 1
                elif content[i] == '}':
                    brace_count -= 1
                    if brace_count == 0:
                        return content[start:i + 1]
        return content
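
    # e.g. _extract_json('Sure!\n```json\n{"A": "B"}\n```') returns '{"A": "B"}';
    # responses without a fence fall back to the regex and brace-count paths.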

    def _deduplicate_single_batch(self, types: List[str]) -> Dict[str, str]:
        """Deduplicate a single batch of types."""
        prompt = f"""You are a legal document classifier. Your task is to group similar document type labels into standardized canonical types.

⚠️⚠️⚠️ CRITICAL RULES ⚠️⚠️⚠️
1. The canonical type MUST be a clean, professional document type name
2. Use title case (e.g., "Deposition", "Court Filing", "Email")
3. Merge variations that mean the same thing:
   - "deposition" → "Deposition"
   - "DEPOSITION" → "Deposition"
   - "deposition transcript" → "Deposition"
   - "dep" → "Deposition"
4. Common canonical types to use:
   - Deposition
   - Court Filing
   - Letter
   - Email
   - Affidavit
   - Motion
   - Subpoena
   - Flight Log
   - Financial Record
   - Contract
   - Memorandum
   - Transcript
   - Exhibit
   - Declaration
   - Report
   - Unknown (only if truly unidentifiable)
5. Be generous with merging - if types are similar, merge them
6. Prefer shorter, cleaner canonical names

Here are the document types to deduplicate:
{json.dumps(types, indent=2)}

Return ONLY valid JSON in this exact format:
{{
  "document_type_1": "Canonical Type",
  "document_type_2": "Canonical Type",
  ...
}}

Map every input type to its canonical form. If a type is already clean, map it to itself."""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
            max_tokens=4000
        )
        content = self._extract_json(response.choices[0].message.content.strip())
        try:
            mappings = json.loads(content)
        except json.JSONDecodeError:
            print("Failed to parse JSON response. First 500 chars:")
            print(content[:500])
            raise
        # Validate and clean up mappings: empty values fall back to "Unknown"
        validated_mappings = {}
        for original, canonical in mappings.items():
            canonical = str(canonical).strip()
            if not canonical:
                canonical = "Unknown"
            validated_mappings[original] = canonical
        return validated_mappings
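
    # Example batch result (labels taken from the prompt's own examples):
    #   {"deposition": "Deposition", "DEPOSITION": "Deposition", "dep": "Deposition"}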

    def deduplicate_types(self, type_counts: Counter) -> Dict[str, str]:
        """Use the LLM to deduplicate document types."""
        # Unique types, sorted by frequency (most common first)
        unique_types = sorted(type_counts.keys(), key=lambda x: type_counts[x], reverse=True)
        print(f"Found {len(unique_types)} unique document types")
        # Too many types for a single request? Process in batches.
        if len(unique_types) > 100:
            print("Processing in batches (too many types for a single request)...")
            return self._deduplicate_in_batches(unique_types, type_counts)
        print("Processing single-batch deduplication...")
        mappings = self._deduplicate_single_batch(unique_types)
        # Canonical types produced by the first pass
        canonical_types = list(set(mappings.values()))
        print(f"\n📋 Initial deduplication created {len(canonical_types)} canonical types")
        # Run a final review pass
        if len(canonical_types) > 1:
            print("Running final review pass for cleanup...")
            try:
                final_mappings = self._deduplicate_final_pass(canonical_types)
                # Remap every original type through the final pass
                for original, first_canonical in mappings.items():
                    final_canonical = final_mappings.get(first_canonical, first_canonical)
                    mappings[original] = final_canonical
                final_canonicals = set(mappings.values())
                print(f"✅ Final review reduced {len(canonical_types)} → {len(final_canonicals)} canonical types")
            except Exception as e:
                print(f"  Warning: Final review failed: {e}")
        return mappings

    def save_mappings(self, mappings: Dict[str, str], type_counts: Counter):
        """Save deduplication mappings to a JSON file."""
        # Summary stats
        canonical_types = set(mappings.values())
        total_docs = sum(type_counts.values())
        output = {
            "stats": {
                "original_types": len(mappings),
                "canonical_types": len(canonical_types),
                "total_documents": total_docs,
                "reduction_percentage": round((1 - len(canonical_types) / len(mappings)) * 100, 1)
            },
            "mappings": mappings
        }
        with open(self.output_file, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2, ensure_ascii=False)
        print(f"\n✅ Saved type mappings to {self.output_file}")
        print(f"  Original types: {len(mappings)}")
        print(f"  Canonical types: {len(canonical_types)}")
        print(f"  Reduction: {output['stats']['reduction_percentage']}%")
        # Breakdown by canonical type, weighted by document counts
        canonical_counts = Counter()
        for original, canonical in mappings.items():
            canonical_counts[canonical] += type_counts[original]
        print("\n📊 Top canonical types:")
        for canonical, count in canonical_counts.most_common(10):
            print(f"  {canonical}: {count} documents")
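
    # Resulting dedupe_types.json shape (numbers illustrative):
    # {
    #   "stats": {"original_types": 42, "canonical_types": 15,
    #             "total_documents": 1200, "reduction_percentage": 64.3},
    #   "mappings": {"deposition transcript": "Deposition", "...": "..."}
    # }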


def main():
    load_dotenv()
    parser = argparse.ArgumentParser(description="Deduplicate document types using an LLM")
    parser.add_argument("--api-url", help="OpenAI-compatible API base URL")
    parser.add_argument("--api-key", help="API key")
    parser.add_argument("--model", help="Model name")
    args = parser.parse_args()
    api_url = args.api_url or os.getenv("OPENAI_API_URL")
    api_key = args.api_key or os.getenv("OPENAI_API_KEY")
    model = args.model or os.getenv("OPENAI_MODEL", "gpt-4o")
    if not api_url or not api_key:
        print("Error: API URL and API key are required")
        print("Set OPENAI_API_URL and OPENAI_API_KEY in .env or pass via --api-url and --api-key")
        return 1
    print("=" * 60)
    print("DOCUMENT TYPE DEDUPLICATION")
    print("=" * 60)
    deduplicator = DocumentTypeDeduplicator(api_url, api_key, model)
    # Collect all document types
    type_counts = deduplicator.collect_document_types()
    if not type_counts:
        print("No document types found in results directory")
        return 1
    # Deduplicate using the LLM
    mappings = deduplicator.deduplicate_types(type_counts)
    # Save results
    deduplicator.save_mappings(mappings, type_counts)
    print("\n✅ Done! Update .eleventy.js to load dedupe_types.json")
    return 0


if __name__ == "__main__":
    sys.exit(main())