// .eleventy.js — Eleventy build configuration for the OCR document archive.
  1. const fs = require('fs');
  2. const path = require('path');
  3. module.exports = function(eleventyConfig) {
  4. // Copy results directory to output
  5. eleventyConfig.addPassthroughCopy({ "./results": "documents" });
  // Entity deduplication mappings (people/organizations/locations), produced
  // by an external dedupe step. A missing or unreadable dedupe.json is a
  // soft failure: entity names are simply left as-is.
  let dedupeMappings = { people: {}, organizations: {}, locations: {} };
  const dedupeFile = path.join(__dirname, 'dedupe.json');
  if (!fs.existsSync(dedupeFile)) {
    console.log('ℹ️ No dedupe.json found - entities will not be deduplicated');
  } else {
    try {
      dedupeMappings = JSON.parse(fs.readFileSync(dedupeFile, 'utf8'));
      console.log('✅ Loaded deduplication mappings from dedupe.json');
    } catch (e) {
      // Keep the empty defaults on parse/read failure.
      console.warn('⚠️ Could not load dedupe.json:', e.message);
    }
  }
  // Document-type canonicalization map ({ rawType: canonicalType }) from an
  // optional dedupe_types.json. Same soft-failure policy as entity dedupe.
  let typeDedupeMap = {};
  const typeDedupeFile = path.join(__dirname, 'dedupe_types.json');
  if (!fs.existsSync(typeDedupeFile)) {
    console.log('ℹ️ No dedupe_types.json found - document types will not be deduplicated');
  } else {
    try {
      const data = JSON.parse(fs.readFileSync(typeDedupeFile, 'utf8'));
      typeDedupeMap = data.mappings || {};
      console.log('✅ Loaded document type mappings from dedupe_types.json');
    } catch (e) {
      // Keep the empty map on parse/read failure.
      console.warn('⚠️ Could not load dedupe_types.json:', e.message);
    }
  }
  // Map an extracted entity name to its canonical form via dedupeMappings.
  // Unmapped names (and falsy input) pass through unchanged.
  function applyDedupe(entityType, entityName) {
    if (!entityName) return entityName;
    const table = dedupeMappings[entityType];
    return (table && table[entityName]) || entityName;
  }
  // Lowercased canonical key for a document type, used to group documents
  // of the same type together. Returns null for falsy input.
  function normalizeDocType(docType) {
    if (!docType) return null;
    const raw = String(docType).trim();
    // Prefer the canonical spelling from the dedupe map when one exists.
    const canonical = typeDedupeMap[raw] || raw;
    return canonical.trim().toLowerCase();
  }
  // Display form of a document type: the canonical (already properly cased)
  // name from the dedupe map, or the trimmed input when unmapped.
  // Falsy input renders as 'Unknown'.
  function formatDocType(docType) {
    if (!docType) return 'Unknown';
    const raw = String(docType).trim();
    return typeDedupeMap[raw] || raw;
  }
  // Month-name lookup shared by the date parsers below (hoisted so it is not
  // rebuilt on every normalizeDate call, as the original did).
  const MONTH_NUMBERS = {
    jan: '01', january: '01',
    feb: '02', february: '02',
    mar: '03', march: '03',
    apr: '04', april: '04',
    may: '05',
    jun: '06', june: '06',
    jul: '07', july: '07',
    aug: '08', august: '08',
    sep: '09', september: '09',
    oct: '10', october: '10',
    nov: '11', november: '11',
    dec: '12', december: '12'
  };

  /**
   * Normalize a free-form date string to a sortable "YYYY-MM-DD" form.
   *
   * Handles: ISO "YYYY-MM-DD" (returned as-is), bare years ("YYYY" ->
   * "YYYY-00-00" so they sort alongside full dates), "Month D, YYYY",
   * "D Month YYYY", "YYYY/MM/DD", "YYYY.MM.DD", and US "MM/DD/YYYY" —
   * with a day/month swap when the first field cannot be a month (fixes
   * inputs like "15/02/2005", which previously produced the invalid
   * "2005-15-02"). Unparseable input is returned trimmed but otherwise
   * unchanged; falsy input yields null.
   */
  function normalizeDate(dateStr) {
    if (!dateStr) return null;
    const str = String(dateStr).trim();

    // Already ISO (YYYY-MM-DD).
    if (/^\d{4}-\d{2}-\d{2}$/.test(str)) {
      return str;
    }

    // Bare year: pad with "00" month/day placeholders.
    if (/^\d{4}$/.test(str)) {
      return `${str}-00-00`;
    }

    // "February 15, 2005" or "Feb 15, 2005" (comma optional).
    const monthFirst = str.match(/^(\w+)\s+(\d{1,2}),?\s+(\d{4})$/i);
    if (monthFirst) {
      const month = MONTH_NUMBERS[monthFirst[1].toLowerCase()];
      if (month) {
        return `${monthFirst[3]}-${month}-${monthFirst[2].padStart(2, '0')}`;
      }
    }

    // "15 February 2005" or "15 Feb 2005".
    const dayFirst = str.match(/^(\d{1,2})\s+(\w+)\s+(\d{4})$/i);
    if (dayFirst) {
      const month = MONTH_NUMBERS[dayFirst[2].toLowerCase()];
      if (month) {
        return `${dayFirst[3]}-${month}-${dayFirst[1].padStart(2, '0')}`;
      }
    }

    // "2005/02/15" or "2005.02.15" (year-first).
    const yearFirst = str.match(/^(\d{4})[\/\.](\d{1,2})[\/\.](\d{1,2})$/);
    if (yearFirst) {
      return `${yearFirst[1]}-${yearFirst[2].padStart(2, '0')}-${yearFirst[3].padStart(2, '0')}`;
    }

    // "02/15/2005" or "02.15.2005" (US month/day/year). If the first field
    // is an impossible month but the second is a valid one, treat the input
    // as day/month/year instead of emitting an invalid month.
    const usStyle = str.match(/^(\d{1,2})[\/\.](\d{1,2})[\/\.](\d{4})$/);
    if (usStyle) {
      let [, month, day, year] = usStyle;
      if (Number(month) > 12 && Number(day) <= 12) {
        [month, day] = [day, month];
      }
      return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
    }

    // Unrecognized format - hand back the trimmed original.
    return str;
  }
  /**
   * Render a normalized date ("YYYY-MM-DD") for display.
   *
   * "YYYY-00-00" (year-only sentinel from normalizeDate) renders as "YYYY";
   * "YYYY-MM-00" (month known, day unknown) renders as "Month YYYY" instead
   * of the original's nonsensical "Month 0, YYYY"; full dates render as
   * "Month D, YYYY". Anything else (including unparseable passthrough
   * values) is returned unchanged; falsy input yields 'Unknown Date'.
   */
  function formatDate(normalizedDate) {
    if (!normalizedDate) return 'Unknown Date';

    // Year-only sentinel.
    if (normalizedDate.endsWith('-00-00')) {
      return normalizedDate.substring(0, 4);
    }

    const match = normalizedDate.match(/^(\d{4})-(\d{2})-(\d{2})$/);
    if (match) {
      const monthNames = ['', 'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December'];
      const year = match[1];
      const month = parseInt(match[2], 10); // explicit radix
      const day = parseInt(match[3], 10);
      if (month > 0 && month <= 12) {
        // Day "00" means the source only specified month + year.
        return day > 0
          ? `${monthNames[month]} ${day}, ${year}`
          : `${monthNames[month]} ${year}`;
      }
    }

    // Not a recognized normalized form - show as-is.
    return normalizedDate;
  }
  // Cache for the grouped-documents computation: it walks the whole results
  // tree, so it is computed once per build and memoized here.
  let cachedDocuments = null;

  /**
   * Read every per-page JSON result under ./results and group the pages into
   * documents keyed by normalized document number. Each returned document
   * carries sorted pages, unioned + canonicalized entities, normalized
   * metadata, and a lazily-built full_text.
   */
  function getDocuments() {
    if (cachedDocuments) {
      return cachedDocuments;
    }
    const resultsDir = path.join(__dirname, './results');
    const pages = [];

    // Recursively collect page records from all *.json files under dir.
    function readDocuments(dir, relativePath = '') {
      const entries = fs.readdirSync(dir, { withFileTypes: true });
      for (const entry of entries) {
        const fullPath = path.join(dir, entry.name);
        const relPath = path.join(relativePath, entry.name);
        if (entry.isDirectory()) {
          readDocuments(fullPath, relPath);
        } else if (entry.name.endsWith('.json')) {
          try {
            const content = JSON.parse(fs.readFileSync(fullPath, 'utf8'));
            pages.push({
              path: relPath,
              filename: entry.name.replace('.json', ''),
              folder: relativePath || 'root',
              ...content
            });
          } catch (e) {
            // Skip unreadable/invalid JSON pages rather than failing the build.
            console.error(`Error reading ${fullPath}:`, e.message);
          }
        }
      }
    }

    // Guard: a missing results directory should yield an empty document set,
    // not an ENOENT crash (the original called readdirSync unconditionally).
    if (fs.existsSync(resultsDir)) {
      readDocuments(resultsDir);
    } else {
      console.warn(`⚠️ Results directory not found: ${resultsDir}`);
    }

    // Normalize document numbers so LLM formatting variations group together:
    // lowercase, hyphenate non-alphanumerics, collapse and trim hyphens.
    const normalizeDocNum = (docNum) => {
      if (!docNum) return null;
      return String(docNum)
        .toLowerCase()
        .replace(/[^a-z0-9-]/g, '-')
        .replace(/-+/g, '-')
        .replace(/^-+|-+$/g, '');
    };

    // Group pages by NORMALIZED document_number.
    const documentMap = new Map();
    pages.forEach(page => {
      const rawDocNum = page.document_metadata?.document_number;
      if (!rawDocNum) {
        // No document number at all: fall back to grouping by filename.
        console.warn(`Page ${page.filename} has no document_number, using filename as fallback`);
        const fallbackKey = normalizeDocNum(page.filename) || page.filename;
        if (!documentMap.has(fallbackKey)) {
          documentMap.set(fallbackKey, []);
        }
        documentMap.get(fallbackKey).push(page);
        return;
      }
      const normalizedDocNum = normalizeDocNum(rawDocNum);
      if (!documentMap.has(normalizedDocNum)) {
        documentMap.set(normalizedDocNum, []);
      }
      documentMap.get(normalizedDocNum).push(page);
    });

    // Build one document object per group.
    const documents = Array.from(documentMap.entries()).map(([normalizedDocNum, docPages]) => {
      // Order pages by page number (missing/unparseable sorts as 0).
      docPages.sort((a, b) => {
        const pageA = parseInt(a.document_metadata?.page_number, 10) || 0;
        const pageB = parseInt(b.document_metadata?.page_number, 10) || 0;
        return pageA - pageB;
      });

      // Union of entities across all pages of the document.
      const allEntities = {
        people: new Set(),
        organizations: new Set(),
        locations: new Set(),
        dates: new Set(),
        reference_numbers: new Set()
      };
      docPages.forEach(page => {
        if (page.entities) {
          Object.keys(allEntities).forEach(key => {
            if (page.entities[key]) {
              page.entities[key].forEach(item => allEntities[key].add(item));
            }
          });
        }
      });

      // Document-level metadata comes from the first (lowest-numbered) page.
      const firstPage = docPages[0];
      // All source folders this document's pages live in.
      const folders = [...new Set(docPages.map(p => p.folder))];
      // All raw document-number spellings encountered (for display).
      const rawDocNums = [...new Set(docPages.map(p => p.document_metadata?.document_number).filter(Boolean))];

      // Canonicalize entity names and render dates for display.
      const deduplicatedEntities = {
        people: [...new Set(Array.from(allEntities.people).map(p => applyDedupe('people', p)))],
        organizations: [...new Set(Array.from(allEntities.organizations).map(o => applyDedupe('organizations', o)))],
        locations: [...new Set(Array.from(allEntities.locations).map(l => applyDedupe('locations', l)))],
        dates: [...new Set(Array.from(allEntities.dates).map(d => {
          const normalized = normalizeDate(d);
          return normalized ? formatDate(normalized) : d;
        }))],
        reference_numbers: Array.from(allEntities.reference_numbers)
      };

      // Canonical document type and display-formatted date.
      const normalizedMetadata = {
        ...firstPage.document_metadata,
        document_type: firstPage.document_metadata?.document_type
          ? formatDocType(firstPage.document_metadata.document_type)
          : null,
        date: firstPage.document_metadata?.date
          ? formatDate(normalizeDate(firstPage.document_metadata.date))
          : firstPage.document_metadata?.date
      };

      // Shallow page copies; full_text stays a shared reference, not duplicated.
      const lightPages = docPages.map(p => ({ ...p }));

      const docData = {
        unique_id: normalizedDocNum, // normalized form, used for unique URLs
        document_number: rawDocNums.length === 1 ? rawDocNums[0] : normalizedDocNum, // original spelling when consistent
        raw_document_numbers: rawDocNums, // every spelling found in the data
        pages: lightPages,
        page_count: docPages.length,
        document_metadata: normalizedMetadata,
        entities: deduplicatedEntities,
        folder: folders.join(', '), // joined for display when spanning folders
        folders: folders // kept as an array for programmatic use
      };

      // Lazy full_text: concatenated page text, built on first access and
      // cached on the instance to save memory for list views.
      Object.defineProperty(docData, 'full_text', {
        get: function() {
          if (!this._full_text) {
            this._full_text = this.pages.map(p => p.full_text).join('\n\n--- PAGE BREAK ---\n\n');
          }
          return this._full_text;
        },
        enumerable: true
      });

      return docData;
    });

    cachedDocuments = documents;
    return documents;
  }
  // Per-document LLM analyses from analyses.json, with each document_type
  // rewritten to its canonical name when a type dedupe map is loaded.
  // Missing or unreadable file yields an empty list.
  eleventyConfig.addGlobalData("analyses", () => {
    const analysesFile = path.join(__dirname, 'analyses.json');
    if (!fs.existsSync(analysesFile)) {
      console.log('ℹ️ No analyses.json found - run analyze_documents.py to generate');
      return [];
    }
    try {
      const data = JSON.parse(fs.readFileSync(analysesFile, 'utf8'));
      const analyses = data.analyses || [];
      if (Object.keys(typeDedupeMap).length > 0) {
        for (const entry of analyses) {
          const original = entry.analysis?.document_type;
          if (original) {
            // In-place rewrite to the canonical type name.
            entry.analysis.document_type = typeDedupeMap[original] || original;
          }
        }
      }
      console.log(`✅ Loaded ${analyses.length} document analyses`);
      return analyses;
    } catch (e) {
      console.warn('⚠️ Could not load analyses.json:', e.message);
      return [];
    }
  });
  // Sorted list of unique canonical document types across all analyses,
  // used to populate filter controls. Empty when analyses.json is absent
  // or unreadable.
  eleventyConfig.addGlobalData("analysisDocumentTypes", () => {
    const analysesFile = path.join(__dirname, 'analyses.json');
    if (!fs.existsSync(analysesFile)) {
      return [];
    }
    try {
      const data = JSON.parse(fs.readFileSync(analysesFile, 'utf8'));
      const hasTypeMap = Object.keys(typeDedupeMap).length > 0;
      const typesSet = new Set();
      for (const entry of data.analyses || []) {
        const rawType = entry.analysis?.document_type;
        if (!rawType) continue;
        // Canonicalize only when a dedupe map was loaded.
        typesSet.add(hasTypeMap ? (typeDedupeMap[rawType] || rawType) : rawType);
      }
      const uniqueTypes = [...typesSet].sort();
      console.log(`✅ Found ${uniqueTypes.length} unique canonical document types for filters`);
      return uniqueTypes;
    } catch (e) {
      console.warn('⚠️ Could not load document types:', e.message);
      return [];
    }
  });
  // Expose the grouped documents to templates.
  eleventyConfig.addGlobalData("documents", getDocuments);

  // Entity/date/type indices: each index maps a canonical key to the list of
  // documents that mention it, as {name, docs, count} entries.
  eleventyConfig.addGlobalData("indices", () => {
    const documentsData = getDocuments();
    const people = new Map();
    const organizations = new Map();
    const locations = new Map();
    const dates = new Map();
    const documentTypes = new Map();

    // Append doc under key in the given index map.
    const addTo = (map, key, doc) => {
      if (!map.has(key)) map.set(key, []);
      map.get(key).push(doc);
    };

    documentsData.forEach(doc => {
      // People / organizations / locations, canonicalized via the dedupe map.
      // doc.entities were already deduped when the document was built, so
      // this re-apply is idempotent.
      (doc.entities?.people || []).forEach(person =>
        addTo(people, applyDedupe('people', person), doc));
      (doc.entities?.organizations || []).forEach(org =>
        addTo(organizations, applyDedupe('organizations', org), doc));
      (doc.entities?.locations || []).forEach(loc =>
        addTo(locations, applyDedupe('locations', loc), doc));

      // Dates, grouped by normalized (sortable) form.
      (doc.entities?.dates || []).forEach(date => {
        const normalized = normalizeDate(date);
        if (normalized) addTo(dates, normalized, doc);
      });

      // Document types, grouped by normalized (lowercased canonical) form.
      const docType = doc.document_metadata?.document_type;
      if (docType) {
        const normalized = normalizeDocType(docType);
        if (normalized) addTo(documentTypes, normalized, doc);
      }
    });

    // Remove duplicate references to the same document (a document can
    // mention the same entity on several pages).
    const dedupeDocArray = (docs) => {
      const seen = new Set();
      return docs.filter(doc => {
        if (seen.has(doc.unique_id)) return false;
        seen.add(doc.unique_id);
        return true;
      });
    };

    // Build index entries, deduping each doc list ONCE (the original called
    // dedupeDocArray twice per entry - once for docs, once for count).
    const toEntries = (map, makeEntry) =>
      Array.from(map.entries()).map(([key, docs]) => makeEntry(key, dedupeDocArray(docs)));
    const byCountDesc = (a, b) => b.count - a.count;

    return {
      people: toEntries(people, (name, docs) =>
        ({ name, docs, count: docs.length })).sort(byCountDesc),
      organizations: toEntries(organizations, (name, docs) =>
        ({ name, docs, count: docs.length })).sort(byCountDesc),
      locations: toEntries(locations, (name, docs) =>
        ({ name, docs, count: docs.length })).sort(byCountDesc),
      dates: toEntries(dates, (normalizedDate, docs) => ({
        name: formatDate(normalizedDate), // display form
        normalizedDate, // kept normalized for sorting
        docs,
        count: docs.length
        // Newest first: YYYY-MM-DD strings sort correctly lexicographically.
      })).sort((a, b) => b.normalizedDate.localeCompare(a.normalizedDate)),
      documentTypes: toEntries(documentTypes, (normalizedType, docs) => ({
        // NOTE(review): normalizedType is lowercased, so the typeDedupeMap
        // lookup inside formatDocType likely misses and the display name
        // stays lowercase - confirm this is intended.
        name: formatDocType(normalizedType),
        docs,
        count: docs.length
      })).sort(byCountDesc)
    };
  });
  440. return {
  441. dir: {
  442. input: "src",
  443. output: "_site",
  444. includes: "_includes"
  445. },
  446. pathPrefix: "/"
  447. };
  448. };