const fs = require('fs'); const path = require('path'); module.exports = function(eleventyConfig) { // Copy results directory to output eleventyConfig.addPassthroughCopy({ "./results": "documents" }); // Load deduplication mappings if available let dedupeMappings = { people: {}, organizations: {}, locations: {} }; const dedupeFile = path.join(__dirname, 'dedupe.json'); if (fs.existsSync(dedupeFile)) { try { dedupeMappings = JSON.parse(fs.readFileSync(dedupeFile, 'utf8')); console.log('✅ Loaded deduplication mappings from dedupe.json'); } catch (e) { console.warn('⚠️ Could not load dedupe.json:', e.message); } } else { console.log('ℹ️ No dedupe.json found - entities will not be deduplicated'); } // Load document type deduplication mappings if available let typeDedupeMap = {}; const typeDedupeFile = path.join(__dirname, 'dedupe_types.json'); if (fs.existsSync(typeDedupeFile)) { try { const data = JSON.parse(fs.readFileSync(typeDedupeFile, 'utf8')); typeDedupeMap = data.mappings || {}; console.log('✅ Loaded document type mappings from dedupe_types.json'); } catch (e) { console.warn('⚠️ Could not load dedupe_types.json:', e.message); } } else { console.log('ℹ️ No dedupe_types.json found - document types will not be deduplicated'); } // Helper function to apply deduplication mapping function applyDedupe(entityType, entityName) { if (!entityName) return entityName; return dedupeMappings[entityType]?.[entityName] || entityName; } // Helper function to normalize document types (for grouping) function normalizeDocType(docType) { if (!docType) return null; const trimmed = String(docType).trim(); // Apply deduplication mapping if available const canonical = typeDedupeMap[trimmed] || trimmed; return canonical.toLowerCase().trim(); } // Helper function to format document types for display (title case) function formatDocType(docType) { if (!docType) return 'Unknown'; const trimmed = String(docType).trim(); // Apply deduplication mapping if available const canonical = typeDedupeMap[trimmed] || trimmed; // Return the canonical name (already in proper case from dedupe script) return canonical; } // Helper function to normalize dates to consistent format function normalizeDate(dateStr) { if (!dateStr) return null; const str = String(dateStr).trim(); // Already in ISO format (YYYY-MM-DD) if (/^\d{4}-\d{2}-\d{2}$/.test(str)) { return str; } // Just a year (YYYY) if (/^\d{4}$/.test(str)) { return `${str}-00-00`; } // Try to parse various date formats const months = { 'jan': '01', 'january': '01', 'feb': '02', 'february': '02', 'mar': '03', 'march': '03', 'apr': '04', 'april': '04', 'may': '05', 'jun': '06', 'june': '06', 'jul': '07', 'july': '07', 'aug': '08', 'august': '08', 'sep': '09', 'september': '09', 'oct': '10', 'october': '10', 'nov': '11', 'november': '11', 'dec': '12', 'december': '12' }; // "February 15, 2005" or "Feb 15, 2005" const match1 = str.match(/^(\w+)\s+(\d{1,2}),?\s+(\d{4})$/i); if (match1) { const month = months[match1[1].toLowerCase()]; if (month) { const day = match1[2].padStart(2, '0'); return `${match1[3]}-${month}-${day}`; } } // "15 February 2005" or "15 Feb 2005" const match2 = str.match(/^(\d{1,2})\s+(\w+)\s+(\d{4})$/i); if (match2) { const month = months[match2[2].toLowerCase()]; if (month) { const day = match2[1].padStart(2, '0'); return `${match2[3]}-${month}-${day}`; } } // "2005/02/15" or "2005.02.15" const match3 = str.match(/^(\d{4})[\/\.](\d{1,2})[\/\.](\d{1,2})$/); if (match3) { const month = match3[2].padStart(2, '0'); const day = match3[3].padStart(2, '0'); return `${match3[1]}-${month}-${day}`; } // "02/15/2005" or "02.15.2005" (US format) const match4 = str.match(/^(\d{1,2})[\/\.](\d{1,2})[\/\.](\d{4})$/); if (match4) { const month = match4[1].padStart(2, '0'); const day = match4[2].padStart(2, '0'); return `${match4[3]}-${month}-${day}`; } // Couldn't parse - return original return str; } // Helper function to format dates for display function formatDate(normalizedDate) { if (!normalizedDate) return 'Unknown Date'; // Year only (YYYY-00-00) if (normalizedDate.endsWith('-00-00')) { return normalizedDate.substring(0, 4); } // Full date (YYYY-MM-DD) const match = normalizedDate.match(/^(\d{4})-(\d{2})-(\d{2})$/); if (match) { const months = ['', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']; const year = match[1]; const month = parseInt(match[2]); const day = parseInt(match[3]); if (month > 0 && month <= 12) { return `${months[month]} ${day}, ${year}`; } } // Fallback return normalizedDate; } // Cache the documents data - only compute once let cachedDocuments = null; function getDocuments() { if (cachedDocuments) { return cachedDocuments; } const resultsDir = path.join(__dirname, './results'); const pages = []; function readDocuments(dir, relativePath = '') { const entries = fs.readdirSync(dir, { withFileTypes: true }); for (const entry of entries) { const fullPath = path.join(dir, entry.name); const relPath = path.join(relativePath, entry.name); if (entry.isDirectory()) { readDocuments(fullPath, relPath); } else if (entry.name.endsWith('.json')) { try { const content = JSON.parse(fs.readFileSync(fullPath, 'utf8')); pages.push({ path: relPath, filename: entry.name.replace('.json', ''), folder: relativePath || 'root', ...content }); } catch (e) { console.error(`Error reading ${fullPath}:`, e.message); } } } } readDocuments(resultsDir); // Normalize function to handle LLM inconsistencies in document numbers const normalizeDocNum = (docNum) => { if (!docNum) return null; // Convert to lowercase, remove all non-alphanumeric except hyphens, collapse multiple hyphens return String(docNum) .toLowerCase() .replace(/[^a-z0-9-]/g, '-') .replace(/-+/g, '-') .replace(/^-+|-+$/g, ''); }; // Group pages by NORMALIZED document_number to handle LLM variations const documentMap = new Map(); pages.forEach(page => { // Use document_number from metadata to group pages of the same document const rawDocNum = page.document_metadata?.document_number; // Skip pages without a document number if (!rawDocNum) { console.warn(`Page ${page.filename} has no document_number, using filename as fallback`); const fallbackKey = normalizeDocNum(page.filename) || page.filename; if (!documentMap.has(fallbackKey)) { documentMap.set(fallbackKey, []); } documentMap.get(fallbackKey).push(page); return; } // Normalize the document number to group variants together const normalizedDocNum = normalizeDocNum(rawDocNum); if (!documentMap.has(normalizedDocNum)) { documentMap.set(normalizedDocNum, []); } documentMap.get(normalizedDocNum).push(page); }); // Convert to array and sort pages within each document const documents = Array.from(documentMap.entries()).map(([normalizedDocNum, docPages]) => { // Sort pages by page number docPages.sort((a, b) => { const pageA = parseInt(a.document_metadata?.page_number) || 0; const pageB = parseInt(b.document_metadata?.page_number) || 0; return pageA - pageB; }); // Combine all entities from all pages const allEntities = { people: new Set(), organizations: new Set(), locations: new Set(), dates: new Set(), reference_numbers: new Set() }; docPages.forEach(page => { if (page.entities) { Object.keys(allEntities).forEach(key => { if (page.entities[key]) { page.entities[key].forEach(item => allEntities[key].add(item)); } }); } }); // Get metadata from first page const firstPage = docPages[0]; // Get all unique folders that contain pages of this document const folders = [...new Set(docPages.map(p => p.folder))]; // Get all unique raw document numbers (for display) const rawDocNums = [...new Set(docPages.map(p => p.document_metadata?.document_number).filter(Boolean))]; // Apply deduplication to document entities const deduplicatedEntities = { people: [...new Set(Array.from(allEntities.people).map(p => applyDedupe('people', p)))], organizations: [...new Set(Array.from(allEntities.organizations).map(o => applyDedupe('organizations', o)))], locations: [...new Set(Array.from(allEntities.locations).map(l => applyDedupe('locations', l)))], dates: [...new Set(Array.from(allEntities.dates).map(d => { const normalized = normalizeDate(d); return normalized ? formatDate(normalized) : d; }))], reference_numbers: Array.from(allEntities.reference_numbers) }; // Normalize document metadata const normalizedMetadata = { ...firstPage.document_metadata, document_type: firstPage.document_metadata?.document_type ? formatDocType(firstPage.document_metadata.document_type) : null, date: firstPage.document_metadata?.date ? formatDate(normalizeDate(firstPage.document_metadata.date)) : firstPage.document_metadata?.date }; // Create lightweight pages array (keep full_text but make them lazy) const lightPages = docPages.map(p => { const lightPage = { ...p }; // Keep full_text reference for document rendering, but it won't be duplicated return lightPage; }); // Only include full_text when needed (for individual document pages) // For the main documents array, we skip it to save memory const docData = { unique_id: normalizedDocNum, // Normalized version for unique URLs document_number: rawDocNums.length === 1 ? rawDocNums[0] : normalizedDocNum, // Show original if consistent, else normalized raw_document_numbers: rawDocNums, // All variations found pages: lightPages, page_count: docPages.length, document_metadata: normalizedMetadata, entities: deduplicatedEntities, folder: folders.join(', '), // Show all folders if document spans multiple folders: folders // Keep array for reference }; // Add full_text getter that loads on demand Object.defineProperty(docData, 'full_text', { get: function() { if (!this._full_text) { this._full_text = this.pages.map(p => p.full_text).join('\n\n--- PAGE BREAK ---\n\n'); } return this._full_text; }, enumerable: true }); return docData; }); cachedDocuments = documents; return documents; } // Load document analyses if available eleventyConfig.addGlobalData("analyses", () => { const analysesFile = path.join(__dirname, 'analyses.json'); if (fs.existsSync(analysesFile)) { try { const data = JSON.parse(fs.readFileSync(analysesFile, 'utf8')); const analyses = data.analyses || []; // Apply document type deduplication to analyses if (Object.keys(typeDedupeMap).length > 0) { analyses.forEach(analysis => { if (analysis.analysis?.document_type) { const original = analysis.analysis.document_type; const canonical = typeDedupeMap[original] || original; analysis.analysis.document_type = canonical; } }); } console.log(`✅ Loaded ${analyses.length} document analyses`); return analyses; } catch (e) { console.warn('⚠️ Could not load analyses.json:', e.message); return []; } } console.log('ℹ️ No analyses.json found - run analyze_documents.py to generate'); return []; }); // Get unique canonical document types from analyses eleventyConfig.addGlobalData("analysisDocumentTypes", () => { const analysesFile = path.join(__dirname, 'analyses.json'); if (!fs.existsSync(analysesFile)) { return []; } try { const data = JSON.parse(fs.readFileSync(analysesFile, 'utf8')); const analyses = data.analyses || []; // Collect unique canonical types const typesSet = new Set(); analyses.forEach(analysis => { if (analysis.analysis?.document_type) { let docType = analysis.analysis.document_type; // Apply deduplication if available if (Object.keys(typeDedupeMap).length > 0) { docType = typeDedupeMap[docType] || docType; } typesSet.add(docType); } }); const uniqueTypes = Array.from(typesSet).sort(); console.log(`✅ Found ${uniqueTypes.length} unique canonical document types for filters`); return uniqueTypes; } catch (e) { console.warn('⚠️ Could not load document types:', e.message); return []; } }); // Add global data - load all pages and group into documents eleventyConfig.addGlobalData("documents", getDocuments); // Build indices from grouped documents eleventyConfig.addGlobalData("indices", () => { const documentsData = getDocuments(); const people = new Map(); const organizations = new Map(); const locations = new Map(); const dates = new Map(); const documentTypes = new Map(); documentsData.forEach(doc => { // People (with deduplication) if (doc.entities?.people) { doc.entities.people.forEach(person => { const canonicalName = applyDedupe('people', person); if (!people.has(canonicalName)) people.set(canonicalName, []); people.get(canonicalName).push(doc); }); } // Organizations (with deduplication) if (doc.entities?.organizations) { doc.entities.organizations.forEach(org => { const canonicalName = applyDedupe('organizations', org); if (!organizations.has(canonicalName)) organizations.set(canonicalName, []); organizations.get(canonicalName).push(doc); }); } // Locations (with deduplication) if (doc.entities?.locations) { doc.entities.locations.forEach(loc => { const canonicalName = applyDedupe('locations', loc); if (!locations.has(canonicalName)) locations.set(canonicalName, []); locations.get(canonicalName).push(doc); }); } // Dates (normalize for grouping) if (doc.entities?.dates) { doc.entities.dates.forEach(date => { const normalized = normalizeDate(date); if (normalized) { if (!dates.has(normalized)) dates.set(normalized, []); dates.get(normalized).push(doc); } }); } // Document types (normalize for grouping) const docType = doc.document_metadata?.document_type; if (docType) { const normalized = normalizeDocType(docType); if (normalized) { if (!documentTypes.has(normalized)) documentTypes.set(normalized, []); documentTypes.get(normalized).push(doc); } } }); // Deduplicate document arrays (remove duplicate document references) const dedupeDocArray = (docs) => { const seen = new Set(); return docs.filter(doc => { if (seen.has(doc.unique_id)) return false; seen.add(doc.unique_id); return true; }); }; return { people: Array.from(people.entries()).map(([name, docs]) => ({ name, docs: dedupeDocArray(docs), count: dedupeDocArray(docs).length })).sort((a, b) => b.count - a.count), organizations: Array.from(organizations.entries()).map(([name, docs]) => ({ name, docs: dedupeDocArray(docs), count: dedupeDocArray(docs).length })).sort((a, b) => b.count - a.count), locations: Array.from(locations.entries()).map(([name, docs]) => ({ name, docs: dedupeDocArray(docs), count: dedupeDocArray(docs).length })).sort((a, b) => b.count - a.count), dates: Array.from(dates.entries()).map(([normalizedDate, docs]) => ({ name: formatDate(normalizedDate), // Display formatted version normalizedDate, // Keep normalized for sorting docs: dedupeDocArray(docs), count: dedupeDocArray(docs).length })).sort((a, b) => { // Sort by normalized date (YYYY-MM-DD format sorts correctly) return b.normalizedDate.localeCompare(a.normalizedDate); }), documentTypes: Array.from(documentTypes.entries()).map(([normalizedType, docs]) => ({ name: formatDocType(normalizedType), // Display formatted version docs: dedupeDocArray(docs), count: dedupeDocArray(docs).length })).sort((a, b) => b.count - a.count) }; }); return { dir: { input: "src", output: "_site", includes: "_includes" }, pathPrefix: "/" }; };