// .eleventy.js — Eleventy configuration

  1. const fs = require('fs');
  2. const path = require('path');
  3. module.exports = function(eleventyConfig) {
  4. // Copy results directory to output
  5. eleventyConfig.addPassthroughCopy({ "./results": "documents" });
  6. // Load deduplication mappings if available
  7. let dedupeMappings = { people: {}, organizations: {}, locations: {} };
  8. const dedupeFile = path.join(__dirname, 'dedupe.json');
  9. if (fs.existsSync(dedupeFile)) {
  10. try {
  11. dedupeMappings = JSON.parse(fs.readFileSync(dedupeFile, 'utf8'));
  12. console.log('✅ Loaded deduplication mappings from dedupe.json');
  13. } catch (e) {
  14. console.warn('⚠️ Could not load dedupe.json:', e.message);
  15. }
  16. } else {
  17. console.log('ℹ️ No dedupe.json found - entities will not be deduplicated');
  18. }
  19. // Helper function to apply deduplication mapping
  20. function applyDedupe(entityType, entityName) {
  21. if (!entityName) return entityName;
  22. return dedupeMappings[entityType]?.[entityName] || entityName;
  23. }
  24. // Helper function to normalize document types (for grouping)
  25. function normalizeDocType(docType) {
  26. if (!docType) return null;
  27. return String(docType).toLowerCase().trim();
  28. }
  29. // Helper function to format document types for display (title case)
  30. function formatDocType(docType) {
  31. if (!docType) return 'Unknown';
  32. return String(docType)
  33. .toLowerCase()
  34. .trim()
  35. .split(' ')
  36. .map(word => word.charAt(0).toUpperCase() + word.slice(1))
  37. .join(' ');
  38. }
  39. // Helper function to normalize dates to consistent format
  40. function normalizeDate(dateStr) {
  41. if (!dateStr) return null;
  42. const str = String(dateStr).trim();
  43. // Already in ISO format (YYYY-MM-DD)
  44. if (/^\d{4}-\d{2}-\d{2}$/.test(str)) {
  45. return str;
  46. }
  47. // Just a year (YYYY)
  48. if (/^\d{4}$/.test(str)) {
  49. return `${str}-00-00`;
  50. }
  51. // Try to parse various date formats
  52. const months = {
  53. 'jan': '01', 'january': '01',
  54. 'feb': '02', 'february': '02',
  55. 'mar': '03', 'march': '03',
  56. 'apr': '04', 'april': '04',
  57. 'may': '05',
  58. 'jun': '06', 'june': '06',
  59. 'jul': '07', 'july': '07',
  60. 'aug': '08', 'august': '08',
  61. 'sep': '09', 'september': '09',
  62. 'oct': '10', 'october': '10',
  63. 'nov': '11', 'november': '11',
  64. 'dec': '12', 'december': '12'
  65. };
  66. // "February 15, 2005" or "Feb 15, 2005"
  67. const match1 = str.match(/^(\w+)\s+(\d{1,2}),?\s+(\d{4})$/i);
  68. if (match1) {
  69. const month = months[match1[1].toLowerCase()];
  70. if (month) {
  71. const day = match1[2].padStart(2, '0');
  72. return `${match1[3]}-${month}-${day}`;
  73. }
  74. }
  75. // "15 February 2005" or "15 Feb 2005"
  76. const match2 = str.match(/^(\d{1,2})\s+(\w+)\s+(\d{4})$/i);
  77. if (match2) {
  78. const month = months[match2[2].toLowerCase()];
  79. if (month) {
  80. const day = match2[1].padStart(2, '0');
  81. return `${match2[3]}-${month}-${day}`;
  82. }
  83. }
  84. // "2005/02/15" or "2005.02.15"
  85. const match3 = str.match(/^(\d{4})[\/\.](\d{1,2})[\/\.](\d{1,2})$/);
  86. if (match3) {
  87. const month = match3[2].padStart(2, '0');
  88. const day = match3[3].padStart(2, '0');
  89. return `${match3[1]}-${month}-${day}`;
  90. }
  91. // "02/15/2005" or "02.15.2005" (US format)
  92. const match4 = str.match(/^(\d{1,2})[\/\.](\d{1,2})[\/\.](\d{4})$/);
  93. if (match4) {
  94. const month = match4[1].padStart(2, '0');
  95. const day = match4[2].padStart(2, '0');
  96. return `${match4[3]}-${month}-${day}`;
  97. }
  98. // Couldn't parse - return original
  99. return str;
  100. }
  101. // Helper function to format dates for display
  102. function formatDate(normalizedDate) {
  103. if (!normalizedDate) return 'Unknown Date';
  104. // Year only (YYYY-00-00)
  105. if (normalizedDate.endsWith('-00-00')) {
  106. return normalizedDate.substring(0, 4);
  107. }
  108. // Full date (YYYY-MM-DD)
  109. const match = normalizedDate.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  110. if (match) {
  111. const months = ['', 'January', 'February', 'March', 'April', 'May', 'June',
  112. 'July', 'August', 'September', 'October', 'November', 'December'];
  113. const year = match[1];
  114. const month = parseInt(match[2]);
  115. const day = parseInt(match[3]);
  116. if (month > 0 && month <= 12) {
  117. return `${months[month]} ${day}, ${year}`;
  118. }
  119. }
  120. // Fallback
  121. return normalizedDate;
  122. }
  123. // Cache the documents data - only compute once
  124. let cachedDocuments = null;
  125. function getDocuments() {
  126. if (cachedDocuments) {
  127. return cachedDocuments;
  128. }
  129. const resultsDir = path.join(__dirname, './results');
  130. const pages = [];
  131. function readDocuments(dir, relativePath = '') {
  132. const entries = fs.readdirSync(dir, { withFileTypes: true });
  133. for (const entry of entries) {
  134. const fullPath = path.join(dir, entry.name);
  135. const relPath = path.join(relativePath, entry.name);
  136. if (entry.isDirectory()) {
  137. readDocuments(fullPath, relPath);
  138. } else if (entry.name.endsWith('.json')) {
  139. try {
  140. const content = JSON.parse(fs.readFileSync(fullPath, 'utf8'));
  141. pages.push({
  142. path: relPath,
  143. filename: entry.name.replace('.json', ''),
  144. folder: relativePath || 'root',
  145. ...content
  146. });
  147. } catch (e) {
  148. console.error(`Error reading ${fullPath}:`, e.message);
  149. }
  150. }
  151. }
  152. }
  153. readDocuments(resultsDir);
  154. // Normalize function to handle LLM inconsistencies in document numbers
  155. const normalizeDocNum = (docNum) => {
  156. if (!docNum) return null;
  157. // Convert to lowercase, remove all non-alphanumeric except hyphens, collapse multiple hyphens
  158. return String(docNum)
  159. .toLowerCase()
  160. .replace(/[^a-z0-9-]/g, '-')
  161. .replace(/-+/g, '-')
  162. .replace(/^-+|-+$/g, '');
  163. };
  164. // Group pages by NORMALIZED document_number to handle LLM variations
  165. const documentMap = new Map();
  166. pages.forEach(page => {
  167. // Use document_number from metadata to group pages of the same document
  168. const rawDocNum = page.document_metadata?.document_number;
  169. // Skip pages without a document number
  170. if (!rawDocNum) {
  171. console.warn(`Page ${page.filename} has no document_number, using filename as fallback`);
  172. const fallbackKey = normalizeDocNum(page.filename) || page.filename;
  173. if (!documentMap.has(fallbackKey)) {
  174. documentMap.set(fallbackKey, []);
  175. }
  176. documentMap.get(fallbackKey).push(page);
  177. return;
  178. }
  179. // Normalize the document number to group variants together
  180. const normalizedDocNum = normalizeDocNum(rawDocNum);
  181. if (!documentMap.has(normalizedDocNum)) {
  182. documentMap.set(normalizedDocNum, []);
  183. }
  184. documentMap.get(normalizedDocNum).push(page);
  185. });
  186. // Convert to array and sort pages within each document
  187. const documents = Array.from(documentMap.entries()).map(([normalizedDocNum, docPages]) => {
  188. // Sort pages by page number
  189. docPages.sort((a, b) => {
  190. const pageA = parseInt(a.document_metadata?.page_number) || 0;
  191. const pageB = parseInt(b.document_metadata?.page_number) || 0;
  192. return pageA - pageB;
  193. });
  194. // Combine all entities from all pages
  195. const allEntities = {
  196. people: new Set(),
  197. organizations: new Set(),
  198. locations: new Set(),
  199. dates: new Set(),
  200. reference_numbers: new Set()
  201. };
  202. docPages.forEach(page => {
  203. if (page.entities) {
  204. Object.keys(allEntities).forEach(key => {
  205. if (page.entities[key]) {
  206. page.entities[key].forEach(item => allEntities[key].add(item));
  207. }
  208. });
  209. }
  210. });
  211. // Get metadata from first page
  212. const firstPage = docPages[0];
  213. // Get all unique folders that contain pages of this document
  214. const folders = [...new Set(docPages.map(p => p.folder))];
  215. // Get all unique raw document numbers (for display)
  216. const rawDocNums = [...new Set(docPages.map(p => p.document_metadata?.document_number).filter(Boolean))];
  217. // Apply deduplication to document entities
  218. const deduplicatedEntities = {
  219. people: [...new Set(Array.from(allEntities.people).map(p => applyDedupe('people', p)))],
  220. organizations: [...new Set(Array.from(allEntities.organizations).map(o => applyDedupe('organizations', o)))],
  221. locations: [...new Set(Array.from(allEntities.locations).map(l => applyDedupe('locations', l)))],
  222. dates: [...new Set(Array.from(allEntities.dates).map(d => {
  223. const normalized = normalizeDate(d);
  224. return normalized ? formatDate(normalized) : d;
  225. }))],
  226. reference_numbers: Array.from(allEntities.reference_numbers)
  227. };
  228. // Normalize document metadata
  229. const normalizedMetadata = {
  230. ...firstPage.document_metadata,
  231. document_type: firstPage.document_metadata?.document_type
  232. ? formatDocType(firstPage.document_metadata.document_type)
  233. : null,
  234. date: firstPage.document_metadata?.date
  235. ? formatDate(normalizeDate(firstPage.document_metadata.date))
  236. : firstPage.document_metadata?.date
  237. };
  238. return {
  239. unique_id: normalizedDocNum, // Normalized version for unique URLs
  240. document_number: rawDocNums.length === 1 ? rawDocNums[0] : normalizedDocNum, // Show original if consistent, else normalized
  241. raw_document_numbers: rawDocNums, // All variations found
  242. pages: docPages,
  243. page_count: docPages.length,
  244. document_metadata: normalizedMetadata,
  245. entities: deduplicatedEntities,
  246. full_text: docPages.map(p => p.full_text).join('\n\n--- PAGE BREAK ---\n\n'),
  247. folder: folders.join(', '), // Show all folders if document spans multiple
  248. folders: folders // Keep array for reference
  249. };
  250. });
  251. cachedDocuments = documents;
  252. return documents;
  253. }
  254. // Load document analyses if available
  255. eleventyConfig.addGlobalData("analyses", () => {
  256. const analysesFile = path.join(__dirname, 'analyses.json');
  257. if (fs.existsSync(analysesFile)) {
  258. try {
  259. const data = JSON.parse(fs.readFileSync(analysesFile, 'utf8'));
  260. console.log(`✅ Loaded ${data.analyses?.length || 0} document analyses`);
  261. return data.analyses || [];
  262. } catch (e) {
  263. console.warn('⚠️ Could not load analyses.json:', e.message);
  264. return [];
  265. }
  266. }
  267. console.log('ℹ️ No analyses.json found - run analyze_documents.py to generate');
  268. return [];
  269. });
  270. // Add global data - load all pages and group into documents
  271. eleventyConfig.addGlobalData("documents", getDocuments);
  272. // Build indices from grouped documents
  273. eleventyConfig.addGlobalData("indices", () => {
  274. const documentsData = getDocuments();
  275. const people = new Map();
  276. const organizations = new Map();
  277. const locations = new Map();
  278. const dates = new Map();
  279. const documentTypes = new Map();
  280. documentsData.forEach(doc => {
  281. // People (with deduplication)
  282. if (doc.entities?.people) {
  283. doc.entities.people.forEach(person => {
  284. const canonicalName = applyDedupe('people', person);
  285. if (!people.has(canonicalName)) people.set(canonicalName, []);
  286. people.get(canonicalName).push(doc);
  287. });
  288. }
  289. // Organizations (with deduplication)
  290. if (doc.entities?.organizations) {
  291. doc.entities.organizations.forEach(org => {
  292. const canonicalName = applyDedupe('organizations', org);
  293. if (!organizations.has(canonicalName)) organizations.set(canonicalName, []);
  294. organizations.get(canonicalName).push(doc);
  295. });
  296. }
  297. // Locations (with deduplication)
  298. if (doc.entities?.locations) {
  299. doc.entities.locations.forEach(loc => {
  300. const canonicalName = applyDedupe('locations', loc);
  301. if (!locations.has(canonicalName)) locations.set(canonicalName, []);
  302. locations.get(canonicalName).push(doc);
  303. });
  304. }
  305. // Dates (normalize for grouping)
  306. if (doc.entities?.dates) {
  307. doc.entities.dates.forEach(date => {
  308. const normalized = normalizeDate(date);
  309. if (normalized) {
  310. if (!dates.has(normalized)) dates.set(normalized, []);
  311. dates.get(normalized).push(doc);
  312. }
  313. });
  314. }
  315. // Document types (normalize for grouping)
  316. const docType = doc.document_metadata?.document_type;
  317. if (docType) {
  318. const normalized = normalizeDocType(docType);
  319. if (normalized) {
  320. if (!documentTypes.has(normalized)) documentTypes.set(normalized, []);
  321. documentTypes.get(normalized).push(doc);
  322. }
  323. }
  324. });
  325. // Deduplicate document arrays (remove duplicate document references)
  326. const dedupeDocArray = (docs) => {
  327. const seen = new Set();
  328. return docs.filter(doc => {
  329. if (seen.has(doc.unique_id)) return false;
  330. seen.add(doc.unique_id);
  331. return true;
  332. });
  333. };
  334. return {
  335. people: Array.from(people.entries()).map(([name, docs]) => ({
  336. name,
  337. docs: dedupeDocArray(docs),
  338. count: dedupeDocArray(docs).length
  339. })).sort((a, b) => b.count - a.count),
  340. organizations: Array.from(organizations.entries()).map(([name, docs]) => ({
  341. name,
  342. docs: dedupeDocArray(docs),
  343. count: dedupeDocArray(docs).length
  344. })).sort((a, b) => b.count - a.count),
  345. locations: Array.from(locations.entries()).map(([name, docs]) => ({
  346. name,
  347. docs: dedupeDocArray(docs),
  348. count: dedupeDocArray(docs).length
  349. })).sort((a, b) => b.count - a.count),
  350. dates: Array.from(dates.entries()).map(([normalizedDate, docs]) => ({
  351. name: formatDate(normalizedDate), // Display formatted version
  352. normalizedDate, // Keep normalized for sorting
  353. docs: dedupeDocArray(docs),
  354. count: dedupeDocArray(docs).length
  355. })).sort((a, b) => {
  356. // Sort by normalized date (YYYY-MM-DD format sorts correctly)
  357. return b.normalizedDate.localeCompare(a.normalizedDate);
  358. }),
  359. documentTypes: Array.from(documentTypes.entries()).map(([normalizedType, docs]) => ({
  360. name: formatDocType(normalizedType), // Display formatted version
  361. docs: dedupeDocArray(docs),
  362. count: dedupeDocArray(docs).length
  363. })).sort((a, b) => b.count - a.count)
  364. };
  365. });
  366. return {
  367. dir: {
  368. input: "src",
  369. output: "_site",
  370. includes: "_includes"
  371. },
  372. pathPrefix: "/"
  373. };
  374. };