.eleventy.js 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. const fs = require('fs');
  2. const path = require('path');
  3. module.exports = function(eleventyConfig) {
  4. // Copy results directory to output
  5. eleventyConfig.addPassthroughCopy({ "./results": "documents" });
  6. // Cache the documents data - only compute once
  7. let cachedDocuments = null;
  8. function getDocuments() {
  9. if (cachedDocuments) {
  10. return cachedDocuments;
  11. }
  12. const resultsDir = path.join(__dirname, './results');
  13. const pages = [];
  14. function readDocuments(dir, relativePath = '') {
  15. const entries = fs.readdirSync(dir, { withFileTypes: true });
  16. for (const entry of entries) {
  17. const fullPath = path.join(dir, entry.name);
  18. const relPath = path.join(relativePath, entry.name);
  19. if (entry.isDirectory()) {
  20. readDocuments(fullPath, relPath);
  21. } else if (entry.name.endsWith('.json')) {
  22. try {
  23. const content = JSON.parse(fs.readFileSync(fullPath, 'utf8'));
  24. pages.push({
  25. path: relPath,
  26. filename: entry.name.replace('.json', ''),
  27. folder: relativePath || 'root',
  28. ...content
  29. });
  30. } catch (e) {
  31. console.error(`Error reading ${fullPath}:`, e.message);
  32. }
  33. }
  34. }
  35. }
  36. readDocuments(resultsDir);
  37. // Normalize function to handle LLM inconsistencies in document numbers
  38. const normalizeDocNum = (docNum) => {
  39. if (!docNum) return null;
  40. // Convert to lowercase, remove all non-alphanumeric except hyphens, collapse multiple hyphens
  41. return String(docNum)
  42. .toLowerCase()
  43. .replace(/[^a-z0-9-]/g, '-')
  44. .replace(/-+/g, '-')
  45. .replace(/^-+|-+$/g, '');
  46. };
  47. // Group pages by NORMALIZED document_number to handle LLM variations
  48. const documentMap = new Map();
  49. pages.forEach(page => {
  50. // Use document_number from metadata to group pages of the same document
  51. const rawDocNum = page.document_metadata?.document_number;
  52. // Skip pages without a document number
  53. if (!rawDocNum) {
  54. console.warn(`Page ${page.filename} has no document_number, using filename as fallback`);
  55. const fallbackKey = normalizeDocNum(page.filename) || page.filename;
  56. if (!documentMap.has(fallbackKey)) {
  57. documentMap.set(fallbackKey, []);
  58. }
  59. documentMap.get(fallbackKey).push(page);
  60. return;
  61. }
  62. // Normalize the document number to group variants together
  63. const normalizedDocNum = normalizeDocNum(rawDocNum);
  64. if (!documentMap.has(normalizedDocNum)) {
  65. documentMap.set(normalizedDocNum, []);
  66. }
  67. documentMap.get(normalizedDocNum).push(page);
  68. });
  69. // Convert to array and sort pages within each document
  70. const documents = Array.from(documentMap.entries()).map(([normalizedDocNum, docPages]) => {
  71. // Sort pages by page number
  72. docPages.sort((a, b) => {
  73. const pageA = parseInt(a.document_metadata?.page_number) || 0;
  74. const pageB = parseInt(b.document_metadata?.page_number) || 0;
  75. return pageA - pageB;
  76. });
  77. // Combine all entities from all pages
  78. const allEntities = {
  79. people: new Set(),
  80. organizations: new Set(),
  81. locations: new Set(),
  82. dates: new Set(),
  83. reference_numbers: new Set()
  84. };
  85. docPages.forEach(page => {
  86. if (page.entities) {
  87. Object.keys(allEntities).forEach(key => {
  88. if (page.entities[key]) {
  89. page.entities[key].forEach(item => allEntities[key].add(item));
  90. }
  91. });
  92. }
  93. });
  94. // Get metadata from first page
  95. const firstPage = docPages[0];
  96. // Get all unique folders that contain pages of this document
  97. const folders = [...new Set(docPages.map(p => p.folder))];
  98. // Get all unique raw document numbers (for display)
  99. const rawDocNums = [...new Set(docPages.map(p => p.document_metadata?.document_number).filter(Boolean))];
  100. return {
  101. unique_id: normalizedDocNum, // Normalized version for unique URLs
  102. document_number: rawDocNums.length === 1 ? rawDocNums[0] : normalizedDocNum, // Show original if consistent, else normalized
  103. raw_document_numbers: rawDocNums, // All variations found
  104. pages: docPages,
  105. page_count: docPages.length,
  106. document_metadata: firstPage.document_metadata,
  107. entities: {
  108. people: Array.from(allEntities.people),
  109. organizations: Array.from(allEntities.organizations),
  110. locations: Array.from(allEntities.locations),
  111. dates: Array.from(allEntities.dates),
  112. reference_numbers: Array.from(allEntities.reference_numbers)
  113. },
  114. full_text: docPages.map(p => p.full_text).join('\n\n--- PAGE BREAK ---\n\n'),
  115. folder: folders.join(', '), // Show all folders if document spans multiple
  116. folders: folders // Keep array for reference
  117. };
  118. });
  119. cachedDocuments = documents;
  120. return documents;
  121. }
  122. // Add global data - load all pages and group into documents
  123. eleventyConfig.addGlobalData("documents", getDocuments);
  124. // Build indices from grouped documents
  125. eleventyConfig.addGlobalData("indices", () => {
  126. const documentsData = getDocuments();
  127. const people = new Map();
  128. const organizations = new Map();
  129. const locations = new Map();
  130. const dates = new Map();
  131. const documentTypes = new Map();
  132. documentsData.forEach(doc => {
  133. // People
  134. if (doc.entities?.people) {
  135. doc.entities.people.forEach(person => {
  136. if (!people.has(person)) people.set(person, []);
  137. people.get(person).push(doc);
  138. });
  139. }
  140. // Organizations
  141. if (doc.entities?.organizations) {
  142. doc.entities.organizations.forEach(org => {
  143. if (!organizations.has(org)) organizations.set(org, []);
  144. organizations.get(org).push(doc);
  145. });
  146. }
  147. // Locations
  148. if (doc.entities?.locations) {
  149. doc.entities.locations.forEach(loc => {
  150. if (!locations.has(loc)) locations.set(loc, []);
  151. locations.get(loc).push(doc);
  152. });
  153. }
  154. // Dates
  155. if (doc.entities?.dates) {
  156. doc.entities.dates.forEach(date => {
  157. if (!dates.has(date)) dates.set(date, []);
  158. dates.get(date).push(doc);
  159. });
  160. }
  161. // Document types
  162. const docType = doc.document_metadata?.document_type;
  163. if (docType) {
  164. if (!documentTypes.has(docType)) documentTypes.set(docType, []);
  165. documentTypes.get(docType).push(doc);
  166. }
  167. });
  168. return {
  169. people: Array.from(people.entries()).map(([name, docs]) => ({ name, docs, count: docs.length })).sort((a, b) => b.count - a.count),
  170. organizations: Array.from(organizations.entries()).map(([name, docs]) => ({ name, docs, count: docs.length })).sort((a, b) => b.count - a.count),
  171. locations: Array.from(locations.entries()).map(([name, docs]) => ({ name, docs, count: docs.length })).sort((a, b) => b.count - a.count),
  172. dates: Array.from(dates.entries()).map(([name, docs]) => ({ name, docs, count: docs.length })).sort((a, b) => b.count - a.count),
  173. documentTypes: Array.from(documentTypes.entries()).map(([name, docs]) => ({ name, docs, count: docs.length })).sort((a, b) => b.count - a.count)
  174. };
  175. });
  176. return {
  177. dir: {
  178. input: "src",
  179. output: "_site",
  180. includes: "_includes"
  181. },
  182. pathPrefix: "/"
  183. };
  184. };