// .eleventy.js (9.9 KB) — Eleventy configuration
  1. const fs = require('fs');
  2. const path = require('path');
  3. module.exports = function(eleventyConfig) {
  4. // Copy results directory to output
  5. eleventyConfig.addPassthroughCopy({ "./results": "documents" });
  6. // Load deduplication mappings if available
  7. let dedupeMappings = { people: {}, organizations: {}, locations: {} };
  8. const dedupeFile = path.join(__dirname, 'dedupe.json');
  9. if (fs.existsSync(dedupeFile)) {
  10. try {
  11. dedupeMappings = JSON.parse(fs.readFileSync(dedupeFile, 'utf8'));
  12. console.log('✅ Loaded deduplication mappings from dedupe.json');
  13. } catch (e) {
  14. console.warn('⚠️ Could not load dedupe.json:', e.message);
  15. }
  16. } else {
  17. console.log('ℹ️ No dedupe.json found - entities will not be deduplicated');
  18. }
  /**
   * Resolve an entity name to its canonical form via the dedupe mappings.
   * Falls back to the original name when no mapping exists for the
   * entity type or the specific name; empty/null names pass through.
   */
  function applyDedupe(entityType, entityName) {
    if (!entityName) {
      return entityName;
    }
    const canonical = dedupeMappings[entityType]?.[entityName];
    return canonical || entityName;
  }
  // Cache for the grouped-document list so the filesystem scan and grouping
  // run at most once per build, however many consumers call getDocuments().
  let cachedDocuments = null;

  /**
   * Read every page-level JSON file under ./results, group pages into
   * documents by normalized document_number, merge entities across pages,
   * and return the resulting document array (cached after the first call).
   */
  function getDocuments() {
    if (cachedDocuments) {
      return cachedDocuments;
    }

    const resultsDir = path.join(__dirname, './results');
    const pages = [];

    // Recursively collect page records from JSON files under `dir`.
    function readDocuments(dir, relativePath = '') {
      const entries = fs.readdirSync(dir, { withFileTypes: true });
      for (const entry of entries) {
        const fullPath = path.join(dir, entry.name);
        const relPath = path.join(relativePath, entry.name);
        if (entry.isDirectory()) {
          readDocuments(fullPath, relPath);
        } else if (entry.name.endsWith('.json')) {
          try {
            const content = JSON.parse(fs.readFileSync(fullPath, 'utf8'));
            pages.push({
              path: relPath,
              // Strip only the trailing extension; the previous
              // replace('.json', '') would also mangle a filename that
              // contains ".json" in the middle of the name.
              filename: entry.name.slice(0, -'.json'.length),
              folder: relativePath || 'root',
              ...content
            });
          } catch (e) {
            console.error(`Error reading ${fullPath}:`, e.message);
          }
        }
      }
    }

    // Guard against a missing results directory instead of throwing mid-build.
    if (fs.existsSync(resultsDir)) {
      readDocuments(resultsDir);
    } else {
      console.warn(`⚠️ Results directory not found: ${resultsDir}`);
    }

    // Normalize document numbers to absorb LLM inconsistencies: lowercase,
    // non-alphanumerics become hyphens, hyphen runs collapse, edges trimmed.
    const normalizeDocNum = (docNum) => {
      if (!docNum) return null;
      return String(docNum)
        .toLowerCase()
        .replace(/[^a-z0-9-]/g, '-')
        .replace(/-+/g, '-')
        .replace(/^-+|-+$/g, '');
    };

    // Group pages by NORMALIZED document_number so spelling variants of the
    // same number land in one bucket; pages without a number fall back to
    // their filename as the grouping key.
    const documentMap = new Map();
    pages.forEach(page => {
      const rawDocNum = page.document_metadata?.document_number;
      if (!rawDocNum) {
        console.warn(`Page ${page.filename} has no document_number, using filename as fallback`);
        const fallbackKey = normalizeDocNum(page.filename) || page.filename;
        if (!documentMap.has(fallbackKey)) {
          documentMap.set(fallbackKey, []);
        }
        documentMap.get(fallbackKey).push(page);
        return;
      }
      const normalizedDocNum = normalizeDocNum(rawDocNum);
      if (!documentMap.has(normalizedDocNum)) {
        documentMap.set(normalizedDocNum, []);
      }
      documentMap.get(normalizedDocNum).push(page);
    });

    // Convert buckets into document objects with sorted pages and merged entities.
    const documents = Array.from(documentMap.entries()).map(([normalizedDocNum, docPages]) => {
      // Sort pages numerically by page_number (explicit radix 10; bare
      // parseInt is fragile for strings with leading zeros/prefixes).
      docPages.sort((a, b) => {
        const pageA = parseInt(a.document_metadata?.page_number, 10) || 0;
        const pageB = parseInt(b.document_metadata?.page_number, 10) || 0;
        return pageA - pageB;
      });

      // Union of extracted entities across all pages of the document.
      const allEntities = {
        people: new Set(),
        organizations: new Set(),
        locations: new Set(),
        dates: new Set(),
        reference_numbers: new Set()
      };
      docPages.forEach(page => {
        if (page.entities) {
          Object.keys(allEntities).forEach(key => {
            if (page.entities[key]) {
              page.entities[key].forEach(item => allEntities[key].add(item));
            }
          });
        }
      });

      // Document-level metadata comes from the first (lowest-numbered) page.
      const firstPage = docPages[0];
      // All folders that contain pages of this document (may span several).
      const folders = [...new Set(docPages.map(p => p.folder))];
      // All raw document-number spellings seen for this document (for display).
      const rawDocNums = [...new Set(docPages.map(p => p.document_metadata?.document_number).filter(Boolean))];

      // Canonicalize people/organizations/locations via the dedupe mappings;
      // dates and reference numbers are kept verbatim.
      const deduplicatedEntities = {
        people: [...new Set(Array.from(allEntities.people).map(p => applyDedupe('people', p)))],
        organizations: [...new Set(Array.from(allEntities.organizations).map(o => applyDedupe('organizations', o)))],
        locations: [...new Set(Array.from(allEntities.locations).map(l => applyDedupe('locations', l)))],
        dates: Array.from(allEntities.dates),
        reference_numbers: Array.from(allEntities.reference_numbers)
      };

      return {
        unique_id: normalizedDocNum, // Normalized version for unique URLs
        document_number: rawDocNums.length === 1 ? rawDocNums[0] : normalizedDocNum, // Original if consistent, else normalized
        raw_document_numbers: rawDocNums, // All variations found
        pages: docPages,
        page_count: docPages.length,
        document_metadata: firstPage.document_metadata,
        entities: deduplicatedEntities,
        full_text: docPages.map(p => p.full_text).join('\n\n--- PAGE BREAK ---\n\n'),
        folder: folders.join(', '), // Show all folders if document spans multiple
        folders: folders // Keep array for reference
      };
    });

    cachedDocuments = documents;
    return documents;
  }
  // Expose per-document analyses (analyses.json) as global template data;
  // returns an empty list when the file is missing or unreadable.
  eleventyConfig.addGlobalData("analyses", () => {
    const analysesFile = path.join(__dirname, 'analyses.json');
    if (!fs.existsSync(analysesFile)) {
      console.log('ℹ️ No analyses.json found - run analyze_documents.py to generate');
      return [];
    }
    try {
      const data = JSON.parse(fs.readFileSync(analysesFile, 'utf8'));
      console.log(`✅ Loaded ${data.analyses?.length || 0} document analyses`);
      return data.analyses || [];
    } catch (e) {
      console.warn('⚠️ Could not load analyses.json:', e.message);
      return [];
    }
  });

  // All pages grouped into documents, available to templates as `documents`.
  eleventyConfig.addGlobalData("documents", getDocuments);
  /**
   * Build reverse indices (entity name -> documents that mention it) from
   * the grouped documents, exposed to templates as `indices`. Each index is
   * an array of { name, docs, count } sorted by count descending.
   */
  eleventyConfig.addGlobalData("indices", () => {
    const documentsData = getDocuments();

    const people = new Map();
    const organizations = new Map();
    const locations = new Map();
    const dates = new Map();
    const documentTypes = new Map();

    // Append `doc` to the bucket for `key` in `index`.
    const addTo = (index, key, doc) => {
      if (!index.has(key)) index.set(key, []);
      index.get(key).push(doc);
    };

    documentsData.forEach(doc => {
      // People / organizations / locations are canonicalized through the
      // dedupe mappings so name variants share one bucket.
      (doc.entities?.people || []).forEach(person =>
        addTo(people, applyDedupe('people', person), doc));
      (doc.entities?.organizations || []).forEach(org =>
        addTo(organizations, applyDedupe('organizations', org), doc));
      (doc.entities?.locations || []).forEach(loc =>
        addTo(locations, applyDedupe('locations', loc), doc));
      (doc.entities?.dates || []).forEach(date =>
        addTo(dates, date, doc));

      const docType = doc.document_metadata?.document_type;
      if (docType) addTo(documentTypes, docType, doc);
    });

    // Remove duplicate references to the same document within one bucket.
    const dedupeDocArray = (docs) => {
      const seen = new Set();
      return docs.filter(doc => {
        if (seen.has(doc.unique_id)) return false;
        seen.add(doc.unique_id);
        return true;
      });
    };

    // Convert a Map index into a sorted array of { name, docs, count }.
    // The deduped array is computed once per entry (the previous version
    // ran dedupeDocArray twice - once for docs, again for count).
    const toIndex = (index) =>
      Array.from(index.entries())
        .map(([name, docs]) => {
          const uniqueDocs = dedupeDocArray(docs);
          return { name, docs: uniqueDocs, count: uniqueDocs.length };
        })
        .sort((a, b) => b.count - a.count);

    return {
      people: toIndex(people),
      organizations: toIndex(organizations),
      locations: toIndex(locations),
      dates: toIndex(dates),
      documentTypes: toIndex(documentTypes)
    };
  });
  244. return {
  245. dir: {
  246. input: "src",
  247. output: "_site",
  248. includes: "_includes"
  249. },
  250. pathPrefix: "/"
  251. };
  252. };