github-mirrors
/
mirror-epstein-docs


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
							const fs = require('fs');
const path = require('path');

module.exports = function(eleventyConfig) {
  // Copy results directory to output
  eleventyConfig.addPassthroughCopy({ "./results": "documents" });

  // Cache the documents data - only compute once
  let cachedDocuments = null;

  function getDocuments() {
    if (cachedDocuments) {
      return cachedDocuments;
    }
    const resultsDir = path.join(__dirname, './results');
    const pages = [];

    function readDocuments(dir, relativePath = '') {
      const entries = fs.readdirSync(dir, { withFileTypes: true });

      for (const entry of entries) {
        const fullPath = path.join(dir, entry.name);
        const relPath = path.join(relativePath, entry.name);

        if (entry.isDirectory()) {
          readDocuments(fullPath, relPath);
        } else if (entry.name.endsWith('.json')) {
          try {
            const content = JSON.parse(fs.readFileSync(fullPath, 'utf8'));
            pages.push({
              path: relPath,
              filename: entry.name.replace('.json', ''),
              folder: relativePath || 'root',
              ...content
            });
          } catch (e) {
            console.error(`Error reading ${fullPath}:`, e.message);
          }
        }
      }
    }

    readDocuments(resultsDir);

    // Normalize function to handle LLM inconsistencies in document numbers
    const normalizeDocNum = (docNum) => {
      if (!docNum) return null;
      // Convert to lowercase, remove all non-alphanumeric except hyphens, collapse multiple hyphens
      return String(docNum)
        .toLowerCase()
        .replace(/[^a-z0-9-]/g, '-')
        .replace(/-+/g, '-')
        .replace(/^-+|-+$/g, '');
    };

    // Group pages by NORMALIZED document_number to handle LLM variations
    const documentMap = new Map();

    pages.forEach(page => {
      // Use document_number from metadata to group pages of the same document
      const rawDocNum = page.document_metadata?.document_number;

      // Skip pages without a document number
      if (!rawDocNum) {
        console.warn(`Page ${page.filename} has no document_number, using filename as fallback`);
        const fallbackKey = normalizeDocNum(page.filename) || page.filename;
        if (!documentMap.has(fallbackKey)) {
          documentMap.set(fallbackKey, []);
        }
        documentMap.get(fallbackKey).push(page);
        return;
      }

      // Normalize the document number to group variants together
      const normalizedDocNum = normalizeDocNum(rawDocNum);

      if (!documentMap.has(normalizedDocNum)) {
        documentMap.set(normalizedDocNum, []);
      }
      documentMap.get(normalizedDocNum).push(page);
    });

    // Convert to array and sort pages within each document
    const documents = Array.from(documentMap.entries()).map(([normalizedDocNum, docPages]) => {

      // Sort pages by page number
      docPages.sort((a, b) => {
        const pageA = parseInt(a.document_metadata?.page_number) || 0;
        const pageB = parseInt(b.document_metadata?.page_number) || 0;
        return pageA - pageB;
      });

      // Combine all entities from all pages
      const allEntities = {
        people: new Set(),
        organizations: new Set(),
        locations: new Set(),
        dates: new Set(),
        reference_numbers: new Set()
      };

      docPages.forEach(page => {
        if (page.entities) {
          Object.keys(allEntities).forEach(key => {
            if (page.entities[key]) {
              page.entities[key].forEach(item => allEntities[key].add(item));
            }
          });
        }
      });

      // Get metadata from first page
      const firstPage = docPages[0];

      // Get all unique folders that contain pages of this document
      const folders = [...new Set(docPages.map(p => p.folder))];

      // Get all unique raw document numbers (for display)
      const rawDocNums = [...new Set(docPages.map(p => p.document_metadata?.document_number).filter(Boolean))];

      return {
        unique_id: normalizedDocNum,  // Normalized version for unique URLs
        document_number: rawDocNums.length === 1 ? rawDocNums[0] : normalizedDocNum, // Show original if consistent, else normalized
        raw_document_numbers: rawDocNums, // All variations found
        pages: docPages,
        page_count: docPages.length,
        document_metadata: firstPage.document_metadata,
        entities: {
          people: Array.from(allEntities.people),
          organizations: Array.from(allEntities.organizations),
          locations: Array.from(allEntities.locations),
          dates: Array.from(allEntities.dates),
          reference_numbers: Array.from(allEntities.reference_numbers)
        },
        full_text: docPages.map(p => p.full_text).join('\n\n--- PAGE BREAK ---\n\n'),
        folder: folders.join(', '),  // Show all folders if document spans multiple
        folders: folders  // Keep array for reference
      };
    });

    cachedDocuments = documents;
    return documents;
  }

  // Add global data - load all pages and group into documents
  eleventyConfig.addGlobalData("documents", getDocuments);

  // Build indices from grouped documents
  eleventyConfig.addGlobalData("indices", () => {
    const documentsData = getDocuments();

    const people = new Map();
    const organizations = new Map();
    const locations = new Map();
    const dates = new Map();
    const documentTypes = new Map();

    documentsData.forEach(doc => {
      // People
      if (doc.entities?.people) {
        doc.entities.people.forEach(person => {
          if (!people.has(person)) people.set(person, []);
          people.get(person).push(doc);
        });
      }

      // Organizations
      if (doc.entities?.organizations) {
        doc.entities.organizations.forEach(org => {
          if (!organizations.has(org)) organizations.set(org, []);
          organizations.get(org).push(doc);
        });
      }

      // Locations
      if (doc.entities?.locations) {
        doc.entities.locations.forEach(loc => {
          if (!locations.has(loc)) locations.set(loc, []);
          locations.get(loc).push(doc);
        });
      }

      // Dates
      if (doc.entities?.dates) {
        doc.entities.dates.forEach(date => {
          if (!dates.has(date)) dates.set(date, []);
          dates.get(date).push(doc);
        });
      }

      // Document types
      const docType = doc.document_metadata?.document_type;
      if (docType) {
        if (!documentTypes.has(docType)) documentTypes.set(docType, []);
        documentTypes.get(docType).push(doc);
      }
    });

    return {
      people: Array.from(people.entries()).map(([name, docs]) => ({ name, docs, count: docs.length })).sort((a, b) => b.count - a.count),
      organizations: Array.from(organizations.entries()).map(([name, docs]) => ({ name, docs, count: docs.length })).sort((a, b) => b.count - a.count),
      locations: Array.from(locations.entries()).map(([name, docs]) => ({ name, docs, count: docs.length })).sort((a, b) => b.count - a.count),
      dates: Array.from(dates.entries()).map(([name, docs]) => ({ name, docs, count: docs.length })).sort((a, b) => b.count - a.count),
      documentTypes: Array.from(documentTypes.entries()).map(([name, docs]) => ({ name, docs, count: docs.length })).sort((a, b) => b.count - a.count)
    };
  });

  return {
    dir: {
      input: "src",
      output: "_site",
      includes: "_includes"
    },
    pathPrefix: "/"
  };
};