github-mirrors
/
mirror-epstein-docs


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519
							const fs = require('fs');
const path = require('path');

module.exports = function(eleventyConfig) {
  // Copy results directory to output
  eleventyConfig.addPassthroughCopy({ "./results": "documents" });

  // Load deduplication mappings if available
  let dedupeMappings = { people: {}, organizations: {}, locations: {} };
  const dedupeFile = path.join(__dirname, 'dedupe.json');
  if (fs.existsSync(dedupeFile)) {
    try {
      dedupeMappings = JSON.parse(fs.readFileSync(dedupeFile, 'utf8'));
      console.log('✅ Loaded deduplication mappings from dedupe.json');
    } catch (e) {
      console.warn('⚠️  Could not load dedupe.json:', e.message);
    }
  } else {
    console.log('ℹ️  No dedupe.json found - entities will not be deduplicated');
  }

  // Load document type deduplication mappings if available
  let typeDedupeMap = {};
  const typeDedupeFile = path.join(__dirname, 'dedupe_types.json');
  if (fs.existsSync(typeDedupeFile)) {
    try {
      const data = JSON.parse(fs.readFileSync(typeDedupeFile, 'utf8'));
      typeDedupeMap = data.mappings || {};
      console.log('✅ Loaded document type mappings from dedupe_types.json');
    } catch (e) {
      console.warn('⚠️  Could not load dedupe_types.json:', e.message);
    }
  } else {
    console.log('ℹ️  No dedupe_types.json found - document types will not be deduplicated');
  }

  // Helper function to apply deduplication mapping
  function applyDedupe(entityType, entityName) {
    if (!entityName) return entityName;
    return dedupeMappings[entityType]?.[entityName] || entityName;
  }

  // Helper function to normalize document types (for grouping)
  function normalizeDocType(docType) {
    if (!docType) return null;
    const trimmed = String(docType).trim();

    // Apply deduplication mapping if available
    const canonical = typeDedupeMap[trimmed] || trimmed;

    return canonical.toLowerCase().trim();
  }

  // Helper function to format document types for display (title case)
  function formatDocType(docType) {
    if (!docType) return 'Unknown';
    const trimmed = String(docType).trim();

    // Apply deduplication mapping if available
    const canonical = typeDedupeMap[trimmed] || trimmed;

    // Return the canonical name (already in proper case from dedupe script)
    return canonical;
  }

  // Helper function to normalize dates to consistent format
  function normalizeDate(dateStr) {
    if (!dateStr) return null;

    const str = String(dateStr).trim();

    // Already in ISO format (YYYY-MM-DD)
    if (/^\d{4}-\d{2}-\d{2}$/.test(str)) {
      return str;
    }

    // Just a year (YYYY)
    if (/^\d{4}$/.test(str)) {
      return `${str}-00-00`;
    }

    // Try to parse various date formats
    const months = {
      'jan': '01', 'january': '01',
      'feb': '02', 'february': '02',
      'mar': '03', 'march': '03',
      'apr': '04', 'april': '04',
      'may': '05',
      'jun': '06', 'june': '06',
      'jul': '07', 'july': '07',
      'aug': '08', 'august': '08',
      'sep': '09', 'september': '09',
      'oct': '10', 'october': '10',
      'nov': '11', 'november': '11',
      'dec': '12', 'december': '12'
    };

    // "February 15, 2005" or "Feb 15, 2005"
    const match1 = str.match(/^(\w+)\s+(\d{1,2}),?\s+(\d{4})$/i);
    if (match1) {
      const month = months[match1[1].toLowerCase()];
      if (month) {
        const day = match1[2].padStart(2, '0');
        return `${match1[3]}-${month}-${day}`;
      }
    }

    // "15 February 2005" or "15 Feb 2005"
    const match2 = str.match(/^(\d{1,2})\s+(\w+)\s+(\d{4})$/i);
    if (match2) {
      const month = months[match2[2].toLowerCase()];
      if (month) {
        const day = match2[1].padStart(2, '0');
        return `${match2[3]}-${month}-${day}`;
      }
    }

    // "2005/02/15" or "2005.02.15"
    const match3 = str.match(/^(\d{4})[\/\.](\d{1,2})[\/\.](\d{1,2})$/);
    if (match3) {
      const month = match3[2].padStart(2, '0');
      const day = match3[3].padStart(2, '0');
      return `${match3[1]}-${month}-${day}`;
    }

    // "02/15/2005" or "02.15.2005" (US format)
    const match4 = str.match(/^(\d{1,2})[\/\.](\d{1,2})[\/\.](\d{4})$/);
    if (match4) {
      const month = match4[1].padStart(2, '0');
      const day = match4[2].padStart(2, '0');
      return `${match4[3]}-${month}-${day}`;
    }

    // Couldn't parse - return original
    return str;
  }

  // Helper function to format dates for display
  function formatDate(normalizedDate) {
    if (!normalizedDate) return 'Unknown Date';

    // Year only (YYYY-00-00)
    if (normalizedDate.endsWith('-00-00')) {
      return normalizedDate.substring(0, 4);
    }

    // Full date (YYYY-MM-DD)
    const match = normalizedDate.match(/^(\d{4})-(\d{2})-(\d{2})$/);
    if (match) {
      const months = ['', 'January', 'February', 'March', 'April', 'May', 'June',
                      'July', 'August', 'September', 'October', 'November', 'December'];
      const year = match[1];
      const month = parseInt(match[2]);
      const day = parseInt(match[3]);

      if (month > 0 && month <= 12) {
        return `${months[month]} ${day}, ${year}`;
      }
    }

    // Fallback
    return normalizedDate;
  }

  // Cache the documents data - only compute once
  let cachedDocuments = null;

  function getDocuments() {
    if (cachedDocuments) {
      return cachedDocuments;
    }
    const resultsDir = path.join(__dirname, './results');
    const pages = [];

    function readDocuments(dir, relativePath = '') {
      const entries = fs.readdirSync(dir, { withFileTypes: true });

      for (const entry of entries) {
        const fullPath = path.join(dir, entry.name);
        const relPath = path.join(relativePath, entry.name);

        if (entry.isDirectory()) {
          readDocuments(fullPath, relPath);
        } else if (entry.name.endsWith('.json')) {
          try {
            const content = JSON.parse(fs.readFileSync(fullPath, 'utf8'));
            pages.push({
              path: relPath,
              filename: entry.name.replace('.json', ''),
              folder: relativePath || 'root',
              ...content
            });
          } catch (e) {
            console.error(`Error reading ${fullPath}:`, e.message);
          }
        }
      }
    }

    readDocuments(resultsDir);

    // Normalize function to handle LLM inconsistencies in document numbers
    const normalizeDocNum = (docNum) => {
      if (!docNum) return null;
      // Convert to lowercase, remove all non-alphanumeric except hyphens, collapse multiple hyphens
      return String(docNum)
        .toLowerCase()
        .replace(/[^a-z0-9-]/g, '-')
        .replace(/-+/g, '-')
        .replace(/^-+|-+$/g, '');
    };

    // Group pages by NORMALIZED document_number to handle LLM variations
    const documentMap = new Map();

    pages.forEach(page => {
      // Use document_number from metadata to group pages of the same document
      const rawDocNum = page.document_metadata?.document_number;

      // Skip pages without a document number
      if (!rawDocNum) {
        console.warn(`Page ${page.filename} has no document_number, using filename as fallback`);
        const fallbackKey = normalizeDocNum(page.filename) || page.filename;
        if (!documentMap.has(fallbackKey)) {
          documentMap.set(fallbackKey, []);
        }
        documentMap.get(fallbackKey).push(page);
        return;
      }

      // Normalize the document number to group variants together
      const normalizedDocNum = normalizeDocNum(rawDocNum);

      if (!documentMap.has(normalizedDocNum)) {
        documentMap.set(normalizedDocNum, []);
      }
      documentMap.get(normalizedDocNum).push(page);
    });

    // Convert to array and sort pages within each document
    const documents = Array.from(documentMap.entries()).map(([normalizedDocNum, docPages]) => {

      // Sort pages by page number
      docPages.sort((a, b) => {
        const pageA = parseInt(a.document_metadata?.page_number) || 0;
        const pageB = parseInt(b.document_metadata?.page_number) || 0;
        return pageA - pageB;
      });

      // Combine all entities from all pages
      const allEntities = {
        people: new Set(),
        organizations: new Set(),
        locations: new Set(),
        dates: new Set(),
        reference_numbers: new Set()
      };

      docPages.forEach(page => {
        if (page.entities) {
          Object.keys(allEntities).forEach(key => {
            if (page.entities[key]) {
              page.entities[key].forEach(item => allEntities[key].add(item));
            }
          });
        }
      });

      // Get metadata from first page
      const firstPage = docPages[0];

      // Get all unique folders that contain pages of this document
      const folders = [...new Set(docPages.map(p => p.folder))];

      // Get all unique raw document numbers (for display)
      const rawDocNums = [...new Set(docPages.map(p => p.document_metadata?.document_number).filter(Boolean))];

      // Apply deduplication to document entities
      const deduplicatedEntities = {
        people: [...new Set(Array.from(allEntities.people).map(p => applyDedupe('people', p)))],
        organizations: [...new Set(Array.from(allEntities.organizations).map(o => applyDedupe('organizations', o)))],
        locations: [...new Set(Array.from(allEntities.locations).map(l => applyDedupe('locations', l)))],
        dates: [...new Set(Array.from(allEntities.dates).map(d => {
          const normalized = normalizeDate(d);
          return normalized ? formatDate(normalized) : d;
        }))],
        reference_numbers: Array.from(allEntities.reference_numbers)
      };

      // Normalize document metadata
      const normalizedMetadata = {
        ...firstPage.document_metadata,
        document_type: firstPage.document_metadata?.document_type
          ? formatDocType(firstPage.document_metadata.document_type)
          : null,
        date: firstPage.document_metadata?.date
          ? formatDate(normalizeDate(firstPage.document_metadata.date))
          : firstPage.document_metadata?.date
      };

      // Create lightweight pages array (keep full_text but make them lazy)
      const lightPages = docPages.map(p => {
        const lightPage = { ...p };
        // Keep full_text reference for document rendering, but it won't be duplicated
        return lightPage;
      });

      // Only include full_text when needed (for individual document pages)
      // For the main documents array, we skip it to save memory
      const docData = {
        unique_id: normalizedDocNum,  // Normalized version for unique URLs
        document_number: rawDocNums.length === 1 ? rawDocNums[0] : normalizedDocNum, // Show original if consistent, else normalized
        raw_document_numbers: rawDocNums, // All variations found
        pages: lightPages,
        page_count: docPages.length,
        document_metadata: normalizedMetadata,
        entities: deduplicatedEntities,
        folder: folders.join(', '),  // Show all folders if document spans multiple
        folders: folders  // Keep array for reference
      };

      // Add full_text getter that loads on demand
      Object.defineProperty(docData, 'full_text', {
        get: function() {
          if (!this._full_text) {
            this._full_text = this.pages.map(p => p.full_text).join('\n\n--- PAGE BREAK ---\n\n');
          }
          return this._full_text;
        },
        enumerable: true
      });

      return docData;
    });

    cachedDocuments = documents;
    return documents;
  }

  // Load document analyses if available
  eleventyConfig.addGlobalData("analyses", () => {
    const analysesFile = path.join(__dirname, 'analyses.json');
    if (fs.existsSync(analysesFile)) {
      try {
        const data = JSON.parse(fs.readFileSync(analysesFile, 'utf8'));
        const analyses = data.analyses || [];

        // Apply document type deduplication to analyses
        if (Object.keys(typeDedupeMap).length > 0) {
          analyses.forEach(analysis => {
            if (analysis.analysis?.document_type) {
              const original = analysis.analysis.document_type;
              const canonical = typeDedupeMap[original] || original;
              analysis.analysis.document_type = canonical;
            }
          });
        }

        console.log(`✅ Loaded ${analyses.length} document analyses`);
        return analyses;
      } catch (e) {
        console.warn('⚠️  Could not load analyses.json:', e.message);
        return [];
      }
    }
    console.log('ℹ️  No analyses.json found - run analyze_documents.py to generate');
    return [];
  });

  // Get unique canonical document types from analyses
  eleventyConfig.addGlobalData("analysisDocumentTypes", () => {
    const analysesFile = path.join(__dirname, 'analyses.json');
    if (!fs.existsSync(analysesFile)) {
      return [];
    }

    try {
      const data = JSON.parse(fs.readFileSync(analysesFile, 'utf8'));
      const analyses = data.analyses || [];

      // Collect unique canonical types
      const typesSet = new Set();
      analyses.forEach(analysis => {
        if (analysis.analysis?.document_type) {
          let docType = analysis.analysis.document_type;

          // Apply deduplication if available
          if (Object.keys(typeDedupeMap).length > 0) {
            docType = typeDedupeMap[docType] || docType;
          }

          typesSet.add(docType);
        }
      });

      const uniqueTypes = Array.from(typesSet).sort();
      console.log(`✅ Found ${uniqueTypes.length} unique canonical document types for filters`);
      return uniqueTypes;
    } catch (e) {
      console.warn('⚠️  Could not load document types:', e.message);
      return [];
    }
  });

  // Add global data - load all pages and group into documents
  eleventyConfig.addGlobalData("documents", getDocuments);

  // Build indices from grouped documents
  eleventyConfig.addGlobalData("indices", () => {
    const documentsData = getDocuments();

    const people = new Map();
    const organizations = new Map();
    const locations = new Map();
    const dates = new Map();
    const documentTypes = new Map();

    documentsData.forEach(doc => {
      // People (with deduplication)
      if (doc.entities?.people) {
        doc.entities.people.forEach(person => {
          const canonicalName = applyDedupe('people', person);
          if (!people.has(canonicalName)) people.set(canonicalName, []);
          people.get(canonicalName).push(doc);
        });
      }

      // Organizations (with deduplication)
      if (doc.entities?.organizations) {
        doc.entities.organizations.forEach(org => {
          const canonicalName = applyDedupe('organizations', org);
          if (!organizations.has(canonicalName)) organizations.set(canonicalName, []);
          organizations.get(canonicalName).push(doc);
        });
      }

      // Locations (with deduplication)
      if (doc.entities?.locations) {
        doc.entities.locations.forEach(loc => {
          const canonicalName = applyDedupe('locations', loc);
          if (!locations.has(canonicalName)) locations.set(canonicalName, []);
          locations.get(canonicalName).push(doc);
        });
      }

      // Dates (normalize for grouping)
      if (doc.entities?.dates) {
        doc.entities.dates.forEach(date => {
          const normalized = normalizeDate(date);
          if (normalized) {
            if (!dates.has(normalized)) dates.set(normalized, []);
            dates.get(normalized).push(doc);
          }
        });
      }

      // Document types (normalize for grouping)
      const docType = doc.document_metadata?.document_type;
      if (docType) {
        const normalized = normalizeDocType(docType);
        if (normalized) {
          if (!documentTypes.has(normalized)) documentTypes.set(normalized, []);
          documentTypes.get(normalized).push(doc);
        }
      }
    });

    // Deduplicate document arrays (remove duplicate document references)
    const dedupeDocArray = (docs) => {
      const seen = new Set();
      return docs.filter(doc => {
        if (seen.has(doc.unique_id)) return false;
        seen.add(doc.unique_id);
        return true;
      });
    };

    return {
      people: Array.from(people.entries()).map(([name, docs]) => ({
        name,
        docs: dedupeDocArray(docs),
        count: dedupeDocArray(docs).length
      })).sort((a, b) => b.count - a.count),
      organizations: Array.from(organizations.entries()).map(([name, docs]) => ({
        name,
        docs: dedupeDocArray(docs),
        count: dedupeDocArray(docs).length
      })).sort((a, b) => b.count - a.count),
      locations: Array.from(locations.entries()).map(([name, docs]) => ({
        name,
        docs: dedupeDocArray(docs),
        count: dedupeDocArray(docs).length
      })).sort((a, b) => b.count - a.count),
      dates: Array.from(dates.entries()).map(([normalizedDate, docs]) => ({
        name: formatDate(normalizedDate),  // Display formatted version
        normalizedDate,  // Keep normalized for sorting
        docs: dedupeDocArray(docs),
        count: dedupeDocArray(docs).length
      })).sort((a, b) => {
        // Sort by normalized date (YYYY-MM-DD format sorts correctly)
        return b.normalizedDate.localeCompare(a.normalizedDate);
      }),
      documentTypes: Array.from(documentTypes.entries()).map(([normalizedType, docs]) => ({
        name: formatDocType(normalizedType),  // Display formatted version
        docs: dedupeDocArray(docs),
        count: dedupeDocArray(docs).length
      })).sort((a, b) => b.count - a.count)
    };
  });

  return {
    dir: {
      input: "src",
      output: "_site",
      includes: "_includes"
    },
    pathPrefix: "/"
  };
};