Update rag_document_upload.ts
RAG optimise
parent 453a572664
commit 51de59b65c
1 changed file with 64 additions and 204 deletions
@@ -1,4 +1,3 @@
-import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 import { Document } from '@langchain/core/documents';
 import { Embeddings } from '@langchain/core/embeddings';
 import { Chroma } from '@langchain/community/vectorstores/chroma';
@@ -9,7 +8,6 @@ import { StringOutputParser } from '@langchain/core/output_parsers';
 import formatChatHistoryAsString from '../utils/formatHistory';
 import { BaseMessage } from '@langchain/core/messages';
 
-// Type local pour la chaîne de recherche
 type SearchInput = {
   query: string;
   chat_history: BaseMessage[];
@@ -17,42 +15,19 @@ type SearchInput = {
 };
 
 export class RAGDocumentChain {
+  private static instance: RAGDocumentChain;
   private vectorStore: Chroma | null = null;
-  private textSplitter = new RecursiveCharacterTextSplitter({
-    chunkSize: 1000,
-    chunkOverlap: 200,
-    separators: ["\n\n", "\n", ".", "!", "?", ";", ":", " ", ""],
-    keepSeparator: true,
-    lengthFunction: (text) => text.length
-  });
-
-  // Add chunk preprocessing
-  private preprocessChunk(text: string): string {
-    return text
-      .replace(/\s+/g, ' ')
-      .replace(/\n+/g, ' ')
-      .trim();
-  }
-
-  // Add metadata enrichment
-  private enrichChunkMetadata(doc: Document): Document {
-    const metadata = {
-      ...doc.metadata,
-      chunk_type: 'text',
-      word_count: doc.pageContent.split(/\s+/).length,
-      processed_date: new Date().toISOString()
-    };
-    return new Document({
-      pageContent: this.preprocessChunk(doc.pageContent),
-      metadata
-    });
-  }
-
-  // Add chunk scoring
-  private scoreChunk(chunk: string): number {
-    const wordCount = chunk.split(/\s+/).length;
-    const sentenceCount = chunk.split(/[.!?]+/).length;
-    return wordCount > 10 && sentenceCount > 0 ? 1 : 0;
-  }
+  private readonly collectionName = 'uploaded_docs';
+  private initialized = false;
+  private currentDocuments: Document[] = [];
+
+  private constructor() {}
+
+  public static getInstance(): RAGDocumentChain {
+    if (!RAGDocumentChain.instance) {
+      RAGDocumentChain.instance = new RAGDocumentChain();
+    }
+    return RAGDocumentChain.instance;
+  }
 
   public async initializeVectorStoreFromDocuments(
@@ -60,46 +35,54 @@
     embeddings: Embeddings
   ) {
     try {
-      console.log("🔄 Préparation des documents...");
-      // Validate and preprocess documents
-      const validDocuments = documents
-        .filter(doc => doc.pageContent && doc.pageContent.trim().length > 50)
-        .map(doc => this.enrichChunkMetadata(doc));
-
-      // Split documents into chunks
-      const texts = await this.textSplitter.splitDocuments(validDocuments);
-      console.log(`📄 ${texts.length} chunks créés`);
-
-      // Score and filter chunks
-      const scoredTexts = texts.filter(doc => this.scoreChunk(doc.pageContent) > 0);
-      console.log(`📄 ${scoredTexts.length} chunks valides après scoring`);
-
-      // Deduplicate chunks
-      const uniqueTexts = this.deduplicateChunks(scoredTexts);
-      console.log(`📄 ${uniqueTexts.length} chunks uniques après déduplication`);
-
-      // Initialize vector store with optimized settings
-      this.vectorStore = await Chroma.fromDocuments(
-        uniqueTexts,
-        embeddings,
-        {
-          collectionName: "uploaded_docs",
-          url: "http://chroma:8000",
-          collectionMetadata: {
-            "hnsw:space": "cosine",
-            "hnsw:construction_ef": 100, // Increased for better index quality
-            "hnsw:search_ef": 50, // Balanced for search performance
-            "hnsw:m": 16 // Number of connections per element
-          }
-        }
+      // Filtrer les documents invalides
+      const validDocuments = documents.filter(doc =>
+        doc.pageContent &&
+        typeof doc.pageContent === 'string' &&
+        doc.pageContent.trim().length > 0
       );
 
-      console.log("✅ VectorStore initialisé avec succès");
+      console.log(`📄 Documents valides: ${validDocuments.length}/${documents.length}`);
 
-      return {
-        totalDocuments: documents.length,
-        validChunks: uniqueTexts.length,
-        averageChunkSize: this.calculateAverageChunkSize(uniqueTexts)
+      // Si déjà initialisé avec les mêmes documents, ne rien faire
+      const sameDocuments = this.initialized &&
+        this.currentDocuments.length === validDocuments.length &&
+        validDocuments.every((doc, index) =>
+          doc.pageContent === this.currentDocuments[index].pageContent
+        );
+
+      if (sameDocuments) {
+        console.log("📚 Réutilisation de la collection existante");
+        return {
+          totalDocuments: documents.length,
+          validDocuments: this.currentDocuments.length,
+          reused: true
+        };
+      }
+
+      if (!this.vectorStore) {
+        console.log("🔄 Initialisation du vectorStore");
+        this.vectorStore = await Chroma.fromDocuments(validDocuments, embeddings, {
+          collectionName: this.collectionName,
+          url: "http://chroma:8000"
+        });
+        this.initialized = true;
+      } else {
+        console.log("🔄 Réinitialisation de la collection");
+        // Créer une nouvelle instance avec les nouveaux documents
+        this.vectorStore = await Chroma.fromDocuments(validDocuments, embeddings, {
+          collectionName: this.collectionName,
+          url: "http://chroma:8000"
+        });
+        this.initialized = true;
+      }
+
+      this.currentDocuments = validDocuments;
+
+      return {
+        totalDocuments: documents.length,
+        validDocuments: validDocuments.length,
+        reused: false
       };
     } catch (error) {
       console.error("❌ Erreur lors de l'initialisation:", error);
@@ -107,141 +90,26 @@ export class RAGDocumentChain {
     }
   }
 
-  private calculateAverageChunkSize(chunks: Document[]): number {
-    if (chunks.length === 0) return 0;
-    const totalLength = chunks.reduce((sum, doc) => sum + doc.pageContent.length, 0);
-    return Math.round(totalLength / chunks.length);
-  }
-
-  private deduplicateChunks(chunks: Document[]): Document[] {
-    const seen = new Set<string>();
-    return chunks.filter(chunk => {
-      const normalized = chunk.pageContent
-        .toLowerCase()
-        .replace(/\s+/g, ' ')
-        .trim();
-
-      if (seen.has(normalized)) {
-        return false;
-      }
-      seen.add(normalized);
-      return true;
-    });
-  }
-
   public async searchSimilarDocuments(query: string, limit: number = 5) {
-    if (!this.vectorStore) {
-      console.warn("⚠️ VectorStore non initialisé");
-      return [];
+    if (!this.vectorStore || !this.initialized) {
+      throw new Error("VectorStore non initialisé");
     }
 
     try {
       console.log("🔍 Recherche pour:", query);
 
-      const initialResults = await this.vectorStore.similaritySearch(
-        query,
-        limit * 2,
-        {
-          filter: { source: { $exists: true } },
-          minScore: 0.7
-        }
-      );
-
-      const scoredResults = initialResults
-        .filter(doc => doc.pageContent.trim().length > 50)
-        .map(doc => ({
-          document: doc,
-          score: this.calculateRelevanceScore(query, doc.pageContent)
-        }))
-        .sort((a, b) => b.score - a.score)
-        .slice(0, limit)
-        .map(item => {
-          const doc = item.document;
-          const pageNumber = doc.metadata.page_number || doc.metadata.pageNumber || 1;
-          const title = doc.metadata.title || 'Document';
-          const source = doc.metadata.source;
-
-          // Préparer le texte à surligner
-          const searchText = doc.pageContent
-            .substring(0, 200)
-            .replace(/[\n\r]+/g, ' ')
-            .trim();
-
-          return new Document({
-            pageContent: doc.pageContent,
-            metadata: {
-              title: title,
-              pageNumber: pageNumber,
-              source: source,
-              type: doc.metadata.type || 'uploaded',
-              searchText: searchText,
-              url: source ?
-                `/api/uploads/${source}/view?page=${pageNumber}&search=${encodeURIComponent(searchText)}` :
-                undefined
-            }
-          });
-        });
+      const results = await this.vectorStore.similaritySearch(query, limit, {
+        k: limit
+      });
 
-      const mergedResults = this.mergeRelatedChunks(scoredResults);
-      console.log(`📄 ${mergedResults.length} documents pertinents trouvés après reranking`);
-      return mergedResults;
+      console.log(`📄 ${results.length} documents pertinents trouvés`);
+      return results;
     } catch (error) {
       console.error("❌ Erreur de recherche:", error);
-      return [];
+      return this.currentDocuments.slice(0, limit);
     }
   }
 
-  private calculateRelevanceScore(query: string, content: string): number {
-    const normalizedQuery = query.toLowerCase();
-    const normalizedContent = content.toLowerCase();
-
-    // Basic relevance scoring based on multiple factors
-    let score = 0;
-
-    // Term frequency
-    const queryTerms = normalizedQuery.split(/\s+/);
-    queryTerms.forEach(term => {
-      const termCount = (normalizedContent.match(new RegExp(term, 'g')) || []).length;
-      score += termCount * 0.1;
-    });
-
-    // Exact phrase matching
-    if (normalizedContent.includes(normalizedQuery)) {
-      score += 1;
-    }
-
-    // Content length penalty (prefer shorter, more focused chunks)
-    const lengthPenalty = Math.max(0, 1 - (content.length / 5000));
-    score *= (1 + lengthPenalty);
-
-    return score;
-  }
-
-  private mergeRelatedChunks(documents: Document[]): Document[] {
-    const merged: { [key: string]: Document } = {};
-
-    documents.forEach(doc => {
-      const source = doc.metadata?.source || '';
-      const page = doc.metadata?.pageNumber || 1;
-      const key = `${source}-${page}`;
-
-      if (!merged[key]) {
-        merged[key] = doc;
-      } else {
-        const existingDoc = merged[key];
-        merged[key] = new Document({
-          pageContent: `${existingDoc.pageContent}\n\n${doc.pageContent}`,
-          metadata: {
-            ...existingDoc.metadata,
-            searchText: existingDoc.metadata.searchText
-          }
-        });
-      }
-    });
-
-    return Object.values(merged);
-  }
-
   public createSearchChain(llm: BaseChatModel) {
     return RunnableSequence.from([
       RunnableMap.from({
@@ -251,15 +119,7 @@ export class RAGDocumentChain {
           const docs = await this.searchSimilarDocuments(input.query);
           return docs.map((doc, i) => {
             const source = doc.metadata?.source || 'Document';
-            const title = doc.metadata?.title || '';
-            const pageNumber = doc.metadata?.pageNumber;
-            const url = doc.metadata?.url;
-
-            let sourceInfo = `Source: ${title || source}`;
-            if (pageNumber) sourceInfo += ` (page ${pageNumber})`;
-            if (url) sourceInfo += `\nURL: ${url}`;
-
-            return `[Source ${i + 1}] ${doc.pageContent}\n${sourceInfo}`;
+            return `[Source ${i + 1}] ${doc.pageContent}\nSource: ${source}`;
           }).join("\n\n");
         }
       }),
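Usage note: the sketch below shows how the refactored singleton from this commit would typically be driven from calling code. Only getInstance, initializeVectorStoreFromDocuments and searchSimilarDocuments are taken from the diff above; the relative import path and the OpenAIEmbeddings provider are assumptions for illustration, not part of the commit.

// Minimal usage sketch (assumed import path and embeddings provider).
import { OpenAIEmbeddings } from '@langchain/openai';
import { Document } from '@langchain/core/documents';
import { RAGDocumentChain } from './rag_document_upload'; // hypothetical path

async function indexAndSearch() {
  // The constructor is private; the instance comes from the singleton accessor.
  const ragChain = RAGDocumentChain.getInstance();

  const docs = [
    new Document({ pageContent: 'Example content…', metadata: { source: 'example.pdf' } }),
  ];

  // Embeds the valid documents into the 'uploaded_docs' Chroma collection,
  // or reports reuse when the same documents were already indexed.
  const stats = await ragChain.initializeVectorStoreFromDocuments(docs, new OpenAIEmbeddings());
  console.log(stats); // { totalDocuments, validDocuments, reused }

  // Plain similarity search; on error the chain falls back to the cached documents.
  const results = await ragChain.searchSimilarDocuments('example query', 5);
  console.log(results.map((doc) => doc.metadata?.source));
}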