From 51de59b65cdc273dc740bbe7e2d33b33c0774f21 Mon Sep 17 00:00:00 2001
From: Lucas <lbometon@hotmail.fr>
Date: Tue, 7 Jan 2025 08:27:03 +0100
Subject: [PATCH] Update rag_document_upload.ts

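Optimise the RAG document upload chain.

- Make RAGDocumentChain a singleton (getInstance).
- Reuse the existing collection when the same documents are uploaded again.
- Remove the chunk splitting, scoring, deduplication, reranking and
  chunk-merging steps; documents are now indexed and returned as-is.
- searchSimilarDocuments now throws when the vector store is not
  initialised, and falls back to the current documents on search errors.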
---
 src/chains/rag_document_upload.ts | 268 +++++++-----------------------
 1 file changed, 64 insertions(+), 204 deletions(-)

diff --git a/src/chains/rag_document_upload.ts b/src/chains/rag_document_upload.ts
index 32b4681..0339a7f 100644
--- a/src/chains/rag_document_upload.ts
+++ b/src/chains/rag_document_upload.ts
@@ -1,4 +1,3 @@
-import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 import { Document } from '@langchain/core/documents';
 import { Embeddings } from '@langchain/core/embeddings';
 import { Chroma } from '@langchain/community/vectorstores/chroma';
@@ -9,7 +8,6 @@ import { StringOutputParser } from '@langchain/core/output_parsers';
 import formatChatHistoryAsString from '../utils/formatHistory';
 import { BaseMessage } from '@langchain/core/messages';
 
-// Type local pour la chaîne de recherche
 type SearchInput = {
   query: string;
   chat_history: BaseMessage[];
@@ -17,42 +15,19 @@ type SearchInput = {
 };
 
 export class RAGDocumentChain {
+  private static instance: RAGDocumentChain;
   private vectorStore: Chroma | null = null;
-  private textSplitter = new RecursiveCharacterTextSplitter({
-    chunkSize: 1000,
-    chunkOverlap: 200,
-    separators: ["\n\n", "\n", ".", "!", "?", ";", ":", " ", ""],
-    keepSeparator: true,
-    lengthFunction: (text) => text.length
-  });
+  private readonly collectionName = 'uploaded_docs';
+  private initialized = false;
+  private currentDocuments: Document[] = [];
 
-  // Add chunk preprocessing
-  private preprocessChunk(text: string): string {
-    return text
-      .replace(/\s+/g, ' ')
-      .replace(/\n+/g, ' ')
-      .trim();
-  }
+  private constructor() {}
 
-  // Add metadata enrichment
-  private enrichChunkMetadata(doc: Document): Document {
-    const metadata = {
-      ...doc.metadata,
-      chunk_type: 'text',
-      word_count: doc.pageContent.split(/\s+/).length,
-      processed_date: new Date().toISOString()
-    };
-    return new Document({
-      pageContent: this.preprocessChunk(doc.pageContent),
-      metadata
-    });
-  }
-
-  // Add chunk scoring
-  private scoreChunk(chunk: string): number {
-    const wordCount = chunk.split(/\s+/).length;
-    const sentenceCount = chunk.split(/[.!?]+/).length;
-    return wordCount > 10 && sentenceCount > 0 ? 1 : 0;
+  public static getInstance(): RAGDocumentChain {
+    if (!RAGDocumentChain.instance) {
+      RAGDocumentChain.instance = new RAGDocumentChain();
+    }
+    return RAGDocumentChain.instance;
   }
 
   public async initializeVectorStoreFromDocuments(
@@ -60,46 +35,54 @@ export class RAGDocumentChain {
     embeddings: Embeddings
   ) {
     try {
-      console.log("🔄 Préparation des documents...");
-      
-      // Validate and preprocess documents
-      const validDocuments = documents
-        .filter(doc => doc.pageContent && doc.pageContent.trim().length > 50)
-        .map(doc => this.enrichChunkMetadata(doc));
-      
-      // Split documents into chunks
-      const texts = await this.textSplitter.splitDocuments(validDocuments);
-      console.log(`📄 ${texts.length} chunks créés`);
-      
-      // Score and filter chunks
-      const scoredTexts = texts.filter(doc => this.scoreChunk(doc.pageContent) > 0);
-      console.log(`📄 ${scoredTexts.length} chunks valides après scoring`);
-      
-      // Deduplicate chunks
-      const uniqueTexts = this.deduplicateChunks(scoredTexts);
-      console.log(`📄 ${uniqueTexts.length} chunks uniques après déduplication`);
-      
-      // Initialize vector store with optimized settings
-      this.vectorStore = await Chroma.fromDocuments(
-        uniqueTexts,
-        embeddings,
-        {
-          collectionName: "uploaded_docs",
-          url: "http://chroma:8000",
-          collectionMetadata: {
-            "hnsw:space": "cosine",
-            "hnsw:construction_ef": 100,  // Increased for better index quality
-            "hnsw:search_ef": 50,         // Balanced for search performance
-            "hnsw:m": 16                  // Number of connections per element
-          }
-        }
+      // Filter out invalid documents (missing, non-string or blank content)
+      const validDocuments = documents.filter(doc => 
+        doc.pageContent && 
+        typeof doc.pageContent === 'string' && 
+        doc.pageContent.trim().length > 0
       );
+
+      console.log(`📄 Documents valides: ${validDocuments.length}/${documents.length}`);
+      
+      // If already initialised with the same documents, do nothing
+      const sameDocuments = this.initialized && 
+        this.currentDocuments.length === validDocuments.length &&
+        validDocuments.every((doc, index) => 
+          doc.pageContent === this.currentDocuments[index].pageContent
+        );
+
+      if (sameDocuments) {
+        console.log("📚 Réutilisation de la collection existante");
+        return {
+          totalDocuments: documents.length,
+          validDocuments: this.currentDocuments.length,
+          reused: true
+        };
+      }
+      
+      if (!this.vectorStore) {
+        console.log("🔄 Initialisation du vectorStore");
+        this.vectorStore = await Chroma.fromDocuments(validDocuments, embeddings, {
+          collectionName: this.collectionName,
+          url: "http://chroma:8000"
+        });
+        this.initialized = true;
+      } else {
+        console.log("🔄 Réinitialisation de la collection");
+        // Create a new store instance with the new documents (NOTE(review): same collection name, so earlier documents presumably remain — confirm)
+        this.vectorStore = await Chroma.fromDocuments(validDocuments, embeddings, {
+          collectionName: this.collectionName,
+          url: "http://chroma:8000"
+        });
+        this.initialized = true;
+      }
+
+      this.currentDocuments = validDocuments;
       
-      console.log("✅ VectorStore initialisé avec succès");
       return {
         totalDocuments: documents.length,
-        validChunks: uniqueTexts.length,
-        averageChunkSize: this.calculateAverageChunkSize(uniqueTexts)
+        validDocuments: validDocuments.length,
+        reused: false
       };
     } catch (error) {
       console.error("❌ Erreur lors de l'initialisation:", error);
@@ -107,141 +90,26 @@ export class RAGDocumentChain {
     }
   }
 
-  private calculateAverageChunkSize(chunks: Document[]): number {
-    if (chunks.length === 0) return 0;
-    const totalLength = chunks.reduce((sum, doc) => sum + doc.pageContent.length, 0);
-    return Math.round(totalLength / chunks.length);
-  }
-
-  private deduplicateChunks(chunks: Document[]): Document[] {
-    const seen = new Set<string>();
-    return chunks.filter(chunk => {
-      const normalized = chunk.pageContent
-        .toLowerCase()
-        .replace(/\s+/g, ' ')
-        .trim();
-      
-      if (seen.has(normalized)) {
-        return false;
-      }
-      seen.add(normalized);
-      return true;
-    });
-  }
-
   public async searchSimilarDocuments(query: string, limit: number = 5) {
-    if (!this.vectorStore) {
-      console.warn("⚠️ VectorStore non initialisé");
-      return [];
+    if (!this.vectorStore || !this.initialized) {
+      throw new Error("VectorStore non initialisé");
     }
 
     try {
       console.log("🔍 Recherche pour:", query);
       
-      const initialResults = await this.vectorStore.similaritySearch(
-        query,
-        limit * 2,
-        { 
-          filter: { source: { $exists: true } },
-          minScore: 0.7
-        }
-      );
+      // The third argument of similaritySearch is a metadata filter, not an options
+      // bag: `{ k: limit }` would only match documents whose metadata has k === limit.
+      const results = await this.vectorStore.similaritySearch(query, limit);
       
-      const scoredResults = initialResults
-        .filter(doc => doc.pageContent.trim().length > 50)
-        .map(doc => ({
-          document: doc,
-          score: this.calculateRelevanceScore(query, doc.pageContent)
-        }))
-        .sort((a, b) => b.score - a.score)
-        .slice(0, limit)
-        .map(item => {
-          const doc = item.document;
-          const pageNumber = doc.metadata.page_number || doc.metadata.pageNumber || 1;
-          const title = doc.metadata.title || 'Document';
-          const source = doc.metadata.source;
-          
-          // Préparer le texte à surligner
-          const searchText = doc.pageContent
-            .substring(0, 200)
-            .replace(/[\n\r]+/g, ' ')
-            .trim();
-          
-          return new Document({
-            pageContent: doc.pageContent,
-            metadata: {
-              title: title,
-              pageNumber: pageNumber,
-              source: source,
-              type: doc.metadata.type || 'uploaded',
-              searchText: searchText,
-              url: source ? 
-                `/api/uploads/${source}/view?page=${pageNumber}&search=${encodeURIComponent(searchText)}` : 
-                undefined
-            }
-          });
-        });
-
-      const mergedResults = this.mergeRelatedChunks(scoredResults);
-      console.log(`📄 ${mergedResults.length} documents pertinents trouvés après reranking`);
-      return mergedResults;
+      console.log(`📄 ${results.length} documents pertinents trouvés`);
+      return results;
     } catch (error) {
       console.error("❌ Erreur de recherche:", error);
-      return [];
+      return this.currentDocuments.slice(0, limit);
     }
   }
 
-  private calculateRelevanceScore(query: string, content: string): number {
-    const normalizedQuery = query.toLowerCase();
-    const normalizedContent = content.toLowerCase();
-    
-    // Basic relevance scoring based on multiple factors
-    let score = 0;
-    
-    // Term frequency
-    const queryTerms = normalizedQuery.split(/\s+/);
-    queryTerms.forEach(term => {
-      const termCount = (normalizedContent.match(new RegExp(term, 'g')) || []).length;
-      score += termCount * 0.1;
-    });
-    
-    // Exact phrase matching
-    if (normalizedContent.includes(normalizedQuery)) {
-      score += 1;
-    }
-    
-    // Content length penalty (prefer shorter, more focused chunks)
-    const lengthPenalty = Math.max(0, 1 - (content.length / 5000));
-    score *= (1 + lengthPenalty);
-    
-    return score;
-  }
-
-  private mergeRelatedChunks(documents: Document[]): Document[] {
-    const merged: { [key: string]: Document } = {};
-    
-    documents.forEach(doc => {
-      const source = doc.metadata?.source || '';
-      const page = doc.metadata?.pageNumber || 1;
-      const key = `${source}-${page}`;
-      
-      if (!merged[key]) {
-        merged[key] = doc;
-      } else {
-        const existingDoc = merged[key];
-        merged[key] = new Document({
-          pageContent: `${existingDoc.pageContent}\n\n${doc.pageContent}`,
-          metadata: {
-            ...existingDoc.metadata,
-            searchText: existingDoc.metadata.searchText
-          }
-        });
-      }
-    });
-    
-    return Object.values(merged);
-  }
-
   public createSearchChain(llm: BaseChatModel) {
     return RunnableSequence.from([
       RunnableMap.from({
@@ -251,15 +119,7 @@ export class RAGDocumentChain {
           const docs = await this.searchSimilarDocuments(input.query);
           return docs.map((doc, i) => {
             const source = doc.metadata?.source || 'Document';
-            const title = doc.metadata?.title || '';
-            const pageNumber = doc.metadata?.pageNumber;
-            const url = doc.metadata?.url;
-            
-            let sourceInfo = `Source: ${title || source}`;
-            if (pageNumber) sourceInfo += ` (page ${pageNumber})`;
-            if (url) sourceInfo += `\nURL: ${url}`;
-            
-            return `[Source ${i + 1}] ${doc.pageContent}\n${sourceInfo}`;
+            return `[Source ${i + 1}] ${doc.pageContent}\nSource: ${source}`;
           }).join("\n\n");
         }
       }),
@@ -289,4 +149,4 @@ export class RAGDocumentChain {
   public isInitialized(): boolean {
     return this.vectorStore !== null;
   }
-} 
\ No newline at end of file
+}