Update rag_document_upload.ts
RAG optimise
parent 453a572664
commit 51de59b65c
1 changed file with 64 additions and 204 deletions
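The diff below reworks the RAGDocumentChain singleton in rag_document_upload.ts: document validation, Chroma collection setup, and similarity search are all touched. For orientation, here is a minimal usage sketch of the class as this file exposes it; the OpenAIEmbeddings provider, the import path, and the sample query are illustrative assumptions, not part of the commit.

import { Document } from '@langchain/core/documents';
import { OpenAIEmbeddings } from '@langchain/openai'; // assumed provider; any Embeddings implementation works
import { RAGDocumentChain } from './rag_document_upload'; // assumed path

async function indexAndSearch(docs: Document[]) {
  // The class is a singleton: always obtain it through getInstance().
  const chain = RAGDocumentChain.getInstance();

  // Build (or reuse) the Chroma collection from the uploaded documents.
  const stats = await chain.initializeVectorStoreFromDocuments(docs, new OpenAIEmbeddings());
  console.log(stats); // { totalDocuments, validDocuments, reused }

  // Retrieve the chunks most similar to a query (limit defaults to 5).
  return chain.searchSimilarDocuments('example query', 5);
}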
@@ -1,4 +1,3 @@
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { Document } from '@langchain/core/documents';
import { Embeddings } from '@langchain/core/embeddings';
import { Chroma } from '@langchain/community/vectorstores/chroma';
@@ -9,7 +8,6 @@ import { StringOutputParser } from '@langchain/core/output_parsers';
import formatChatHistoryAsString from '../utils/formatHistory';
import { BaseMessage } from '@langchain/core/messages';

// Local type for the search chain
type SearchInput = {
  query: string;
  chat_history: BaseMessage[];
@@ -17,42 +15,19 @@ type SearchInput = {
};
export class RAGDocumentChain {
  private static instance: RAGDocumentChain;
  private vectorStore: Chroma | null = null;
  private textSplitter = new RecursiveCharacterTextSplitter({
    chunkSize: 1000,
    chunkOverlap: 200,
    separators: ["\n\n", "\n", ".", "!", "?", ";", ":", " ", ""],
    keepSeparator: true,
    lengthFunction: (text) => text.length
  });
  private readonly collectionName = 'uploaded_docs';
  private initialized = false;
  private currentDocuments: Document[] = [];

  // Add chunk preprocessing
  private preprocessChunk(text: string): string {
    return text
      .replace(/\s+/g, ' ')
      .replace(/\n+/g, ' ')
      .trim();

  private constructor() {}

  public static getInstance(): RAGDocumentChain {
    if (!RAGDocumentChain.instance) {
      RAGDocumentChain.instance = new RAGDocumentChain();
    }

  // Add metadata enrichment
  private enrichChunkMetadata(doc: Document): Document {
    const metadata = {
      ...doc.metadata,
      chunk_type: 'text',
      word_count: doc.pageContent.split(/\s+/).length,
      processed_date: new Date().toISOString()
    };
    return new Document({
      pageContent: this.preprocessChunk(doc.pageContent),
      metadata
    });
  }

  // Add chunk scoring
  private scoreChunk(chunk: string): number {
    const wordCount = chunk.split(/\s+/).length;
    const sentenceCount = chunk.split(/[.!?]+/).length;
    return wordCount > 10 && sentenceCount > 0 ? 1 : 0;
    return RAGDocumentChain.instance;
  }

  public async initializeVectorStoreFromDocuments(
@@ -60,46 +35,54 @@ export class RAGDocumentChain {
    embeddings: Embeddings
  ) {
    try {
      console.log("🔄 Préparation des documents...");

      // Validate and preprocess documents
      const validDocuments = documents
        .filter(doc => doc.pageContent && doc.pageContent.trim().length > 50)
        .map(doc => this.enrichChunkMetadata(doc));

      // Split documents into chunks
      const texts = await this.textSplitter.splitDocuments(validDocuments);
      console.log(`📄 ${texts.length} chunks créés`);

      // Score and filter chunks
      const scoredTexts = texts.filter(doc => this.scoreChunk(doc.pageContent) > 0);
      console.log(`📄 ${scoredTexts.length} chunks valides après scoring`);

      // Deduplicate chunks
      const uniqueTexts = this.deduplicateChunks(scoredTexts);
      console.log(`📄 ${uniqueTexts.length} chunks uniques après déduplication`);

      // Initialize vector store with optimized settings
      this.vectorStore = await Chroma.fromDocuments(
        uniqueTexts,
        embeddings,
        {
          collectionName: "uploaded_docs",
          url: "http://chroma:8000",
          collectionMetadata: {
            "hnsw:space": "cosine",
            "hnsw:construction_ef": 100, // Increased for better index quality
            "hnsw:search_ef": 50, // Balanced for search performance
            "hnsw:m": 16 // Number of connections per element
          }
        }
      // Filter out invalid documents
      const validDocuments = documents.filter(doc =>
        doc.pageContent &&
        typeof doc.pageContent === 'string' &&
        doc.pageContent.trim().length > 0
      );

      console.log("✅ VectorStore initialisé avec succès");
      console.log(`📄 Documents valides: ${validDocuments.length}/${documents.length}`);

      // If already initialized with the same documents, do nothing
      const sameDocuments = this.initialized &&
        this.currentDocuments.length === validDocuments.length &&
        validDocuments.every((doc, index) =>
          doc.pageContent === this.currentDocuments[index].pageContent
        );

      if (sameDocuments) {
        console.log("📚 Réutilisation de la collection existante");
        return {
          totalDocuments: documents.length,
          validChunks: uniqueTexts.length,
          averageChunkSize: this.calculateAverageChunkSize(uniqueTexts)
          validDocuments: this.currentDocuments.length,
          reused: true
        };
      }

      if (!this.vectorStore) {
        console.log("🔄 Initialisation du vectorStore");
        this.vectorStore = await Chroma.fromDocuments(validDocuments, embeddings, {
          collectionName: this.collectionName,
          url: "http://chroma:8000"
        });
        this.initialized = true;
      } else {
        console.log("🔄 Réinitialisation de la collection");
        // Create a new instance with the new documents
        this.vectorStore = await Chroma.fromDocuments(validDocuments, embeddings, {
          collectionName: this.collectionName,
          url: "http://chroma:8000"
        });
        this.initialized = true;
      }

      this.currentDocuments = validDocuments;

      return {
        totalDocuments: documents.length,
        validDocuments: validDocuments.length,
        reused: false
      };
    } catch (error) {
      console.error("❌ Erreur lors de l'initialisation:", error);
@@ -107,141 +90,26 @@ export class RAGDocumentChain {
    }
  }

  private calculateAverageChunkSize(chunks: Document[]): number {
    if (chunks.length === 0) return 0;
    const totalLength = chunks.reduce((sum, doc) => sum + doc.pageContent.length, 0);
    return Math.round(totalLength / chunks.length);
  }

  private deduplicateChunks(chunks: Document[]): Document[] {
    const seen = new Set<string>();
    return chunks.filter(chunk => {
      const normalized = chunk.pageContent
        .toLowerCase()
        .replace(/\s+/g, ' ')
        .trim();

      if (seen.has(normalized)) {
        return false;
      }
      seen.add(normalized);
      return true;
    });
  }
  public async searchSimilarDocuments(query: string, limit: number = 5) {
    if (!this.vectorStore) {
      console.warn("⚠️ VectorStore non initialisé");
      return [];
    if (!this.vectorStore || !this.initialized) {
      throw new Error("VectorStore non initialisé");
    }

    try {
      console.log("🔍 Recherche pour:", query);

      const initialResults = await this.vectorStore.similaritySearch(
        query,
        limit * 2,
        {
          filter: { source: { $exists: true } },
          minScore: 0.7
        }
      );

      const scoredResults = initialResults
        .filter(doc => doc.pageContent.trim().length > 50)
        .map(doc => ({
          document: doc,
          score: this.calculateRelevanceScore(query, doc.pageContent)
        }))
        .sort((a, b) => b.score - a.score)
        .slice(0, limit)
        .map(item => {
          const doc = item.document;
          const pageNumber = doc.metadata.page_number || doc.metadata.pageNumber || 1;
          const title = doc.metadata.title || 'Document';
          const source = doc.metadata.source;

          // Prepare the text to highlight
          const searchText = doc.pageContent
            .substring(0, 200)
            .replace(/[\n\r]+/g, ' ')
            .trim();

          return new Document({
            pageContent: doc.pageContent,
            metadata: {
              title: title,
              pageNumber: pageNumber,
              source: source,
              type: doc.metadata.type || 'uploaded',
              searchText: searchText,
              url: source ?
                `/api/uploads/${source}/view?page=${pageNumber}&search=${encodeURIComponent(searchText)}` :
                undefined
            }
          });
      const results = await this.vectorStore.similaritySearch(query, limit, {
        k: limit
      });

      const mergedResults = this.mergeRelatedChunks(scoredResults);
      console.log(`📄 ${mergedResults.length} documents pertinents trouvés après reranking`);
      return mergedResults;
      console.log(`📄 ${results.length} documents pertinents trouvés`);
      return results;
    } catch (error) {
      console.error("❌ Erreur de recherche:", error);
      return [];
      return this.currentDocuments.slice(0, limit);
    }
  }
  private calculateRelevanceScore(query: string, content: string): number {
    const normalizedQuery = query.toLowerCase();
    const normalizedContent = content.toLowerCase();

    // Basic relevance scoring based on multiple factors
    let score = 0;

    // Term frequency
    const queryTerms = normalizedQuery.split(/\s+/);
    queryTerms.forEach(term => {
      const termCount = (normalizedContent.match(new RegExp(term, 'g')) || []).length;
      score += termCount * 0.1;
    });

    // Exact phrase matching
    if (normalizedContent.includes(normalizedQuery)) {
      score += 1;
    }

    // Content length penalty (prefer shorter, more focused chunks)
    const lengthPenalty = Math.max(0, 1 - (content.length / 5000));
    score *= (1 + lengthPenalty);

    return score;
  }

  private mergeRelatedChunks(documents: Document[]): Document[] {
    const merged: { [key: string]: Document } = {};

    documents.forEach(doc => {
      const source = doc.metadata?.source || '';
      const page = doc.metadata?.pageNumber || 1;
      const key = `${source}-${page}`;

      if (!merged[key]) {
        merged[key] = doc;
      } else {
        const existingDoc = merged[key];
        merged[key] = new Document({
          pageContent: `${existingDoc.pageContent}\n\n${doc.pageContent}`,
          metadata: {
            ...existingDoc.metadata,
            searchText: existingDoc.metadata.searchText
          }
        });
      }
    });

    return Object.values(merged);
  }
  public createSearchChain(llm: BaseChatModel) {
    return RunnableSequence.from([
      RunnableMap.from({
@@ -251,15 +119,7 @@ export class RAGDocumentChain {
        const docs = await this.searchSimilarDocuments(input.query);
        return docs.map((doc, i) => {
          const source = doc.metadata?.source || 'Document';
          const title = doc.metadata?.title || '';
          const pageNumber = doc.metadata?.pageNumber;
          const url = doc.metadata?.url;

          let sourceInfo = `Source: ${title || source}`;
          if (pageNumber) sourceInfo += ` (page ${pageNumber})`;
          if (url) sourceInfo += `\nURL: ${url}`;

          return `[Source ${i + 1}] ${doc.pageContent}\n${sourceInfo}`;
          return `[Source ${i + 1}] ${doc.pageContent}\nSource: ${source}`;
        }).join("\n\n");
      }
    }),
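createSearchChain is only partially visible in this last hunk; assuming the RunnableSequence ends with a prompt and the StringOutputParser imported at the top of the file, the chain it returns could be driven roughly as follows. The ChatOpenAI model and the question are placeholders, and only the query and chat_history fields of SearchInput are visible in the diff.

import { ChatOpenAI } from '@langchain/openai'; // assumed model; any BaseChatModel can be passed in
import { RAGDocumentChain } from './rag_document_upload'; // assumed path

async function answerFromUploads(question: string) {
  const llm = new ChatOpenAI({ model: 'gpt-4o-mini', temperature: 0 });

  // Build the retrieval-augmented chain over the already-initialized vector store.
  const chain = RAGDocumentChain.getInstance().createSearchChain(llm);

  // Input shape follows the SearchInput type declared at the top of the file.
  return chain.invoke({ query: question, chat_history: [] });
}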