From 51de59b65cdc273dc740bbe7e2d33b33c0774f21 Mon Sep 17 00:00:00 2001 From: Lucas Date: Tue, 7 Jan 2025 08:27:03 +0100 Subject: [PATCH] Update rag_document_upload.ts RAG optimise --- src/chains/rag_document_upload.ts | 268 +++++++----------------------- 1 file changed, 64 insertions(+), 204 deletions(-) diff --git a/src/chains/rag_document_upload.ts b/src/chains/rag_document_upload.ts index 32b4681..0339a7f 100644 --- a/src/chains/rag_document_upload.ts +++ b/src/chains/rag_document_upload.ts @@ -1,4 +1,3 @@ -import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import { Document } from '@langchain/core/documents'; import { Embeddings } from '@langchain/core/embeddings'; import { Chroma } from '@langchain/community/vectorstores/chroma'; @@ -9,7 +8,6 @@ import { StringOutputParser } from '@langchain/core/output_parsers'; import formatChatHistoryAsString from '../utils/formatHistory'; import { BaseMessage } from '@langchain/core/messages'; -// Type local pour la chaîne de recherche type SearchInput = { query: string; chat_history: BaseMessage[]; @@ -17,42 +15,19 @@ type SearchInput = { }; export class RAGDocumentChain { + private static instance: RAGDocumentChain; private vectorStore: Chroma | null = null; - private textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: 1000, - chunkOverlap: 200, - separators: ["\n\n", "\n", ".", "!", "?", ";", ":", " ", ""], - keepSeparator: true, - lengthFunction: (text) => text.length - }); + private readonly collectionName = 'uploaded_docs'; + private initialized = false; + private currentDocuments: Document[] = []; - // Add chunk preprocessing - private preprocessChunk(text: string): string { - return text - .replace(/\s+/g, ' ') - .replace(/\n+/g, ' ') - .trim(); - } + private constructor() {} - // Add metadata enrichment - private enrichChunkMetadata(doc: Document): Document { - const metadata = { - ...doc.metadata, - chunk_type: 'text', - word_count: doc.pageContent.split(/\s+/).length, - processed_date: new Date().toISOString() - }; - return new Document({ - pageContent: this.preprocessChunk(doc.pageContent), - metadata - }); - } - - // Add chunk scoring - private scoreChunk(chunk: string): number { - const wordCount = chunk.split(/\s+/).length; - const sentenceCount = chunk.split(/[.!?]+/).length; - return wordCount > 10 && sentenceCount > 0 ? 1 : 0; + public static getInstance(): RAGDocumentChain { + if (!RAGDocumentChain.instance) { + RAGDocumentChain.instance = new RAGDocumentChain(); + } + return RAGDocumentChain.instance; } public async initializeVectorStoreFromDocuments( @@ -60,46 +35,54 @@ export class RAGDocumentChain { embeddings: Embeddings ) { try { - console.log("🔄 Préparation des documents..."); - - // Validate and preprocess documents - const validDocuments = documents - .filter(doc => doc.pageContent && doc.pageContent.trim().length > 50) - .map(doc => this.enrichChunkMetadata(doc)); - - // Split documents into chunks - const texts = await this.textSplitter.splitDocuments(validDocuments); - console.log(`📄 ${texts.length} chunks créés`); - - // Score and filter chunks - const scoredTexts = texts.filter(doc => this.scoreChunk(doc.pageContent) > 0); - console.log(`📄 ${scoredTexts.length} chunks valides après scoring`); - - // Deduplicate chunks - const uniqueTexts = this.deduplicateChunks(scoredTexts); - console.log(`📄 ${uniqueTexts.length} chunks uniques après déduplication`); - - // Initialize vector store with optimized settings - this.vectorStore = await Chroma.fromDocuments( - uniqueTexts, - embeddings, - { - collectionName: "uploaded_docs", - url: "http://chroma:8000", - collectionMetadata: { - "hnsw:space": "cosine", - "hnsw:construction_ef": 100, // Increased for better index quality - "hnsw:search_ef": 50, // Balanced for search performance - "hnsw:m": 16 // Number of connections per element - } - } + // Filtrer les documents invalides + const validDocuments = documents.filter(doc => + doc.pageContent && + typeof doc.pageContent === 'string' && + doc.pageContent.trim().length > 0 ); + + console.log(`📄 Documents valides: ${validDocuments.length}/${documents.length}`); + + // Si déjà initialisé avec les mêmes documents, ne rien faire + const sameDocuments = this.initialized && + this.currentDocuments.length === validDocuments.length && + validDocuments.every((doc, index) => + doc.pageContent === this.currentDocuments[index].pageContent + ); + + if (sameDocuments) { + console.log("📚 Réutilisation de la collection existante"); + return { + totalDocuments: documents.length, + validDocuments: this.currentDocuments.length, + reused: true + }; + } + + if (!this.vectorStore) { + console.log("🔄 Initialisation du vectorStore"); + this.vectorStore = await Chroma.fromDocuments(validDocuments, embeddings, { + collectionName: this.collectionName, + url: "http://chroma:8000" + }); + this.initialized = true; + } else { + console.log("🔄 Réinitialisation de la collection"); + // Créer une nouvelle instance avec les nouveaux documents + this.vectorStore = await Chroma.fromDocuments(validDocuments, embeddings, { + collectionName: this.collectionName, + url: "http://chroma:8000" + }); + this.initialized = true; + } + + this.currentDocuments = validDocuments; - console.log("✅ VectorStore initialisé avec succès"); return { totalDocuments: documents.length, - validChunks: uniqueTexts.length, - averageChunkSize: this.calculateAverageChunkSize(uniqueTexts) + validDocuments: validDocuments.length, + reused: false }; } catch (error) { console.error("❌ Erreur lors de l'initialisation:", error); @@ -107,141 +90,26 @@ export class RAGDocumentChain { } } - private calculateAverageChunkSize(chunks: Document[]): number { - if (chunks.length === 0) return 0; - const totalLength = chunks.reduce((sum, doc) => sum + doc.pageContent.length, 0); - return Math.round(totalLength / chunks.length); - } - - private deduplicateChunks(chunks: Document[]): Document[] { - const seen = new Set(); - return chunks.filter(chunk => { - const normalized = chunk.pageContent - .toLowerCase() - .replace(/\s+/g, ' ') - .trim(); - - if (seen.has(normalized)) { - return false; - } - seen.add(normalized); - return true; - }); - } - public async searchSimilarDocuments(query: string, limit: number = 5) { - if (!this.vectorStore) { - console.warn("⚠️ VectorStore non initialisé"); - return []; + if (!this.vectorStore || !this.initialized) { + throw new Error("VectorStore non initialisé"); } try { console.log("🔍 Recherche pour:", query); - const initialResults = await this.vectorStore.similaritySearch( - query, - limit * 2, - { - filter: { source: { $exists: true } }, - minScore: 0.7 - } - ); + const results = await this.vectorStore.similaritySearch(query, limit, { + k: limit + }); - const scoredResults = initialResults - .filter(doc => doc.pageContent.trim().length > 50) - .map(doc => ({ - document: doc, - score: this.calculateRelevanceScore(query, doc.pageContent) - })) - .sort((a, b) => b.score - a.score) - .slice(0, limit) - .map(item => { - const doc = item.document; - const pageNumber = doc.metadata.page_number || doc.metadata.pageNumber || 1; - const title = doc.metadata.title || 'Document'; - const source = doc.metadata.source; - - // Préparer le texte à surligner - const searchText = doc.pageContent - .substring(0, 200) - .replace(/[\n\r]+/g, ' ') - .trim(); - - return new Document({ - pageContent: doc.pageContent, - metadata: { - title: title, - pageNumber: pageNumber, - source: source, - type: doc.metadata.type || 'uploaded', - searchText: searchText, - url: source ? - `/api/uploads/${source}/view?page=${pageNumber}&search=${encodeURIComponent(searchText)}` : - undefined - } - }); - }); - - const mergedResults = this.mergeRelatedChunks(scoredResults); - console.log(`📄 ${mergedResults.length} documents pertinents trouvés après reranking`); - return mergedResults; + console.log(`📄 ${results.length} documents pertinents trouvés`); + return results; } catch (error) { console.error("❌ Erreur de recherche:", error); - return []; + return this.currentDocuments.slice(0, limit); } } - private calculateRelevanceScore(query: string, content: string): number { - const normalizedQuery = query.toLowerCase(); - const normalizedContent = content.toLowerCase(); - - // Basic relevance scoring based on multiple factors - let score = 0; - - // Term frequency - const queryTerms = normalizedQuery.split(/\s+/); - queryTerms.forEach(term => { - const termCount = (normalizedContent.match(new RegExp(term, 'g')) || []).length; - score += termCount * 0.1; - }); - - // Exact phrase matching - if (normalizedContent.includes(normalizedQuery)) { - score += 1; - } - - // Content length penalty (prefer shorter, more focused chunks) - const lengthPenalty = Math.max(0, 1 - (content.length / 5000)); - score *= (1 + lengthPenalty); - - return score; - } - - private mergeRelatedChunks(documents: Document[]): Document[] { - const merged: { [key: string]: Document } = {}; - - documents.forEach(doc => { - const source = doc.metadata?.source || ''; - const page = doc.metadata?.pageNumber || 1; - const key = `${source}-${page}`; - - if (!merged[key]) { - merged[key] = doc; - } else { - const existingDoc = merged[key]; - merged[key] = new Document({ - pageContent: `${existingDoc.pageContent}\n\n${doc.pageContent}`, - metadata: { - ...existingDoc.metadata, - searchText: existingDoc.metadata.searchText - } - }); - } - }); - - return Object.values(merged); - } - public createSearchChain(llm: BaseChatModel) { return RunnableSequence.from([ RunnableMap.from({ @@ -251,15 +119,7 @@ export class RAGDocumentChain { const docs = await this.searchSimilarDocuments(input.query); return docs.map((doc, i) => { const source = doc.metadata?.source || 'Document'; - const title = doc.metadata?.title || ''; - const pageNumber = doc.metadata?.pageNumber; - const url = doc.metadata?.url; - - let sourceInfo = `Source: ${title || source}`; - if (pageNumber) sourceInfo += ` (page ${pageNumber})`; - if (url) sourceInfo += `\nURL: ${url}`; - - return `[Source ${i + 1}] ${doc.pageContent}\n${sourceInfo}`; + return `[Source ${i + 1}] ${doc.pageContent}\nSource: ${source}`; }).join("\n\n"); } }), @@ -289,4 +149,4 @@ export class RAGDocumentChain { public isInitialized(): boolean { return this.vectorStore !== null; } -} \ No newline at end of file +}