Update rag_document_upload.ts
RAG optimise
parent 453a572664
commit 51de59b65c
1 changed file with 64 additions and 204 deletions
@@ -1,4 +1,3 @@
-import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 import { Document } from '@langchain/core/documents';
 import { Embeddings } from '@langchain/core/embeddings';
 import { Chroma } from '@langchain/community/vectorstores/chroma';
@@ -9,7 +8,6 @@ import { StringOutputParser } from '@langchain/core/output_parsers';
 import formatChatHistoryAsString from '../utils/formatHistory';
 import { BaseMessage } from '@langchain/core/messages';
 
-// Type local pour la chaîne de recherche
 type SearchInput = {
   query: string;
   chat_history: BaseMessage[];
@@ -17,42 +15,19 @@ type SearchInput = {
 };
 
 export class RAGDocumentChain {
+  private static instance: RAGDocumentChain;
   private vectorStore: Chroma | null = null;
-  private textSplitter = new RecursiveCharacterTextSplitter({
-    chunkSize: 1000,
-    chunkOverlap: 200,
-    separators: ["\n\n", "\n", ".", "!", "?", ";", ":", " ", ""],
-    keepSeparator: true,
-    lengthFunction: (text) => text.length
-  });
-
-  // Add chunk preprocessing
-  private preprocessChunk(text: string): string {
-    return text
-      .replace(/\s+/g, ' ')
-      .replace(/\n+/g, ' ')
-      .trim();
-  }
-
-  // Add metadata enrichment
-  private enrichChunkMetadata(doc: Document): Document {
-    const metadata = {
-      ...doc.metadata,
-      chunk_type: 'text',
-      word_count: doc.pageContent.split(/\s+/).length,
-      processed_date: new Date().toISOString()
-    };
-    return new Document({
-      pageContent: this.preprocessChunk(doc.pageContent),
-      metadata
-    });
-  }
-
-  // Add chunk scoring
-  private scoreChunk(chunk: string): number {
-    const wordCount = chunk.split(/\s+/).length;
-    const sentenceCount = chunk.split(/[.!?]+/).length;
-    return wordCount > 10 && sentenceCount > 0 ? 1 : 0;
-  }
+  private readonly collectionName = 'uploaded_docs';
+  private initialized = false;
+  private currentDocuments: Document[] = [];
+
+  private constructor() {}
+
+  public static getInstance(): RAGDocumentChain {
+    if (!RAGDocumentChain.instance) {
+      RAGDocumentChain.instance = new RAGDocumentChain();
+    }
+    return RAGDocumentChain.instance;
+  }
 
   public async initializeVectorStoreFromDocuments(
@@ -60,46 +35,54 @@
     embeddings: Embeddings
   ) {
     try {
-      console.log("🔄 Préparation des documents...");
-      // Validate and preprocess documents
-      const validDocuments = documents
-        .filter(doc => doc.pageContent && doc.pageContent.trim().length > 50)
-        .map(doc => this.enrichChunkMetadata(doc));
-
-      // Split documents into chunks
-      const texts = await this.textSplitter.splitDocuments(validDocuments);
-      console.log(`📄 ${texts.length} chunks créés`);
-
-      // Score and filter chunks
-      const scoredTexts = texts.filter(doc => this.scoreChunk(doc.pageContent) > 0);
-      console.log(`📄 ${scoredTexts.length} chunks valides après scoring`);
-
-      // Deduplicate chunks
-      const uniqueTexts = this.deduplicateChunks(scoredTexts);
-      console.log(`📄 ${uniqueTexts.length} chunks uniques après déduplication`);
-
-      // Initialize vector store with optimized settings
-      this.vectorStore = await Chroma.fromDocuments(
-        uniqueTexts,
-        embeddings,
-        {
-          collectionName: "uploaded_docs",
-          url: "http://chroma:8000",
-          collectionMetadata: {
-            "hnsw:space": "cosine",
-            "hnsw:construction_ef": 100, // Increased for better index quality
-            "hnsw:search_ef": 50, // Balanced for search performance
-            "hnsw:m": 16 // Number of connections per element
-          }
-        }
+      // Filtrer les documents invalides
+      const validDocuments = documents.filter(doc =>
+        doc.pageContent &&
+        typeof doc.pageContent === 'string' &&
+        doc.pageContent.trim().length > 0
       );
 
-      console.log("✅ VectorStore initialisé avec succès");
+      console.log(`📄 Documents valides: ${validDocuments.length}/${documents.length}`);
 
-      return {
-        totalDocuments: documents.length,
-        validChunks: uniqueTexts.length,
-        averageChunkSize: this.calculateAverageChunkSize(uniqueTexts)
+      // Si déjà initialisé avec les mêmes documents, ne rien faire
+      const sameDocuments = this.initialized &&
+        this.currentDocuments.length === validDocuments.length &&
+        validDocuments.every((doc, index) =>
+          doc.pageContent === this.currentDocuments[index].pageContent
+        );
+
+      if (sameDocuments) {
+        console.log("📚 Réutilisation de la collection existante");
+        return {
+          totalDocuments: documents.length,
+          validDocuments: this.currentDocuments.length,
+          reused: true
+        };
+      }
+
+      if (!this.vectorStore) {
+        console.log("🔄 Initialisation du vectorStore");
+        this.vectorStore = await Chroma.fromDocuments(validDocuments, embeddings, {
+          collectionName: this.collectionName,
+          url: "http://chroma:8000"
+        });
+        this.initialized = true;
+      } else {
+        console.log("🔄 Réinitialisation de la collection");
+        // Créer une nouvelle instance avec les nouveaux documents
+        this.vectorStore = await Chroma.fromDocuments(validDocuments, embeddings, {
+          collectionName: this.collectionName,
+          url: "http://chroma:8000"
+        });
+        this.initialized = true;
+      }
+
+      this.currentDocuments = validDocuments;
+
+      return {
+        totalDocuments: documents.length,
+        validDocuments: validDocuments.length,
+        reused: false
       };
     } catch (error) {
       console.error("❌ Erreur lors de l'initialisation:", error);
@@ -107,141 +90,26 @@ export class RAGDocumentChain {
     }
   }
 
-  private calculateAverageChunkSize(chunks: Document[]): number {
-    if (chunks.length === 0) return 0;
-    const totalLength = chunks.reduce((sum, doc) => sum + doc.pageContent.length, 0);
-    return Math.round(totalLength / chunks.length);
-  }
-
-  private deduplicateChunks(chunks: Document[]): Document[] {
-    const seen = new Set<string>();
-    return chunks.filter(chunk => {
-      const normalized = chunk.pageContent
-        .toLowerCase()
-        .replace(/\s+/g, ' ')
-        .trim();
-
-      if (seen.has(normalized)) {
-        return false;
-      }
-      seen.add(normalized);
-      return true;
-    });
-  }
-
   public async searchSimilarDocuments(query: string, limit: number = 5) {
-    if (!this.vectorStore) {
-      console.warn("⚠️ VectorStore non initialisé");
-      return [];
+    if (!this.vectorStore || !this.initialized) {
+      throw new Error("VectorStore non initialisé");
     }
 
     try {
       console.log("🔍 Recherche pour:", query);
 
-      const initialResults = await this.vectorStore.similaritySearch(
-        query,
-        limit * 2,
-        {
-          filter: { source: { $exists: true } },
-          minScore: 0.7
-        }
-      );
-
-      const scoredResults = initialResults
-        .filter(doc => doc.pageContent.trim().length > 50)
-        .map(doc => ({
-          document: doc,
-          score: this.calculateRelevanceScore(query, doc.pageContent)
-        }))
-        .sort((a, b) => b.score - a.score)
-        .slice(0, limit)
-        .map(item => {
-          const doc = item.document;
-          const pageNumber = doc.metadata.page_number || doc.metadata.pageNumber || 1;
-          const title = doc.metadata.title || 'Document';
-          const source = doc.metadata.source;
-
-          // Préparer le texte à surligner
-          const searchText = doc.pageContent
-            .substring(0, 200)
-            .replace(/[\n\r]+/g, ' ')
-            .trim();
-
-          return new Document({
-            pageContent: doc.pageContent,
-            metadata: {
-              title: title,
-              pageNumber: pageNumber,
-              source: source,
-              type: doc.metadata.type || 'uploaded',
-              searchText: searchText,
-              url: source ?
-                `/api/uploads/${source}/view?page=${pageNumber}&search=${encodeURIComponent(searchText)}` :
-                undefined
-            }
-          });
-        });
+      const results = await this.vectorStore.similaritySearch(query, limit, {
+        k: limit
+      });
 
-      const mergedResults = this.mergeRelatedChunks(scoredResults);
-      console.log(`📄 ${mergedResults.length} documents pertinents trouvés après reranking`);
-      return mergedResults;
+      console.log(`📄 ${results.length} documents pertinents trouvés`);
+      return results;
     } catch (error) {
       console.error("❌ Erreur de recherche:", error);
-      return [];
+      return this.currentDocuments.slice(0, limit);
     }
   }
 
-  private calculateRelevanceScore(query: string, content: string): number {
-    const normalizedQuery = query.toLowerCase();
-    const normalizedContent = content.toLowerCase();
-
-    // Basic relevance scoring based on multiple factors
-    let score = 0;
-
-    // Term frequency
-    const queryTerms = normalizedQuery.split(/\s+/);
-    queryTerms.forEach(term => {
-      const termCount = (normalizedContent.match(new RegExp(term, 'g')) || []).length;
-      score += termCount * 0.1;
-    });
-
-    // Exact phrase matching
-    if (normalizedContent.includes(normalizedQuery)) {
-      score += 1;
-    }
-
-    // Content length penalty (prefer shorter, more focused chunks)
-    const lengthPenalty = Math.max(0, 1 - (content.length / 5000));
-    score *= (1 + lengthPenalty);
-
-    return score;
-  }
-
-  private mergeRelatedChunks(documents: Document[]): Document[] {
-    const merged: { [key: string]: Document } = {};
-
-    documents.forEach(doc => {
-      const source = doc.metadata?.source || '';
-      const page = doc.metadata?.pageNumber || 1;
-      const key = `${source}-${page}`;
-
-      if (!merged[key]) {
-        merged[key] = doc;
-      } else {
-        const existingDoc = merged[key];
-        merged[key] = new Document({
-          pageContent: `${existingDoc.pageContent}\n\n${doc.pageContent}`,
-          metadata: {
-            ...existingDoc.metadata,
-            searchText: existingDoc.metadata.searchText
-          }
-        });
-      }
-    });
-
-    return Object.values(merged);
-  }
-
   public createSearchChain(llm: BaseChatModel) {
     return RunnableSequence.from([
       RunnableMap.from({
@@ -251,15 +119,7 @@ export class RAGDocumentChain {
           const docs = await this.searchSimilarDocuments(input.query);
           return docs.map((doc, i) => {
             const source = doc.metadata?.source || 'Document';
-            const title = doc.metadata?.title || '';
-            const pageNumber = doc.metadata?.pageNumber;
-            const url = doc.metadata?.url;
-
-            let sourceInfo = `Source: ${title || source}`;
-            if (pageNumber) sourceInfo += ` (page ${pageNumber})`;
-            if (url) sourceInfo += `\nURL: ${url}`;
-
-            return `[Source ${i + 1}] ${doc.pageContent}\n${sourceInfo}`;
+            return `[Source ${i + 1}] ${doc.pageContent}\nSource: ${source}`;
           }).join("\n\n");
         }
       }),
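Usage note: the sketch below shows how the refactored singleton from this commit would typically be driven from calling code. Only getInstance, initializeVectorStoreFromDocuments and searchSimilarDocuments are taken from the diff above; the relative import path and the OpenAIEmbeddings provider are assumptions for illustration, not part of the commit.

// Minimal usage sketch (assumed import path and embeddings provider).
import { OpenAIEmbeddings } from '@langchain/openai';
import { Document } from '@langchain/core/documents';
import { RAGDocumentChain } from './rag_document_upload'; // hypothetical path

async function indexAndSearch() {
  // The constructor is private; the instance comes from the singleton accessor.
  const ragChain = RAGDocumentChain.getInstance();

  const docs = [
    new Document({ pageContent: 'Example content…', metadata: { source: 'example.pdf' } }),
  ];

  // Embeds the valid documents into the 'uploaded_docs' Chroma collection,
  // or reports reuse when the same documents were already indexed.
  const stats = await ragChain.initializeVectorStoreFromDocuments(docs, new OpenAIEmbeddings());
  console.log(stats); // { totalDocuments, validDocuments, reused }

  // Plain similarity search; on error the chain falls back to the cached documents.
  const results = await ragChain.searchSimilarDocuments('example query', 5);
  console.log(results.map((doc) => doc.metadata?.source));
}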