Update uploads.ts

RAG optimise
Lucas 2025-01-07 08:26:27 +01:00 committed by GitHub
parent dda24ad349
commit 453a572664


@@ -6,20 +6,24 @@ import crypto from 'crypto';
import fs from 'fs';
import { Embeddings } from '@langchain/core/embeddings';
import { getAvailableEmbeddingModelProviders } from '../lib/providers';
-import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
+import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx';
-import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { Document } from '@langchain/core/documents';
import { RAGDocumentChain } from '../chains/rag_document_upload';
-import { Chroma } from "langchain/vectorstores/chroma";
+import { Chroma } from "@langchain/community/vectorstores/chroma";
const router = express.Router();
+// Add a cache for embeddings, with the correct value type
+const embeddingsCache = new Map<string, number[]>();
+// Optimized text splitter configuration
const splitter = new RecursiveCharacterTextSplitter({
-  chunkSize: 1000,
-  chunkOverlap: 200,
+  chunkSize: 1500,
+  chunkOverlap: 150,
  separators: ["\n\n", "\n", ".", "!", "?", ";", ":", " ", ""],
-  keepSeparator: true,
+  keepSeparator: false,
  lengthFunction: (text) => text.length
});
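// Usage sketch (illustrative, not part of the diff): with chunkSize 1500 and
// chunkOverlap 150, consecutive chunks share up to 150 characters of context,
// so a sentence cut at a boundary still appears whole in one of the two chunks.
async function demoSplit(): Promise<void> {
  const page = new Document({ pageContent: 'Some long text. '.repeat(400), metadata: { source: 'demo.txt' } });
  const chunks = await splitter.splitDocuments([page]);
  console.log(`${chunks.length} chunks of at most 1500 characters`);
}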
@@ -62,6 +66,129 @@ const scoreDocument = (doc: Document): number => {
return wordCount > 10 && sentenceCount > 0 ? 1 : 0;
};
+// Optimized document processing
+const processDocumentInBatches = async (
+  docs: Document[],
+  batchSize: number = 50
+): Promise<Document[]> => {
+  const processedDocs: Document[] = [];
+  for (let i = 0; i < docs.length; i += batchSize) {
+    const batch = docs.slice(i, i + batchSize);
+    const processed = await Promise.all(
+      batch.map(async (doc) => preprocessDocument(doc))
+    );
+    processedDocs.push(...processed);
+  }
+  return processedDocs;
+};
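// Usage sketch (illustrative, not part of the diff): batches of 50 bound how many
// preprocessDocument calls run concurrently, keeping memory flat on large uploads.
async function demoBatches(docs: Document[]): Promise<void> {
  const processed = await processDocumentInBatches(docs, 50);
  console.log(`${processed.length}/${docs.length} documents preprocessed`);
}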
+// Optimized text extraction using native loaders
+const extractDocument = async (filePath: string, mimeType: string): Promise<Document[]> => {
+  try {
+    console.log(`📄 Extracting document: ${filePath} (${mimeType})`);
+    let docs: Document[] = [];
+    if (mimeType === 'application/pdf') {
+      const loader = new PDFLoader(filePath, {
+        splitPages: true,
+        parsedItemSeparator: "\n",
+      });
+      docs = await loader.load();
+    } else if (mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
+      const loader = new DocxLoader(filePath);
+      docs = await loader.load();
+    } else if (mimeType === 'text/plain') {
+      // Handle plain-text files directly
+      const text = fs.readFileSync(filePath, 'utf-8');
+      docs = [new Document({
+        pageContent: text,
+        metadata: {
+          source: filePath,
+          type: 'text',
+          mime_type: mimeType
+        }
+      })];
+    } else {
+      throw new Error(`Unsupported file type: ${mimeType}`);
+    }
+    console.log(`📑 ${docs.length} pages extracted`);
+    // Enrich each page's metadata
+    const enhancedDocs = docs.map((doc, index) => {
+      return new Document({
+        pageContent: doc.pageContent,
+        metadata: {
+          ...doc.metadata,
+          source: filePath,
+          page: index + 1,
+          total_pages: docs.length,
+          mime_type: mimeType,
+          extraction_date: new Date().toISOString()
+        }
+      });
+    });
+    return enhancedDocs;
+  } catch (error) {
+    console.error(`❌ Extraction error: ${error.message}`);
+    throw error;
+  }
+};
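// Usage sketch (illustrative, not part of the diff; 'uploads/demo.pdf' is a made-up path):
async function demoExtract(): Promise<void> {
  const pages = await extractDocument('uploads/demo.pdf', 'application/pdf');
  // Each page carries source, page, total_pages, mime_type and extraction_date metadata
  console.log(pages.length, pages[0]?.metadata);
}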
+// Utility to L2-normalize embedding vectors
+const normalizeL2 = (vector: number[]): number[] => {
+  const norm = Math.sqrt(vector.reduce((sum, val) => sum + val * val, 0));
+  return norm === 0 ? vector : vector.map(v => v / norm);
+};
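// Worked example (illustrative, not part of the diff): the L2 norm of [3, 4] is 5,
// so normalizeL2([3, 4]) returns [0.6, 0.8]. On unit-length vectors the dot product
// equals cosine similarity, which keeps downstream ranking consistent.
console.log(normalizeL2([3, 4])); // [0.6, 0.8]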
+// Optimized embedding generation
+const generateEmbeddings = async (
+  texts: string[],
+  embeddingsModel: Embeddings,
+  dimensions: number = 1536
+): Promise<number[][]> => {
+  try {
+    // Clean and prepare the texts
+    const cleanedTexts = texts.map(text =>
+      text.replace(/\s+/g, ' ')
+        .trim()
+        .slice(0, 8000) // OpenAI input limit
+    ).filter(text => text.length > 0);
+    if (cleanedTexts.length === 0) {
+      throw new Error("No valid text to process");
+    }
+    // Process in batches of 100 (OpenAI limit)
+    const batchSize = 100;
+    const embeddings: number[][] = [];
+    for (let i = 0; i < cleanedTexts.length; i += batchSize) {
+      const batch = cleanedTexts.slice(i, i + batchSize);
+      console.log(`🔄 Processing batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(cleanedTexts.length / batchSize)}`);
+      const batchEmbeddings = await embeddingsModel.embedDocuments(batch);
+      // Resize and normalize where needed
+      const processedEmbeddings = batchEmbeddings.map(emb => {
+        const resized = (emb as number[]).slice(0, dimensions);
+        return normalizeL2(resized);
+      });
+      embeddings.push(...processedEmbeddings);
+    }
+    return embeddings;
+  } catch (error) {
+    console.error("❌ Error generating embeddings:", error);
+    throw error;
+  }
+};
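// Usage sketch (illustrative, not part of the diff): pass any Embeddings instance,
// such as the one resolved from getAvailableEmbeddingModelProviders() below.
async function demoEmbed(model: Embeddings): Promise<void> {
  const vectors = await generateEmbeddings(['first chunk', 'second chunk'], model, 1536);
  console.log(vectors.length, vectors[0].length); // 2 vectors, at most 1536 dimensions each
}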
router.post(
  '/',
  upload.fields([
@@ -85,16 +212,12 @@ router.post(
    }
    const embeddingModels = await getAvailableEmbeddingModelProviders();
-   console.log("🔍 [Uploads] Available models:", Object.keys(embeddingModels));
    const provider = embedding_model_provider ?? Object.keys(embeddingModels)[0];
-   const embeddingModel: Embeddings = embedding_model ?? Object.keys(embeddingModels[provider])[0];
-   console.log("🤖 [Uploads] Selected model:", { provider, model: embeddingModel });
+   const embeddingModel = embedding_model ?? Object.keys(embeddingModels[provider])[0];
    let embeddingsModel: Embeddings | undefined;
    if (embeddingModels[provider] && embeddingModels[provider][embeddingModel]) {
-     embeddingsModel = embeddingModels[provider][embeddingModel].model as Embeddings | undefined;
+     embeddingsModel = embeddingModels[provider][embeddingModel].model as Embeddings;
    }
    if (!embeddingsModel) {
@@ -104,138 +227,90 @@ router.post(
    }
    const files = req.files['files'] as Express.Multer.File[];
-   console.log("📁 [Uploads] Files received:", files?.map(f => ({
-     name: f.originalname,
-     path: f.path,
-     type: f.mimetype
-   })));
-   if (!files || files.length === 0) {
+   if (!files?.length) {
      console.warn("⚠️ [Uploads] No files received");
      res.status(400).json({ message: 'No files uploaded' });
      return;
    }
-   const processedDocs: Document[] = [];
-   const ragChain = new RAGDocumentChain();
-   let totalPages = 0;
-   await Promise.all(
+   // Process files in parallel
+   const results = await Promise.all(
      files.map(async (file) => {
-       console.log(`📄 [Uploads] Processing file: ${file.originalname}`);
-       let docs: Document[] = [];
+       try {
+         console.log(`📄 [Uploads] Processing file: ${file.originalname}`);
+         let docs: Document[] = [];
+         const cacheKey = `${file.path}_${embedding_model}`;
+         if (embeddingsCache.has(cacheKey)) {
+           console.log("🎯 [Uploads] Using cache for", file.originalname);
+           return {
+             fileName: file.originalname,
+             fileId: file.filename.replace(/\.\w+$/, ''),
+             cached: true
+           };
+         }
-       if (file.mimetype === 'application/pdf') {
-         console.log(`📚 [Uploads] Loading PDF: ${file.path}`);
-         const loader = new PDFLoader(file.path, {
-           splitPages: true
-         });
-         docs = await loader.load();
-         totalPages += docs.length;
-       } else if (file.mimetype === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
-         console.log(`📝 [Uploads] Loading DOCX: ${file.path}`);
-         const loader = new DocxLoader(file.path);
-         docs = await loader.load();
-         totalPages += docs.length;
-       } else if (file.mimetype === 'text/plain') {
-         console.log(`📄 [Uploads] Loading TXT: ${file.path}`);
-         const text = fs.readFileSync(file.path, 'utf-8');
-         docs = [new Document({
-           pageContent: text,
-           metadata: {
-             title: file.originalname,
-             source: file.path,
-             type: 'text'
-           }
-         })];
-         totalPages += 1;
-       }
+         docs = await extractDocument(file.path, file.mimetype);
+         const processedDocs = await processDocumentInBatches(docs);
+         console.log(`✂️ [Uploads] ${processedDocs.length} documents processed`);
-       const preprocessedDocs = docs.map(preprocessDocument);
-       const scoredDocs = preprocessedDocs.filter(doc => scoreDocument(doc) > 0);
-       console.log(`✂️ [Uploads] Splitting document into ${scoredDocs.length} valid parts`);
-       const splitted = await splitter.splitDocuments(scoredDocs);
+         // Use the new embeddings helper
+         const embeddings = await generateEmbeddings(
+           processedDocs.map(doc => doc.pageContent),
+           embeddingsModel,
+           1536 // Default dimension for text-embedding-3-small
+         );
-       const enrichedDocs = splitted.map((doc, index) => {
-         const pageNumber = Math.floor(index / (splitted.length / docs.length)) + 1;
-         return new Document({
-           pageContent: doc.pageContent,
-           metadata: {
-             ...doc.metadata,
-             source: file.path,
-             title: file.originalname,
-             page_number: pageNumber,
-             chunk_index: index,
-             total_chunks: splitted.length,
-             file_type: file.mimetype,
-             search_text: doc.pageContent.substring(0, 100).trim()
-           }
-         });
-       });
-       processedDocs.push(...enrichedDocs);
+         // Cache the first embedding
+         if (embeddings.length > 0) {
+           embeddingsCache.set(cacheKey, embeddings[0]);
+         }
-       const pathToSave = file.path.replace(/\.\w+$/, '-extracted.json');
-       const contentToSave = {
-         title: file.originalname,
-         contents: enrichedDocs.map((doc) => ({
-           content: doc.pageContent,
-           metadata: doc.metadata
-         })),
-         pageCount: docs.length,
-         processingDate: new Date().toISOString()
-       };
-       fs.writeFileSync(pathToSave, JSON.stringify(contentToSave, null, 2));
-       console.log(`🧮 [Uploads] Generating embeddings for ${enrichedDocs.length} chunks`);
-       const embeddings = await embeddingsModel.embedDocuments(
-         enrichedDocs.map((doc) => doc.pageContent)
-       );
-       const pathToSaveEmbeddings = file.path.replace(/\.\w+$/, '-embeddings.json');
-       const embeddingsToSave = {
-         title: file.originalname,
-         embeddings: embeddings.map((embedding, index) => ({
-           vector: embedding,
-           metadata: enrichedDocs[index].metadata
-         }))
-       };
-       fs.writeFileSync(pathToSaveEmbeddings, JSON.stringify(embeddingsToSave));
+         // Save with normalized embeddings
+         const pathToSave = file.path.replace(/\.\w+$/, '-extracted.json');
+         fs.writeFileSync(pathToSave, JSON.stringify({
+           title: file.originalname,
+           contents: processedDocs.map((doc, index) => ({
+             content: doc.pageContent,
+             metadata: doc.metadata,
+             embedding: embeddings[index]
+           })),
+           pageCount: docs.length,
+           processingDate: new Date().toISOString()
+         }, null, 2));
+         return {
+           fileName: file.originalname,
+           fileId: file.filename.replace(/\.\w+$/, ''),
+           stats: {
+             chunks: processedDocs.length,
+             pages: docs.length,
+             embeddingsGenerated: embeddings.length
+           }
+         };
+       } catch (error) {
+         console.error(`❌ Error processing ${file.originalname}:`, error);
+         return {
+           fileName: file.originalname,
+           fileId: file.filename.replace(/\.\w+$/, ''),
+           error: error.message
+         };
+       }
      })
    );
-   console.log("🔄 [Uploads] Initializing vectorStore with", processedDocs.length, "documents");
-   const initResult = await ragChain.initializeVectorStoreFromDocuments(
-     processedDocs,
-     embeddingsModel
-   );
-   console.log("✅ [Uploads] VectorStore initialized:", initResult);
-   res.status(200).json({
-     files: files.map((file) => ({
-       fileName: file.originalname,
-       fileExtension: file.filename.split('.').pop(),
-       fileId: file.filename.replace(/\.\w+$/, ''),
-       stats: {
-         chunks: processedDocs.filter(d => d.metadata.source === file.path).length,
-         pages: totalPages
-       }
-     })),
-   });
+   res.status(200).json({ files: results });
  } catch (err: any) {
    console.error("❌ [Uploads] Error:", {
      message: err.message,
      stack: err.stack,
      name: err.name
    });
    logger.error(`Error in uploading file results: ${err.message}`);
    res.status(500).json({ message: 'An error has occurred.' });
  }
-  },
+  }
);
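// Client-side sketch (illustrative, not part of the diff; the base URL and model
// names are assumptions): the field names match the handler above.
async function demoUpload(file: Blob): Promise<void> {
  const form = new FormData();
  form.append('files', file, 'report.pdf');
  form.append('embedding_model_provider', 'openai');
  form.append('embedding_model', 'text-embedding-3-small');
  const res = await fetch('http://localhost:3001/api/uploads', { method: 'POST', body: form });
  console.log(await res.json()); // { files: [{ fileName, fileId, stats | cached | error }] }
}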
router.get('/:fileId/view', async (req, res) => {
@@ -287,4 +362,55 @@ router.get('/:fileId/view', async (req, res) => {
}
});
+router.get('/:fileId/content', async (req, res) => {
+  try {
+    const { fileId } = req.params;
+    // Look for the PDF file in the uploads folder
+    const uploadsDir = path.join(process.cwd(), 'uploads');
+    const files = fs.readdirSync(uploadsDir);
+    const pdfFile = files.find(file => file.startsWith(fileId) && file.endsWith('.pdf'));
+    if (!pdfFile) {
+      console.error(`❌ PDF not found for ID: ${fileId}`);
+      return res.status(404).json({ error: 'PDF document not found' });
+    }
+    const filePath = path.join(uploadsDir, pdfFile);
+    console.log("📄 Sending PDF file:", filePath);
+    // Headers for the PDF response
+    res.setHeader('Content-Type', 'application/pdf');
+    res.setHeader('Content-Disposition', `inline; filename="${pdfFile}"`);
+    res.setHeader('Cache-Control', 'public, max-age=3600'); // One-hour cache
+    // Send the file
+    res.sendFile(filePath);
+  } catch (error) {
+    console.error('❌ Error accessing PDF:', error);
+    res.status(500).json({ error: 'Error accessing the document' });
+  }
+});
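// Client-side sketch (illustrative, not part of the diff; base URL is an assumption):
async function demoFetchPdf(fileId: string): Promise<Blob> {
  const res = await fetch(`http://localhost:3001/api/uploads/${fileId}/content`);
  return res.blob(); // served inline as application/pdf, cacheable for one hour
}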
+// Route for document metadata
+router.get('/:fileId/metadata', async (req, res) => {
+  try {
+    const { fileId } = req.params;
+    // Look for the metadata JSON file
+    const uploadsDir = path.join(process.cwd(), 'uploads');
+    const metadataPath = path.join(uploadsDir, `${fileId}-extracted.json`);
+    if (!fs.existsSync(metadataPath)) {
+      return res.status(404).json({ error: 'Metadata not found' });
+    }
+    const metadata = JSON.parse(fs.readFileSync(metadataPath, 'utf-8'));
+    res.json(metadata);
+  } catch (error) {
+    console.error('❌ Error reading metadata:', error);
+    res.status(500).json({ error: 'Error reading metadata' });
+  }
+});
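// Client-side sketch (illustrative, not part of the diff; base URL is an assumption):
// the response mirrors the '-extracted.json' file written at upload time.
async function demoFetchMetadata(fileId: string): Promise<void> {
  const res = await fetch(`http://localhost:3001/api/uploads/${fileId}/metadata`);
  const meta = await res.json();
  console.log(meta.title, meta.pageCount, meta.contents.length);
}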
export default router;