Update uploads.ts

RAG optimisation

parent dda24ad349
commit 453a572664

1 changed file with 249 additions and 123 deletions
@@ -6,20 +6,24 @@ import crypto from 'crypto';
 import fs from 'fs';
 import { Embeddings } from '@langchain/core/embeddings';
 import { getAvailableEmbeddingModelProviders } from '../lib/providers';
-import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
+import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
 import { DocxLoader } from '@langchain/community/document_loaders/fs/docx';
-import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 import { Document } from '@langchain/core/documents';
 import { RAGDocumentChain } from '../chains/rag_document_upload';
-import { Chroma } from "langchain/vectorstores/chroma";
+import { Chroma } from "@langchain/community/vectorstores/chroma";
 
 const router = express.Router();
 
+// Cache for the embeddings, with the proper value type
+const embeddingsCache = new Map<string, number[]>();
+
+// Optimised text splitter configuration
 const splitter = new RecursiveCharacterTextSplitter({
-  chunkSize: 1000,
-  chunkOverlap: 200,
+  chunkSize: 1500,
+  chunkOverlap: 150,
   separators: ["\n\n", "\n", ".", "!", "?", ";", ":", " ", ""],
-  keepSeparator: true,
+  keepSeparator: false,
   lengthFunction: (text) => text.length
 });
 
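For orientation, here is a minimal standalone sketch of the retuned splitter. The options mirror the hunk above; the sample text and the demoSplitter name are invented:

// Standalone TypeScript sketch of the new splitter settings.
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';

const demoSplitter = new RecursiveCharacterTextSplitter({
  chunkSize: 1500,   // was 1000: fewer, larger chunks per document
  chunkOverlap: 150, // was 200: slightly less duplicated text between chunks
  separators: ["\n\n", "\n", ".", "!", "?", ";", ":", " ", ""],
  keepSeparator: false,
});

const sample = 'First sentence. Second sentence.\n\nA new paragraph follows.';
demoSplitter.splitText(sample).then((chunks) => {
  // Far below the 1500-character limit, so everything stays in one chunk.
  console.log(chunks.length, chunks);
});

Larger chunks mean fewer embedding calls per document, at the cost of coarser retrieval granularity.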
@@ -62,6 +66,129 @@ const scoreDocument = (doc: Document): number => {
   return wordCount > 10 && sentenceCount > 0 ? 1 : 0;
 };
 
+// Optimised document processing
+const processDocumentInBatches = async (
+  docs: Document[],
+  batchSize: number = 50
+): Promise<Document[]> => {
+  const processedDocs: Document[] = [];
+
+  for (let i = 0; i < docs.length; i += batchSize) {
+    const batch = docs.slice(i, i + batchSize);
+    const processed = await Promise.all(
+      batch.map(async (doc) => preprocessDocument(doc))
+    );
+    processedDocs.push(...processed);
+  }
+
+  return processedDocs;
+};
+
+// Optimised text extraction using native loaders
+const extractDocument = async (filePath: string, mimeType: string): Promise<Document[]> => {
+  try {
+    console.log(`📄 Extracting document: ${filePath} (${mimeType})`);
+
+    let docs: Document[] = [];
+
+    if (mimeType === 'application/pdf') {
+      const loader = new PDFLoader(filePath, {
+        splitPages: true,
+        parsedItemSeparator: "\n",
+      });
+      docs = await loader.load();
+    } else if (mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
+      const loader = new DocxLoader(filePath);
+      docs = await loader.load();
+    } else if (mimeType === 'text/plain') {
+      // Handle plain-text files directly
+      const text = fs.readFileSync(filePath, 'utf-8');
+      docs = [new Document({
+        pageContent: text,
+        metadata: {
+          source: filePath,
+          type: 'text',
+          mime_type: mimeType
+        }
+      })];
+    } else {
+      throw new Error(`Unsupported file type: ${mimeType}`);
+    }
+
+    console.log(`📑 ${docs.length} pages extracted`);
+
+    // Enrich the extracted documents
+    const enhancedDocs = docs.map((doc, index) => {
+      return new Document({
+        pageContent: doc.pageContent,
+        metadata: {
+          ...doc.metadata,
+          source: filePath,
+          page: index + 1,
+          total_pages: docs.length,
+          mime_type: mimeType,
+          extraction_date: new Date().toISOString()
+        }
+      });
+    });
+
+    return enhancedDocs;
+  } catch (error) {
+    console.error(`❌ Extraction error: ${error.message}`);
+    throw error;
+  }
+};
+
+// Utility to L2-normalise the embeddings
+const normalizeL2 = (vector: number[]): number[] => {
+  const norm = Math.sqrt(vector.reduce((sum, val) => sum + val * val, 0));
+  return norm === 0 ? vector : vector.map(v => v / norm);
+};
+
+// Optimised embeddings generation
+const generateEmbeddings = async (
+  texts: string[],
+  embeddingsModel: Embeddings,
+  dimensions: number = 1536
+): Promise<number[][]> => {
+  try {
+    // Clean and prepare the texts
+    const cleanedTexts = texts.map(text =>
+      text.replace(/\s+/g, ' ')
+        .trim()
+        .slice(0, 8000) // OpenAI limit
+    ).filter(text => text.length > 0);
+
+    if (cleanedTexts.length === 0) {
+      throw new Error("No valid text to process");
+    }
+
+    // Process in batches of 100 (OpenAI limit)
+    const batchSize = 100;
+    const embeddings: number[][] = [];
+
+    for (let i = 0; i < cleanedTexts.length; i += batchSize) {
+      const batch = cleanedTexts.slice(i, i + batchSize);
+      console.log(`🔄 Processing batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(cleanedTexts.length / batchSize)}`);
+
+      const batchEmbeddings = await embeddingsModel.embedDocuments(batch);
+
+      // Resize and normalise where necessary
+      const processedEmbeddings = batchEmbeddings.map(emb => {
+        const resized = (emb as number[]).slice(0, dimensions);
+        return normalizeL2(resized);
+      });
+
+      embeddings.push(...processedEmbeddings);
+    }
+
+    return embeddings;
+  } catch (error) {
+    console.error("❌ Error while generating embeddings:", error);
+    throw error;
+  }
+};
+
 router.post(
   '/',
   upload.fields([
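normalizeL2 is a pure function, so it can be sanity-checked in isolation. The sketch below restates it verbatim so it runs on its own:

// Standalone check of the L2 normalisation applied to every embedding batch.
const normalizeL2 = (vector: number[]): number[] => {
  const norm = Math.sqrt(vector.reduce((sum, val) => sum + val * val, 0));
  return norm === 0 ? vector : vector.map(v => v / norm);
};

console.log(normalizeL2([3, 4]));                // [0.6, 0.8]
console.log(Math.hypot(...normalizeL2([3, 4]))); // 1, i.e. unit length
console.log(normalizeL2([0, 0]));                // the zero vector passes through unchanged

On unit-length vectors, cosine similarity reduces to a plain dot product, which is presumably why the vectors are normalised before being persisted.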
@@ -85,16 +212,12 @@ router.post(
       }
 
       const embeddingModels = await getAvailableEmbeddingModelProviders();
-      console.log("🔍 [Uploads] Available models:", Object.keys(embeddingModels));
-
       const provider = embedding_model_provider ?? Object.keys(embeddingModels)[0];
-      const embeddingModel: Embeddings = embedding_model ?? Object.keys(embeddingModels[provider])[0];
+      const embeddingModel = embedding_model ?? Object.keys(embeddingModels[provider])[0];
 
-      console.log("🤖 [Uploads] Selected model:", { provider, model: embeddingModel });
-
       let embeddingsModel: Embeddings | undefined;
       if (embeddingModels[provider] && embeddingModels[provider][embeddingModel]) {
-        embeddingsModel = embeddingModels[provider][embeddingModel].model as Embeddings | undefined;
+        embeddingsModel = embeddingModels[provider][embeddingModel].model as Embeddings;
       }
 
       if (!embeddingsModel) {
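Dropping the explicit Embeddings type annotation on embeddingModel fixes a type mismatch: the value is a model name used as a lookup key, not a model instance. A hypothetical sketch of the same fallback pattern against a mocked provider map (all entries are invented):

// Mocked shape of provider -> model name -> { model }.
type ProviderMap = Record<string, Record<string, { model: unknown }>>;

const available: ProviderMap = {
  openai: { 'text-embedding-3-small': { model: {} } }, // made-up entry
};

const requestedProvider: string | undefined = undefined; // nothing requested
const requestedModel: string | undefined = undefined;

// The ?? operator falls back to the first available provider and model.
const provider = requestedProvider ?? Object.keys(available)[0];
const modelName = requestedModel ?? Object.keys(available[provider])[0];
console.log(provider, modelName); // "openai" "text-embedding-3-small"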
@@ -104,138 +227,90 @@ router.post(
       }
 
       const files = req.files['files'] as Express.Multer.File[];
-      console.log("📁 [Uploads] Files received:", files?.map(f => ({
-        name: f.originalname,
-        path: f.path,
-        type: f.mimetype
-      })));
-
-      if (!files || files.length === 0) {
+      if (!files?.length) {
         console.warn("⚠️ [Uploads] No files received");
         res.status(400).json({ message: 'No files uploaded' });
         return;
       }
 
-      const processedDocs: Document[] = [];
-      const ragChain = new RAGDocumentChain();
-      let totalPages = 0;
-
-      await Promise.all(
+      // Process the files in parallel
+      const results = await Promise.all(
         files.map(async (file) => {
-          console.log(`📄 [Uploads] Processing file: ${file.originalname}`);
-          let docs: Document[] = [];
-
-          if (file.mimetype === 'application/pdf') {
-            console.log(`📚 [Uploads] Loading the PDF: ${file.path}`);
-            const loader = new PDFLoader(file.path, {
-              splitPages: true
-            });
-            docs = await loader.load();
-            totalPages += docs.length;
-          } else if (file.mimetype === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
-            console.log(`📝 [Uploads] Loading the DOCX: ${file.path}`);
-            const loader = new DocxLoader(file.path);
-            docs = await loader.load();
-            totalPages += docs.length;
-          } else if (file.mimetype === 'text/plain') {
-            console.log(`📄 [Uploads] Loading the TXT: ${file.path}`);
-            const text = fs.readFileSync(file.path, 'utf-8');
-            docs = [new Document({
-              pageContent: text,
-              metadata: {
-                title: file.originalname,
-                source: file.path,
-                type: 'text'
-              }
-            })];
-            totalPages += 1;
-          }
-
-          const preprocessedDocs = docs.map(preprocessDocument);
-          const scoredDocs = preprocessedDocs.filter(doc => scoreDocument(doc) > 0);
-
-          console.log(`✂️ [Uploads] Splitting the document into ${scoredDocs.length} valid parts`);
-          const splitted = await splitter.splitDocuments(scoredDocs);
-
-          const enrichedDocs = splitted.map((doc, index) => {
-            const pageNumber = Math.floor(index / (splitted.length / docs.length)) + 1;
-            return new Document({
-              pageContent: doc.pageContent,
-              metadata: {
-                ...doc.metadata,
-                source: file.path,
-                title: file.originalname,
-                page_number: pageNumber,
-                chunk_index: index,
-                total_chunks: splitted.length,
-                file_type: file.mimetype,
-                search_text: doc.pageContent.substring(0, 100).trim()
-              }
-            });
-          });
-
-          processedDocs.push(...enrichedDocs);
-
-          const pathToSave = file.path.replace(/\.\w+$/, '-extracted.json');
-          const contentToSave = {
-            title: file.originalname,
-            contents: enrichedDocs.map((doc) => ({
-              content: doc.pageContent,
-              metadata: doc.metadata
-            })),
-            pageCount: docs.length,
-            processingDate: new Date().toISOString()
-          };
-
-          fs.writeFileSync(pathToSave, JSON.stringify(contentToSave, null, 2));
-
-          console.log(`🧮 [Uploads] Generating embeddings for ${enrichedDocs.length} chunks`);
-          const embeddings = await embeddingsModel.embedDocuments(
-            enrichedDocs.map((doc) => doc.pageContent)
-          );
-
-          const pathToSaveEmbeddings = file.path.replace(/\.\w+$/, '-embeddings.json');
-          const embeddingsToSave = {
-            title: file.originalname,
-            embeddings: embeddings.map((embedding, index) => ({
-              vector: embedding,
-              metadata: enrichedDocs[index].metadata
-            }))
-          };
-
-          fs.writeFileSync(pathToSaveEmbeddings, JSON.stringify(embeddingsToSave));
+          try {
+            console.log(`📄 [Uploads] Processing file: ${file.originalname}`);
+
+            let docs: Document[] = [];
+            const cacheKey = `${file.path}_${embedding_model}`;
+
+            if (embeddingsCache.has(cacheKey)) {
+              console.log("🎯 [Uploads] Using the cache for", file.originalname);
+              return {
+                fileName: file.originalname,
+                fileId: file.filename.replace(/\.\w+$/, ''),
+                cached: true
+              };
+            }
+
+            docs = await extractDocument(file.path, file.mimetype);
+            const processedDocs = await processDocumentInBatches(docs);
+            console.log(`✂️ [Uploads] ${processedDocs.length} documents processed`);
+
+            // Use the new embeddings helper
+            const embeddings = await generateEmbeddings(
+              processedDocs.map(doc => doc.pageContent),
+              embeddingsModel,
+              1536 // Default dimension for text-embedding-3-small
+            );
+
+            // Cache the first embedding
+            if (embeddings.length > 0) {
+              embeddingsCache.set(cacheKey, embeddings[0]);
+            }
+
+            // Save together with the normalised embeddings
+            const pathToSave = file.path.replace(/\.\w+$/, '-extracted.json');
+            fs.writeFileSync(pathToSave, JSON.stringify({
+              title: file.originalname,
+              contents: processedDocs.map((doc, index) => ({
+                content: doc.pageContent,
+                metadata: doc.metadata,
+                embedding: embeddings[index]
+              })),
+              pageCount: docs.length,
+              processingDate: new Date().toISOString()
+            }, null, 2));
+
+            return {
+              fileName: file.originalname,
+              fileId: file.filename.replace(/\.\w+$/, ''),
+              stats: {
+                chunks: processedDocs.length,
+                pages: docs.length,
+                embeddingsGenerated: embeddings.length
+              }
+            };
+          } catch (error) {
+            console.error(`❌ Error while processing ${file.originalname}:`, error);
+            return {
+              fileName: file.originalname,
+              fileId: file.filename.replace(/\.\w+$/, ''),
+              error: error.message
+            };
+          }
         })
       );
 
-      console.log("🔄 [Uploads] Initialising the vectorStore with", processedDocs.length, "documents");
-      const initResult = await ragChain.initializeVectorStoreFromDocuments(
-        processedDocs,
-        embeddingsModel
-      );
-
-      console.log("✅ [Uploads] VectorStore initialised:", initResult);
-
-      res.status(200).json({
-        files: files.map((file) => ({
-          fileName: file.originalname,
-          fileExtension: file.filename.split('.').pop(),
-          fileId: file.filename.replace(/\.\w+$/, ''),
-          stats: {
-            chunks: processedDocs.filter(d => d.metadata.source === file.path).length,
-            pages: totalPages
-          }
-        })),
-      });
+      res.status(200).json({ files: results });
     } catch (err: any) {
       console.error("❌ [Uploads] Error:", {
         message: err.message,
         stack: err.stack,
         name: err.name
       });
-      logger.error(`Error in uploading file results: ${err.message}`);
       res.status(500).json({ message: 'An error has occurred.' });
     }
-  },
+  }
 );
 
 router.get('/:fileId/view', async (req, res) => {
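A hypothetical client call against the rewritten handler. The URL is an assumption; the files, embedding_model_provider and embedding_model field names come from the diff:

// Hypothetical client for the upload route (Node 18+, run as an ES module).
const form = new FormData();
form.append('files', new Blob(['plain text fixture'], { type: 'text/plain' }), 'note.txt');
form.append('embedding_model_provider', 'openai');        // optional, falls back to the first provider
form.append('embedding_model', 'text-embedding-3-small'); // optional as well

const res = await fetch('http://localhost:3001/api/uploads', { method: 'POST', body: form });

// Each entry carries stats, cached: true, or an error field, since per-file
// failures are now caught inside files.map instead of failing the request.
const { files } = await res.json();
console.log(files);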
@@ -287,4 +362,55 @@ router.get('/:fileId/view', async (req, res) => {
   }
 });
 
+router.get('/:fileId/content', async (req, res) => {
+  try {
+    const { fileId } = req.params;
+
+    // Look for the PDF file in the uploads folder
+    const uploadsDir = path.join(process.cwd(), 'uploads');
+    const files = fs.readdirSync(uploadsDir);
+    const pdfFile = files.find(file => file.startsWith(fileId) && file.endsWith('.pdf'));
+
+    if (!pdfFile) {
+      console.error(`❌ PDF not found for ID: ${fileId}`);
+      return res.status(404).json({ error: 'PDF document not found' });
+    }
+
+    const filePath = path.join(uploadsDir, pdfFile);
+    console.log("📄 Sending PDF file:", filePath);
+
+    // Headers for the PDF
+    res.setHeader('Content-Type', 'application/pdf');
+    res.setHeader('Content-Disposition', `inline; filename="${pdfFile}"`);
+    res.setHeader('Cache-Control', 'public, max-age=3600'); // One-hour cache
+
+    // Send the file
+    res.sendFile(filePath);
+  } catch (error) {
+    console.error('❌ Error accessing the PDF:', error);
+    res.status(500).json({ error: 'Error accessing the document' });
+  }
+});
+
+// Route for the document metadata
+router.get('/:fileId/metadata', async (req, res) => {
+  try {
+    const { fileId } = req.params;
+
+    // Look for the metadata JSON file
+    const uploadsDir = path.join(process.cwd(), 'uploads');
+    const metadataPath = path.join(uploadsDir, `${fileId}-extracted.json`);
+
+    if (!fs.existsSync(metadataPath)) {
+      return res.status(404).json({ error: 'Metadata not found' });
+    }
+
+    const metadata = JSON.parse(fs.readFileSync(metadataPath, 'utf-8'));
+    res.json(metadata);
+  } catch (error) {
+    console.error('❌ Error reading the metadata:', error);
+    res.status(500).json({ error: 'Error reading the metadata' });
+  }
+});
+
 export default router;
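The two new read routes can be exercised the same way. Note that they rely on path, presumably imported in the part of the file above the first hunk. A hypothetical client sketch (base URL and the file id are assumptions):

// Hypothetical reads against the new routes (Node 18+, run as an ES module).
const base = 'http://localhost:3001/api/uploads';
const fileId = 'some-file-id'; // taken from the upload response

// Metadata comes from the `${fileId}-extracted.json` written at upload time.
const meta = await (await fetch(`${base}/${fileId}/metadata`)).json();
console.log(meta.title, meta.pageCount);

// The raw PDF is served inline with a one-hour cache header.
const pdf = await fetch(`${base}/${fileId}/content`);
console.log(pdf.headers.get('content-type')); // application/pdf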