feat(web-search): add URL & PDF searching capabilities

2024-07-30 10:09:05 +05:30 · 2024-07-30 10:09:05 +05:30 · 8e4f0c6a6d
commit 8e4f0c6a6d
parent 6f50e25bf3
4 changed files with 338 additions and 19 deletions
--- a/src/lib/linkDocument.ts
+++ b/src/lib/linkDocument.ts
@@ -0,0 +1,81 @@
+import axios from 'axios';
+import { htmlToText } from 'html-to-text'
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
+import { Document } from '@langchain/core/documents';
+import pdfParse from 'pdf-parse'
+
+/**
+ * Downloads every link and converts its content into text chunks.
+ *
+ * A response whose `Content-Type` media type is `application/pdf` is parsed
+ * with `pdf-parse`; any other response is decoded as UTF-8 and passed through
+ * `htmlToText` with anchor hrefs ignored. Whitespace runs are collapsed to
+ * single spaces before the text is split by a default-configured
+ * `RecursiveCharacterTextSplitter`.
+ *
+ * @param links - URLs to fetch; one without an `http://` or `https://` scheme
+ *   is requested over `https://`.
+ * @returns One `Document` per chunk with `title` and `url` metadata, grouped
+ *   by link in the same order as `links`.
+ * @throws Rejects with the first fetch or parse error raised for any link.
+ */
+export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
+  const splitter = new RecursiveCharacterTextSplitter();
+
+  const docsPerLink = await Promise.all(
+    links.map(async (rawLink): Promise<Document[]> => {
+      // Case-insensitive so `HTTPS://host` is not prefixed a second time.
+      const link = /^https?:\/\//i.test(rawLink)
+        ? rawLink
+        : `https://${rawLink}`;
+
+      // Fetch raw bytes: pdf-parse needs binary data, HTML is decoded below.
+      const res = await axios.get(link, {
+        responseType: 'arraybuffer',
+      });
+
+      // Compare only the media type: servers may append parameters such as
+      // `; charset=binary` or vary the letter case.
+      const contentType = String(res.headers['content-type'] ?? '');
+      const isPdf = /^\s*application\/pdf\s*(;|$)/i.test(contentType);
+
+      let rawText: string;
+      let title: string;
+
+      if (isPdf) {
+        rawText = (await pdfParse(res.data)).text;
+        title = 'PDF Document';
+      } else {
+        const html: string = res.data.toString('utf8');
+        rawText = htmlToText(html, {
+          selectors: [{ selector: 'a', options: { ignoreHref: true } }],
+        });
+        // Accept attributes, upper-case tags and titles spanning lines.
+        const match = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
+        title = match?.[1]?.replace(/\s+/g, ' ').trim() || link;
+      }
+
+      // `\s` already matches CR and LF, so one pass flattens all whitespace.
+      const chunks = await splitter.splitText(
+        rawText.replace(/\s+/g, ' ').trim(),
+      );
+
+      return chunks.map(
+        (text) =>
+          new Document({
+            pageContent: text,
+            metadata: { title, url: link },
+          }),
+      );
+    }),
+  );
+
+  // Concatenate in input order; pushing from the concurrent callbacks made
+  // the result order depend on which request finished first.
+  const docs: Document[] = [];
+  for (const linkDocs of docsPerLink) {
+    docs.push(...linkDocs);
+  }
+
+  return docs;
+};